# PUBG Placement Prediction

*[Competition found on Kaggle](https://www.kaggle.com/c/pubg-finish-placement-prediction/data)*

*By Aldrich Mangune and Joleena Marshall*

---

## Table of Contents

1. Data Exploration
2. Data Preprocessing
3. Modeling
4. Evaluation


In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

Helper functions taken from TitanicML notebook. These functions are just here to simplify code for data visualization.

In [2]:
def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) ) # + ' ' + var_name ) #var_name+" Distribution")
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 8 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( pd.value_counts( df[ x ] ) ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))
    

In [3]:
chunksize = 10000

chunks = []

for chunk in pd.read_csv("train_V2.csv", chunksize = chunksize):
    chunks.append(chunk)

data = pd.concat(chunks)

validation = pd.read_csv("train_V2.csv", nrows = 100000)
    
print(validation.shape)

(100000, 29)


In [4]:
data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Data Exploration

**Variable Description**

 - assists - Number of enemy players this player damaged that were killed by teammates.
 - boosts - Number of boost items used.
 - damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
 - DBNOs - Number of enemy players knocked.
 - headshotKills - Number of enemy players killed with headshots.
 - heals - Number of healing items used.
 - Id - Player’s Id
 - killPlace - Ranking in match of number of enemy players killed.
 - killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
 - killStreaks - Max number of enemy players killed in a short amount of time.
 - kills - Number of enemy players killed.
 - longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
 - matchDuration - Duration of match in seconds.
 - matchId - ID to identify match. There are no matches that are in both the training and testing set.
 - matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
 - rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
 - revives - Number of times this player revived teammates.
 - rideDistance - Total distance traveled in vehicles measured in meters.
 - roadKills - Number of kills while in a vehicle.
 - swimDistance - Total distance traveled by swimming measured in meters.
 - teamKills - Number of times this player killed a teammate.
 - vehicleDestroys - Number of vehicles destroyed.
 - walkDistance - Total distance traveled on foot measured in meters.
 - weaponsAcquired - Number of weapons picked up.
 - winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
 - groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
 - numGroups - Number of groups we have data for in the match.
 - maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
 - winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [5]:
data.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
mean,0.2338149,1.106908,130.7171,0.6578755,0.2268196,1.370147,47.59935,505.006,0.9247833,0.5439551,...,0.164659,606.1157,0.003496091,4.509322,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728216
std,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,627.5049,1.558445,0.7109721,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,...,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


In [7]:
data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 4446966 non-null object
groupId            4446966 non-null object
matchId            4446966 non-null object
assists            4446966 non-null int64
boosts             4446966 non-null int64
damageDealt        4446966 non-null float64
DBNOs              4446966 non-null int64
headshotKills      4446966 non-null int64
heals              4446966 non-null int64
killPlace          4446966 non-null int64
killPoints         4446966 non-null int64
kills              4446966 non-null int64
killStreaks        4446966 non-null int64
longestKill        4446966 non-null float64
matchDuration      4446966 non-null int64
matchType          4446966 non-null object
maxPlace           4446966 non-null int64
numGroups          4446966 non-null int64
rankPoints         4446966 non-null int64
revives            4446966 non-null int64
rideDistance       4446966 non-null flo

We can see that there is a row with a null winPlacePerc so we will just drop the instance. Since the dataset is fairly large, my intuition tells me that the overall accuracy will not be significantly affected by this.

In [12]:
nullwins = data[data['winPlacePerc'].isnull()]

data = data.dropna()

data.shape

(4446965, 29)

To reduce dimensions, we are combining rideDistance, walkDistance, and swimDistance to reduce 3 variables to 1

In [14]:
data1 = data.copy()

data1['totalDistance'] = data1['rideDistance'] + data1['swimDistance'] + data1['walkDistance']

data1 = data1.drop(columns=['rideDistance','swimDistance','walkDistance'])

data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4446965 entries, 0 to 4446965
Data columns (total 27 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
roadKills          int64
teamKills          int64
vehicleDestroys    int64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
totalDistance      float64
dtypes: float64(4), int64(19), object(4)
memory usage: 950.0+ MB


In [16]:
afk = data1[data1['totalDistance'] == 0]
afk.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,numGroups,rankPoints,revives,roadKills,teamKills,vehicleDestroys,weaponsAcquired,winPoints,winPlacePerc,totalDistance
29,ac5b57ff39979c,857cc55b2b6001,e019e04dee4f19,0,0,0.0,0,0,0,87,...,44,1534,0,0,0,0,0,0,0.0,0.0
116,6adb021f5165ff,58e5500bd40898,de5c692fe25a73,0,0,0.0,0,0,0,68,...,36,0,0,0,0,0,0,847,0.0,0.0
151,a2bbe20aa8789d,926e8a09bab249,e36e4203ed4831,0,0,0.0,0,0,0,92,...,41,-1,0,0,0,0,0,765,0.0,0.0
237,baaa694658e085,d034728f22cff7,fa71620624d3e7,0,0,0.0,0,0,0,94,...,26,-1,0,0,0,0,0,1510,0.0,0.0
283,3ab8128e6bcbe6,bb52a209f2e938,aabd2650b129e2,0,0,0.0,0,0,0,84,...,47,1500,0,0,0,0,0,0,0.1277,0.0


In [17]:
hackers = data2[data2['headshotKills'] > 30]
hackers.head()

NameError: name 'data2' is not defined

In [24]:
ohe = pd.get_dummies(data1['matchType'])
data1 = data1.drop('matchType',axis = 1 )
data1 = data1.join(ohe)

KeyError: 'matchType'

In [43]:
data2 = data1.copy()
    

# PlayersJoined feature
data2['playersJoined'] = data2.groupby('matchId')['matchId'].transform('count')

# Normalized features
data2['killsNorm'] = data2['kills']*((100-data2['playersJoined'])/100 + 1)
data2['damageDealtNorm'] = data2['damageDealt']*((100-data2['playersJoined'])/100 + 1)
data2['maxPlaceNorm'] = data2['maxPlace']*((100-data2['playersJoined'])/100 + 1)
data2['matchDurationNorm'] = data2['matchDuration']*((100-data2['playersJoined'])/100 + 1)
data2['totalDistanceNorm'] = data2['totalDistance']*((100-data2['playersJoined'])/100 + 1)


In [None]:
def feature_engineer(dataframe):
    
    
    data = data.dropna()
    
    data2['playersJoined'] = data2.groupby('matchId')['matchId'].transform('count')

    # Normalized features
    data2['killsNorm'] = data2['kills']*((100-data2['playersJoined'])/100 + 1)
    data2['damageDealtNorm'] = data2['damageDealt']*((100-data2['playersJoined'])/100 + 1)
    data2['maxPlaceNorm'] = data2['maxPlace']*((100-data2['playersJoined'])/100 + 1)
    data2['matchDurationNorm'] = data2['matchDuration']*((100-data2['playersJoined'])/100 + 1)
    data2['totalDistanceNorm'] = data2['totalDistance']*((100-data2['playersJoined'])/100 + 1)


In [44]:
data2 = data2.drop(columns=['Id','groupId','matchId','kills','damageDealt','maxPlace','matchDuration','totalDistance'])
data2.head()

Unnamed: 0,assists,boosts,DBNOs,headshotKills,heals,killPlace,killPoints,killStreaks,longestKill,numGroups,...,solo,solo-fpp,squad,squad-fpp,playersJoined,killsNorm,damageDealtNorm,maxPlaceNorm,matchDurationNorm,totalDistanceNorm
0,0,0,0,0,0,60,1241,0,0.0,26,...,0,0,0,1,96,0.0,0.0,29.12,1358.24,254.592
1,0,0,0,0,0,57,0,0,0.0,25,...,0,0,0,1,91,0.0,99.7023,28.34,1936.93,1575.098505
2,1,0,0,0,0,47,0,0,0.0,47,...,0,0,0,0,98,0.0,69.36,51.0,1344.36,165.036
3,0,0,0,0,0,75,0,0,0.0,30,...,0,0,0,1,91,0.0,35.861,33.79,1565.24,220.943
4,0,0,0,0,0,45,0,1,58.53,95,...,0,1,0,0,97,1.03,103.0,99.91,1466.72,51.2425


## Preparing the Data

In [45]:
df = data2.copy()

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4446965 entries, 0 to 4446965
Data columns (total 40 columns):
assists              int64
boosts               int64
DBNOs                int64
headshotKills        int64
heals                int64
killPlace            int64
killPoints           int64
killStreaks          int64
longestKill          float64
numGroups            int64
rankPoints           int64
revives              int64
roadKills            int64
teamKills            int64
vehicleDestroys      int64
weaponsAcquired      int64
winPoints            int64
winPlacePerc         float64
crashfpp             uint8
crashtpp             uint8
duo                  uint8
duo-fpp              uint8
flarefpp             uint8
flaretpp             uint8
normal-duo           uint8
normal-duo-fpp       uint8
normal-solo          uint8
normal-solo-fpp      uint8
normal-squad         uint8
normal-squad-fpp     uint8
solo                 uint8
solo-fpp             uint8
squad              

In [47]:
y = df.pop('winPlacePerc').values

In [48]:
x = df.values
print(x.shape)
print(y.shape)

(4446965, 39)
(4446965,)


In [49]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=0)



In [65]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=5, random_state=2)

In [66]:
forest.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
           oob_score=False, random_state=2, verbose=0, warm_start=False)

In [67]:
forest.score(x_test, y_test)

0.9151255159989593

## Building a Neural Network Model

In [50]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.utils import np_utils

Using TensorFlow backend.


In [51]:
df.shape

(4446965, 39)

In [58]:
model = Sequential()
model.add(Dense(32, input_shape=(39,)))
model.add(Activation('relu'))
#model.add(Dense(8, activation='relu'))
model.add(Dense(1))
# model.add(Activation('sigmoid'))
model.compile(optimizer='ADAM', loss='mean_squared_error', metrics=['accuracy'],)

In [60]:
model.fit(x_train, y_train, verbose=1, epochs = 1, batch_size=10)
score = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy of neural network model using keras at epoch {}= {:.2f}".format(1, score[1]))

Epoch 1/1
Accuracy of neural network model using keras at epoch 1= 0.08


## Building a k-nn model

In [63]:
from sklearn.neighbors import KNeighborsRegressor
# Create KNN classifier
knn = KNeighborsRegressor(n_neighbors = 5)
# Fit the classifier to the data
knn.fit(x_train,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [64]:
knn.score(x_test, y_test)

0.81232764580845096

## Naive Bayes

In [70]:
from sklearn.naive_bayes import GaussianNB
# Create KNN classifier
gnb = GaussianNB()
# Fit the classifier to the data
gnb.fit(x_train,y_train)

ValueError: Unknown label type: (array([ 0.    ,  0.0101,  0.0102, ...,  0.9898,  0.9899,  1.    ]),)

## Predicting

In [73]:
def feature_engineer_test(dataframe):
    
    newdata = dataframe.copy()
    
    newdata = newdata.dropna()

    newdata['totalDistance'] = newdata['rideDistance'] + newdata['swimDistance'] + newdata['walkDistance']

    newdata = newdata.drop(columns=['rideDistance','swimDistance','walkDistance'])
    
    ohe = pd.get_dummies(newdata['matchType'])
    newdata = newdata.drop('matchType',axis = 1 )
    newdata = newdata.join(ohe)
    
    newdata['playersJoined'] = newdata.groupby('matchId')['matchId'].transform('count')

    # Normalized features
    newdata['killsNorm'] = newdata['kills']*((100-newdata['playersJoined'])/100 + 1)
    newdata['damageDealtNorm'] = newdata['damageDealt']*((100-newdata['playersJoined'])/100 + 1)
    newdata['maxPlaceNorm'] = newdata['maxPlace']*((100-newdata['playersJoined'])/100 + 1)
    newdata['matchDurationNorm'] = newdata['matchDuration']*((100-newdata['playersJoined'])/100 + 1)
    newdata['totalDistanceNorm'] = newdata['totalDistance']*((100-newdata['playersJoined'])/100 + 1)
    
    newdata = newdata.drop(columns=['groupId','matchId','kills','damageDealt','maxPlace','matchDuration','totalDistance'])
    
    return newdata

test = pd.read_csv("C:/Users/aldrich_mangune/Personal/CMPE188/pubg-seer/test_V2.csv", index_col = False)
fetest = feature_engineer_test(test)

In [None]:
ids = fetest['Id']
predictions = model.predict(fetest.drop('Id', axis=1))

output = pd.DataFrame({ 'Id' : ids, 'winPlacePerc': predictions })
output.to_csv('predictions.csv', index = False)