# PUBG Placement Prediction

*[Competition found on Kaggle](https://www.kaggle.com/c/pubg-finish-placement-prediction/data)*

*By Aldrich Mangune and Joleena Marshall*

---

## Table of Contents

1. Data Exploration
2. Data Preprocessing
3. Modeling
4. Evaluation


In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

Helper functions taken from TitanicML notebook. These functions are just here to simplify code for data visualization.

In [2]:
def plot_histograms( df , variables , n_rows , n_cols ):
    fig = plt.figure( figsize = ( 16 , 12 ) )
    for i, var_name in enumerate( variables ):
        ax=fig.add_subplot( n_rows , n_cols , i+1 )
        df[ var_name ].hist( bins=10 , ax=ax )
        ax.set_title( 'Skew: ' + str( round( float( df[ var_name ].skew() ) , ) ) ) # + ' ' + var_name ) #var_name+" Distribution")
        ax.set_xticklabels( [] , visible=False )
        ax.set_yticklabels( [] , visible=False )
    fig.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution( df , var , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , hue=target , aspect=4 , row = row , col = col )
    facet.map( sns.kdeplot , var , shade= True )
    facet.set( xlim=( 0 , df[ var ].max() ) )
    facet.add_legend()

def plot_categories( df , cat , target , **kwargs ):
    row = kwargs.get( 'row' , None )
    col = kwargs.get( 'col' , None )
    facet = sns.FacetGrid( df , row = row , col = col )
    facet.map( sns.barplot , cat , target )
    facet.add_legend()

def plot_correlation_map( df ):
    corr = df.corr()
    _ , ax = plt.subplots( figsize =( 12 , 10 ) )
    cmap = sns.diverging_palette( 220 , 10 , as_cmap = True )
    _ = sns.heatmap(
        corr, 
        cmap = cmap,
        square=True, 
        cbar_kws={ 'shrink' : .9 }, 
        ax=ax, 
        annot = True, 
        annot_kws = { 'fontsize' : 8 }
    )

def describe_more( df ):
    var = [] ; l = [] ; t = []
    for x in df:
        var.append( x )
        l.append( len( pd.value_counts( df[ x ] ) ) )
        t.append( df[ x ].dtypes )
    levels = pd.DataFrame( { 'Variable' : var , 'Levels' : l , 'Datatype' : t } )
    levels.sort_values( by = 'Levels' , inplace = True )
    return levels

def plot_variable_importance( X , y ):
    tree = DecisionTreeClassifier( random_state = 99 )
    tree.fit( X , y )
    plot_model_var_imp( tree , X , y )
    
def plot_model_var_imp( model , X , y ):
    imp = pd.DataFrame( 
        model.feature_importances_  , 
        columns = [ 'Importance' ] , 
        index = X.columns 
    )
    imp = imp.sort_values( [ 'Importance' ] , ascending = True )
    imp[ : 10 ].plot( kind = 'barh' )
    print (model.score( X , y ))
    

In [3]:
chunksize = 10000

chunks = []

for chunk in pd.read_csv("train_V2.csv", chunksize = chunksize):
    chunks.append(chunk)

data = pd.concat(chunks)
"""
data = pd.read_csv("train_V2.csv", nrows = 5000)
    
print(data.shape)
"""


'\ndata = pd.read_csv("train_V2.csv", nrows = 5000)\n    \nprint(data.shape)\n'

In [4]:
data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


## Data Exploration

**Variable Description**

 - assists - Number of enemy players this player damaged that were killed by teammates.
 - boosts - Number of boost items used.
 - damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
 - DBNOs - Number of enemy players knocked.
 - headshotKills - Number of enemy players killed with headshots.
 - heals - Number of healing items used.
 - Id - Player’s Id
 - killPlace - Ranking in match of number of enemy players killed.
 - killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
 - killStreaks - Max number of enemy players killed in a short amount of time.
 - kills - Number of enemy players killed.
 - longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
 - matchDuration - Duration of match in seconds.
 - matchId - ID to identify match. There are no matches that are in both the training and testing set.
 - matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
 - rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
 - revives - Number of times this player revived teammates.
 - rideDistance - Total distance traveled in vehicles measured in meters.
 - roadKills - Number of kills while in a vehicle.
 - swimDistance - Total distance traveled by swimming measured in meters.
 - teamKills - Number of times this player killed a teammate.
 - vehicleDestroys - Number of vehicles destroyed.
 - walkDistance - Total distance traveled on foot measured in meters.
 - weaponsAcquired - Number of weapons picked up.
 - winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
 - groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
 - numGroups - Number of groups we have data for in the match.
 - maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
 - winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [6]:
data.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
mean,0.2338149,1.106908,130.7171,0.6578755,0.2268196,1.370147,47.59935,505.006,0.9247833,0.5439551,...,0.164659,606.1157,0.003496091,4.509322,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728216
std,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,627.5049,1.558445,0.7109721,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,...,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


In [7]:
data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 4446966 non-null object
groupId            4446966 non-null object
matchId            4446966 non-null object
assists            4446966 non-null int64
boosts             4446966 non-null int64
damageDealt        4446966 non-null float64
DBNOs              4446966 non-null int64
headshotKills      4446966 non-null int64
heals              4446966 non-null int64
killPlace          4446966 non-null int64
killPoints         4446966 non-null int64
kills              4446966 non-null int64
killStreaks        4446966 non-null int64
longestKill        4446966 non-null float64
matchDuration      4446966 non-null int64
matchType          4446966 non-null object
maxPlace           4446966 non-null int64
numGroups          4446966 non-null int64
rankPoints         4446966 non-null int64
revives            4446966 non-null int64
rideDistance       4446966 non-null flo

We can see that there is a row with a null winPlacePerc so we will just drop the instance. Since the dataset is fairly large, my intuition tells me that the overall accuracy will not be significantly affected by this.

In [9]:
nullwins = data[data['winPlacePerc'].isnull()]

data1 = data.dropna()

data1.shape

(4446965, 29)

In [11]:
data2 = data1

data2['totalDistance'] = data2['rideDistance'] + data2['swimDistance'] + data2['walkDistance']

data2.drop(columns=['rideDistance','swimDistance','walkDistance'])

data2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,totalDistance
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0.0,0,0.0,0,0,244.8,1,1466,0.4444,244.8
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.0045,0,11.04,0,0,1434.0,5,0,0.64,1445.0445
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0.0,0,0.0,0,0,161.8,2,0,0.7755,161.8
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0.0,0,0.0,0,0,202.7,3,0,0.1667,202.7
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0.0,0,0.0,0,0,49.75,2,0,0.1875,49.75


In [None]:
afk = data[data['walkDistance'] == 0]
afk.

## Building a decision tree model

In [9]:
y = data1.pop('winPlacePerc').values

In [11]:
x = data1.values
print(x.shape)
print(y.shape)

(4446965, 28)
(4446965,)


In [13]:
y

array([ 0.4444,  0.64  ,  0.7755, ...,  0.4815,  0.8   ,  0.5464])

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=0)



In [16]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=5, random_state=2)

In [17]:
forest.fit(x_train, y_train)

ValueError: could not convert string to float: 'squad'

## Building a Naive Bayes model