In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [6]:
df= PUBG.upload(pubg.csv)

NameError: ignored

In [None]:
df.head(5)

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

>We found that there is one particular player with a 'winPlacePerc' of NaN. The case was that this match had only one player. We will delete this row from our dataset.

In [None]:
df[df['winPlacePerc'].isnull()]

> Let's delete this entry

In [None]:
df.drop(2744604,inplace= True)

In [None]:
df[df['winPlacePerc'].isnull()]

>And as we can see above he is gone

### Feature Engineering

#### Players joined

>This is likely a very valuable feature for our model. If we know how many people are in a match we can normalize other features and get stronger predictions on individual players.

In [None]:
df['playersjoined']= df.groupby('matchId')['matchId'].transform('count')

In [None]:
plt.figure(figsize=(12,8))
sns.countplot(df[df['playersjoined']>=75]['playersjoined'])
plt.title('Players Joined')
plt.show()

>As we can see, There are a few matches with fewer than 75 players that are not displayed here. As you can see most of the matches are nearly packed a have nearly 100 players. It is nevertheless interesting to take these features into our analysis.

##### Now that we have a feature 'playersJoined' we can normalize other features based on the amount of players. Features that can be valuable to normalize are:

>1.kills

>2.damageDealt

>3.maxPlace

>4.matchDuration

In [None]:
print('kills:',df['kills'].nunique())
print('damageDealt:', df['damageDealt'].nunique())
print('maxPlace:', df['maxPlace'].nunique())
print('matchDuration:', df['matchDuration'].nunique())

###### matchType
>There are 3 game modes in the game. One can play solo, or with a friend (duo), or with 3 other friends (squad).

In [None]:
df['matchType'].value_counts()

In [None]:
fig, ax= plt.subplots(1,2, figsize=(12,4))
df.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[0])
mapper= lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) or ('crash' in x) else 'squad'
df['matchType']=df['matchType'].apply(mapper)
df.groupby('matchId')['matchType'].first().value_counts().plot.bar(ax=ax[1])

>solo  <-- solo,solo-fpp,normal-solo,normal-solo-fpp

>duo   <-- duo,duo-fpp,normal-duo,normal-duo-fpp,crashfpp,crashtpp

>squad <-- squad,squad-fpp,normal-squad,normal-squad-fpp,flarefpp,flaretpp

In [None]:
solos = df[df['numGroups']>50]
duos = df[(df['numGroups']>25) & (df['numGroups']<=50)]
squads = df[df['numGroups']<=25]
print("There are {} ({:.2f}%) solo games, {} ({:.2f}%) duo games and {} ({:.2f}%) squad games.".format(len(solos), 100*len(solos)/len(df), len(duos), 100*len(duos)/len(df), len(squads), 100*len(squads)/len(df),))

In [None]:
f,ax1 = plt.subplots(figsize =(20,10))
sns.pointplot(x='kills',y='winPlacePerc',data=solos,color='black',alpha=0.8)
sns.pointplot(x='kills',y='winPlacePerc',data=duos,color='#CC0000',alpha=0.8)
sns.pointplot(x='kills',y='winPlacePerc',data=squads,color='#3399FF',alpha=0.8)
plt.text(37,0.6,'Solos',color='black',fontsize = 17,style = 'italic')
plt.text(37,0.55,'Duos',color='#CC0000',fontsize = 17,style = 'italic')
plt.text(37,0.5,'Squads',color='#3399FF',fontsize = 17,style = 'italic')
plt.xlabel('Number of kills',fontsize = 15,color='blue')
plt.ylabel('Win Percentage',fontsize = 15,color='blue')
plt.title('Solo vs Duo vs Squad Kills',fontsize = 20,color='blue')
plt.grid()
plt.show()

>This is very interesting. Solos and duos behave the same, but when playing squads kills don't matter that much.

###### The attribute DBNOs means enemy players knocked. A "knock" can happen only in duos or squads, because the teammates have the chance to "revive" the knocked player in a given time. So a knocked player can be revived or die. If he is revived, the next time he will be knocked, his teammates will have less time to revive him.

In [None]:
f,ax1 = plt.subplots(figsize =(20,10))
sns.pointplot(x='DBNOs',y='winPlacePerc',data=duos,color='#CC0000',alpha=0.8)
sns.pointplot(x='DBNOs',y='winPlacePerc',data=squads,color='#3399FF',alpha=0.8)
sns.pointplot(x='assists',y='winPlacePerc',data=duos,color='#FF6666',alpha=0.8)
sns.pointplot(x='assists',y='winPlacePerc',data=squads,color='#CCE5FF',alpha=0.8)
sns.pointplot(x='revives',y='winPlacePerc',data=duos,color='#660000',alpha=0.8)
sns.pointplot(x='revives',y='winPlacePerc',data=squads,color='#000066',alpha=0.8)
plt.text(14,0.5,'Duos - Assists',color='#FF6666',fontsize = 17,style = 'italic')
plt.text(14,0.45,'Duos - DBNOs',color='#CC0000',fontsize = 17,style = 'italic')
plt.text(14,0.4,'Duos - Revives',color='#660000',fontsize = 17,style = 'italic')
plt.text(14,0.35,'Squads - Assists',color='#CCE5FF',fontsize = 17,style = 'italic')
plt.text(14,0.3,'Squads - DBNOs',color='#3399FF',fontsize = 17,style = 'italic')
plt.text(14,0.25,'Squads - Revives',color='#000066',fontsize = 17,style = 'italic')
plt.xlabel('Number of DBNOs/Assits/Revives',fontsize = 15,color='blue')
plt.ylabel('Win Percentage',fontsize = 15,color='blue')
plt.title('Duo vs Squad DBNOs, Assists, and Revives',fontsize = 20,color='blue')
plt.grid()
plt.show()

>The attribute assist can also happen only in duos or squads. It generally means that the player had an involvement in a kill.

>The attribute revive also happens in duos or squads.

#### Killing without moving
>We try to identify cheaters by checking if people are getting kills without moving. We first identify the totalDistance travelled by a player and then set a boolean value to True if someone got kills without moving a single inch. We will remove cheaters in our outlier detection section.

In [None]:
# Create feature totalDistance
df['TotalDistances']= df['rideDistance']+df['walkDistance']+df['swimDistance']

In [None]:
# Create feature killsWithoutMoving
df['KillsWithoutMoving']=((df['kills']>0) & (df['TotalDistances']==0))

In [None]:
pd.DataFrame(df['KillsWithoutMoving']).tail()

In [None]:
df['KillsWithoutMoving'].value_counts()

> As we can see, There are 1535 Players who get kills without moving.

In [None]:
# Check players who kills without moving
display(df[df['KillsWithoutMoving']== True].shape)
df[df['KillsWithoutMoving']== True].head(10)

> We can say these are the outliers.

In [None]:
#removing of Outliers
df.drop(df[df['KillsWithoutMoving']==True].index, inplace=True)

#### Players
>players in match, group
100 players join the same server, so in the case of duos the max teams are 50 and in the case of squads the max teams are 25.

In [None]:
group = df.groupby(['matchId','groupId','matchType'])['Id'].count().to_frame('players').reset_index()
group.loc[group['players'] > 4, 'players'] = '5+'
group['players'] = group['players'].astype(str)

fig, ax = plt.subplots(1, 3, figsize=(16, 4))
for mt, ax in zip(['solo','duo','squad'], ax.ravel()):
    ax.set_xlabel(mt)
    group[group['matchType'] == mt]['players'].value_counts().sort_index().plot.bar(ax=ax)

In [None]:
sub = df[df['matchId']=='41a634f62f86b7']
sub_grp = sub[sub['groupId']=='128b07271aa012']

print('matchId==\'41a634f62f86b7\' & groupId==\'128b07271aa012\'')
print('-'*50)
print('players:',len(sub))
print('groups:',sub['groupId'].nunique())
print('numGroups:',sub['numGroups'].unique())
print('maxPlace:',sub['maxPlace'].unique())
print('-'*50)
print('max-group players:',len(sub_grp))
print('max-group winPlacePerc:',sub_grp['winPlacePerc'].unique())
print('-'*50)
print('winPlacePerc:',sub['winPlacePerc'].sort_values().unique())

##### matchDuration

In [None]:
fig, ax= plt.subplots(1,2, figsize=(12,4))
df['matchDuration'].hist(bins=50, ax=ax[0])
df.query('matchDuration >= 1400 & matchDuration <= 1800')['matchDuration'].hist(bins=50, ax=ax[1])

###### Min matchDuration

In [None]:
df[df['matchDuration'] == df['matchDuration'].min()].head()

##### Max matchDuration

In [None]:
df[df['matchDuration']==df['matchDuration'].max()].head()

#### Boosts, Heals

In [None]:
fig, ax= plt.subplots(2,2, figsize=(14,8))
cols=['boosts','heals']
for col, ax in zip(cols,ax):
    sub=df[['winPlacePerc',col]].copy()
    mv= (sub[col].max()//5)+1
    sub[col] = pd.cut(sub[col], [5*x for x in range(0,mv)], right=False)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    df[col].hist(bins=20, ax=ax[1])

##### We create a feature called 'healsandboosts' by adding heals and boosts.

In [None]:
df['heals&boosts']= df['heals']+df['boosts']
df[['heals','boosts','heals&boosts']].tail()

In [None]:
f,ax1 = plt.subplots(figsize =(20,10))
sns.pointplot(x='heals',y='winPlacePerc',data=df,color='lime',alpha=0.8)
sns.pointplot(x='boosts',y='winPlacePerc',data=df,color='blue',alpha=0.8)
plt.text(4,0.6,'Heals',color='lime',fontsize = 17,style = 'italic')
plt.text(4,0.55,'Boosts',color='blue',fontsize = 17,style = 'italic')
plt.xlabel('Number of heal/boost items',fontsize = 15,color='blue')
plt.ylabel('Win Percentage',fontsize = 15,color='blue')
plt.title('Heals vs Boosts',fontsize = 20,color='blue')
plt.grid()
plt.show()

>We can say that,So healing and boosting, definitely are correlated with winPlacePerc. Boosting is more.

>In every plot, there is an abnormal behavior when values are 0  

##### Revives

In [None]:
print('solo player has revives:', 'solo' in df.query('revives>0')['matchType'].unique())

>So, as we can see Solo Players cant Revive themselevs.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
col = 'revives'
sub = df.loc[~df['matchType'].str.contains('solo'),['winPlacePerc',col]].copy()
sub[col] = pd.cut(sub[col], [5*x for x in range(0,8)], right=False)
sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
df[col].hist(bins=20, ax=ax[1])

##### KillPlace

In [None]:
df.groupby(['matchType'])['killPlace'].describe()[['min','mean','max']]

In [None]:
plt.figure(figsize=(14,7))
col='killPlace'
sub=df[['winPlacePerc',col]].copy()
sub[col] = pd.cut(sub[col], [10*x for x in range(0,11)], right=False)
sub.groupby(col).mean()['winPlacePerc'].plot.bar()

In [None]:
plt.figure(figsize=(12,6))
sub.groupby(col).mean()['winPlacePerc'].plot.pie()

> From above bar chart & pie chart we can see, killPlace is a sorted ranking of kills and winPlacePerc in each match

In [None]:
subMatch = df[df['matchId'] == df['matchId'].min()].sort_values(['winPlacePerc','killPlace'])
cols = ['groupId','kills','winPlacePerc','killPlace']
subMatch[cols]

#### Kills

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 4))

col = 'kills'
sub = df[['winPlacePerc',col]].copy()
sub[col] = pd.cut(sub[col], [5*x for x in range(0,20)], right=False)
sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
df[df['kills'] < 20][col].hist(bins=20, ax=ax[1])

##### Plotting win placement percentage vs kills

In [None]:
sns.jointplot(x="winPlacePerc", y="kills", data=df, height=10, ratio=3, color="r")
plt.show()

>Apparentrly killing has a correlation with winning. Finally let's group players based on kills (0 kills, 1-2 kills, 3-5 kills, 6-10 kills and 10+ kills).

In [None]:
killss = df.copy()

killss['killCat']=pd.cut(killss['kills'], [-1, 0, 2, 5, 10, 60], labels=['0_kills','1-2_kills', '3-5_kills', '6-10_kills', '10+_kills'])

plt.figure(figsize=(15,8))
sns.boxplot(x="killCat", y="winPlacePerc", data=killss)
plt.show()

In [None]:
sub = df['matchType'].str.contains('solo')
pd.concat([df.loc[sub].groupby('matchId')['kills'].sum().describe(),
         df.loc[~sub].groupby('matchId')['kills'].sum().describe()], keys=['solo','team'], axis=1).T

>Kills summary of match.

#### KillStreaks & DBNOs 

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(16, 10))

cols = ['killStreaks','DBNOs']
for col, ax in zip(cols, ax):
    sub = df[['winPlacePerc',col]].copy()
    sub[col] = pd.cut(sub[col], 6)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    df[col].hist(bins=20, ax=ax[1])

In [None]:
plt.figure(figsize=(12,8))
sub.groupby(col).mean()['winPlacePerc'].plot.pie()

#### RoadKills

In [None]:
# Players who got more than 10 roadKills
df[df['roadKills'] > 10]

In [None]:
# Drop roadKill 'Cheaters'
df.drop(df[df['roadKills'] > 10].index, inplace=True)

>Let's plot the total kills for every player first. It doesn't look like there are too many outliers.

In [None]:
# Plot the distribution of kills ('More than 45 kills')
plt.figure(figsize=(12,6))
sns.countplot(data=df, x=df['kills']).set_title('kills')
plt.show()

##### HeadshotKills

In [None]:
# Players who made a minimum of 10 kills and have a headshotKills of 100%
display(df[(df['headshotKills'] == 1) & (df['kills'] > 9)].shape)
df[(df['headshotKills'] == 1) & (df['kills'] > 9)].head(10)

In [None]:
# Plot the distribution of longestKill
plt.figure(figsize=(12,4))
sns.distplot(df['longestKill'], bins=10)
plt.show()

>We can see, Most kills are made from a distance of 100 meters or closer. There are however some outliers who killed from more than 1km away. This is probably done by cheaters.

In [None]:
# Check out players who made kills with a distance of more than 1 km
display(df[df['longestKill'] >= 1000].shape)
df[df['longestKill'] >= 1000].head(10)

In [None]:
# Remove outliers
df.drop(df[df['longestKill'] >= 1000].index, inplace=True)

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(16, 12))

cols = ['headshotKills','roadKills']
for col, ax in zip(cols, ax):
    sub = df[['winPlacePerc',col]].copy()
    sub.loc[sub[col] >= 5, col] = '5+'  
    sub[col] = sub[col].astype(str)
    sub.groupby(col).mean()['winPlacePerc'].plot.bar(ax=ax[0])
    df[col].hist(bins=20, ax=ax[1])

##### WeaponsAcquired

>Most people acquire between 0 and 10 weapons in a game, but we also see some people acquire more than 80 weapons! 
Let's check these guys out.

In [None]:
# Players who acquired more than 80 weapons
display(df[df['weaponsAcquired'] >= 80].shape)
df[df['weaponsAcquired'] >= 80].head()

In [None]:
# Plot the distribution of weaponsAcquired
plt.figure(figsize=(12,4))
sns.distplot(df['weaponsAcquired'], bins=100)
plt.show()

In [None]:
df[df['weaponsAcquired'] == 236]

>As we can see, player 3f2bcf53b108c4 acquired 236 weapons in one game. We can say it is an outlier.

In [None]:
# Remove outliers
df.drop(df[df['weaponsAcquired'] >= 80].index, inplace=True)

##### Travelling 
>Let's check out Distance travelled.

>(rideDistance, walkDistance and swimDistance)

In [None]:
# Summary statistics for the Distance features
df[['walkDistance', 'rideDistance', 'swimDistance']].describe()

In [None]:
# Plot the distribution of walkDistance
plt.figure(figsize=(12,4))
sns.distplot(df['walkDistance'], bins=10)
plt.show()

In [None]:
# walkDistance anomalies
display(df[df['walkDistance'] >= 10000].shape)
df[df['walkDistance'] >= 10000].head(10)

In [None]:
# Remove outliers
df.drop(df[df['walkDistance'] >= 10000].index, inplace=True)

In [None]:
# Plot the distribution of rideDistance
plt.figure(figsize=(12,4))
sns.distplot(df['rideDistance'], bins=10)
plt.show()

In [None]:
# rideDistance
display(df[df['rideDistance'] >= 20000].shape)
df[df['rideDistance'] >= 20000].head(10)

>Cheaters do ride like these, they use to ride over 20000m

In [None]:
# Remove outliers
df.drop(df[df['rideDistance'] >= 20000].index, inplace=True)

##### swimDistance

In [None]:
# Plot the distribution of swimDistance
plt.figure(figsize=(12,4))
sns.distplot(df['swimDistance'], bins=10)
plt.show()

In [None]:
swim = df.copy()

swim['swimDistance'] = pd.cut(swim['swimDistance'], [-1, 0, 5, 20, 5286], labels=['0m','1-5m', '6-20m', '20m+'])

plt.figure(figsize=(15,8))
sns.boxplot(x="swimDistance", y="winPlacePerc", data=swim)
plt.show()

>It seems that if you swim, you rise to the top. In PUBG there are currently 3 maps.But we have to keep in mind that, One of them has almost no water.

In [None]:
# Players who swam more than 2 km
df[df['swimDistance'] >= 2000]

In [None]:
# Remove outliers
df.drop(df[df['swimDistance'] >= 2000].index, inplace=True)

In [None]:
sub = df[['walkDistance','rideDistance','swimDistance','winPlacePerc']].copy()
walk = df['walkDistance']
sub['walkDistanceBin'] = pd.cut(walk, [0, 0.001, walk.quantile(.25), walk.quantile(.5), walk.quantile(.75), 99999])
sub['rideDistanceBin'] = (df['rideDistance'] > 0).astype(int)
sub['swimDistanceBin'] = (df['swimDistance'] > 0).astype(int)

fig, ax = plt.subplots(1, 3, figsize=(16, 3), sharey=True)
sub.groupby('walkDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[0])
sub.groupby('rideDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[1])
sub.groupby('swimDistanceBin').mean()['winPlacePerc'].plot.bar(ax=ax[2])
del sub, walk

##### winPlacePerc

In [None]:
df['winPlacePerc'].describe()

In [None]:
print('match count:', df['matchId'].nunique())

# not contains 1st place
maxPlacePerc = df.groupby('matchId')['winPlacePerc'].max()
print('match [not contains 1st place]:', len(maxPlacePerc[maxPlacePerc != 1]))
del maxPlacePerc

# edge case
sub = df[(df['maxPlace'] > 1) & (df['numGroups'] == 1)]
print('match [maxPlace>1 & numGroups==1]:', len(sub.groupby('matchId')))
print('Unique winPlacePerc:', sub['winPlacePerc'].unique())

In [None]:
pd.concat([df[df['winPlacePerc'] == 1].head(5),
           df[df['winPlacePerc'] == 0].head(5)],
          keys=['winPlacePerc_1', 'winPlacePerc_0'])

#### Match Summary

In [None]:
cols = ['kills','teamKills','DBNOs','revives','assists','boosts','heals','damageDealt',
    'walkDistance','rideDistance','swimDistance','weaponsAcquired']

aggs = ['count','min','mean','max']
# summary of solo-match
grp = df.loc[df['matchType'].str.contains('solo')].groupby('matchId')
grpSolo = grp[cols].sum()
# summary of team-match
grp = df.loc[~df['matchType'].str.contains('solo')].groupby('matchId')
grpTeam = grp[cols].sum()

pd.concat([grpSolo.describe().T[aggs], grpTeam.describe().T[aggs]], keys=['solo', 'team'], axis=1)

In [None]:
grpSolo.nlargest(5, 'kills') 

In [None]:
grpTeam.nlargest(5, 'kills')

#### vehicleDestroys

In [None]:
df.vehicleDestroys.value_counts()

In [None]:
f,ax1 = plt.subplots(figsize =(20,10))
sns.pointplot(x='vehicleDestroys',y='winPlacePerc',data=df,color='#606060',alpha=0.8)
plt.xlabel('Number of Vehicle Destroys',fontsize = 15,color='blue')
plt.ylabel('Win Percentage',fontsize = 15,color='blue')
plt.title('Vehicle Destroys/ Win Ratio',fontsize = 20,color='blue')
plt.grid()
plt.show()

>As we can see, Destroying a vehicle in my experience shows that a player has skills.

>And Destroying a single vehicle increases your chances of winning

### Correlation

In [None]:
cols = ['kills','teamKills','DBNOs','revives','assists','boosts','heals','damageDealt',
    'walkDistance','rideDistance','swimDistance','weaponsAcquired']
cols.extend(['killPlace','winPlacePerc'])
group = df.groupby(['matchId','groupId'])[cols]

fig, ax = plt.subplots(3, 1, figsize=(12, 18), sharey=True)
for df, ax in zip([group.mean(), group.min(), group.max()], ax.ravel()):
    sns.heatmap(df.corr(), annot=True, linewidths=.6, fmt='.2f', vmax=1, vmin=-1, center=0, cmap='Blues', ax=ax)

del df

>There is a small correlation between rideDistance and winPlacePerc.

>In terms of the target variable (winPlacePerc), there are a few variables high medium to high correlation. The highest positive correlation is walkDistance and the highest negative the killPlace.

#### Let's zoom to the top-5 most positive correlated variables with the target.

In [None]:
#k = 5 #number of variables for heatmap
#f,ax = plt.subplots(figsize=(11, 11))
#cols = df.corr().nlargest(k, 'winPlacePerc')['winPlacePerc'].index
#cm = np.corrcoef(df[cols].values.T)
#sns.set(font_scale=1.25)
#hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
# plt.show()

In [None]:
sns.set()
cols = ['winPlacePerc', 'walkDistance', 'boosts', 'weaponsAcquired', 'damageDealt', 'killPlace']
sns.pairplot(df[cols], size = 2.5)
plt.show()