In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Gathering Data

In [None]:
data=pd.read_csv('/kaggle/input/pubg-finish-placement-prediction/train_V2.csv')

# Assessing Data

In [None]:
data.head(5)

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
data['matchType'].unique()

### Changing matchType column to category

In [None]:
data=data.astype({'matchType':'category'})

##### Testing

In [None]:
data.info()

# Analysis

# Univariate analysis

# Function to find distributions

In [None]:
def pdfcdf(x,bins):
    """Plot the empirical probability and cumulative distributions of ``x``.

    The observations are binned with ``np.histogram``. The share of
    observations in each bin and the running total of those shares are drawn
    on the current matplotlib axes against each bin's right edge, followed by
    a title and a legend.

    Parameters
    ----------
    x : array-like
        Numeric observations to bin.
    bins : int or sequence
        Forwarded unchanged to ``np.histogram``.
    """
    counts, edges = np.histogram(x, bins=bins)
    total = counts.sum()
    # When no observation lands in any bin the total is zero; dividing by it
    # would turn both curves into NaN, so keep them flat at zero instead.
    pdf = counts / total if total else np.zeros(len(counts))
    cdf = np.cumsum(pdf)
    plt.plot(edges[1:], pdf, label='Probability Density Function')
    plt.plot(edges[1:], cdf, label='Cumulative Density Function')
    plt.title('Probability and Cumulative density graphs')
    plt.legend(loc="right")

# Function to assess discrete variables

In [None]:
def assess_discreet(x):
    """Summarise a discrete (integer-valued) column with a table and three plots.

    Prints the frequency of every value, then shows a box plot, a pie chart of
    the value frequencies and the probability/cumulative curves drawn by
    ``pdfcdf``.

    Parameters
    ----------
    x : pandas.Series
        Integer-valued column; only ``value_counts``, ``min`` and ``max`` are
        used here before the column is handed to the plotting helpers.
    """
    frequencies = x.value_counts()
    print(frequencies)
    # Pass the vector by keyword so it is always drawn along the x axis; a lone
    # positional argument is interpreted differently across seaborn releases.
    sns.boxplot(x=x)
    plt.show()
    frequencies.plot(kind='pie', autopct='%0.2f')
    plt.show()
    # One unit-wide bin centred on every integer from min to max. A plain bin
    # count of int(x.max()) puts the two largest values into the same (closed)
    # last bin and is rejected by np.histogram when the maximum is 0.
    edges = np.arange(int(x.min()), int(x.max()) + 2) - 0.5
    pdfcdf(x, edges)
    plt.show()

# Function to assess continuous variables

In [None]:
def assess_continuous(x):
    """Show the distribution of a continuous column in three figures.

    Draws a violin plot, a histogram/KDE with a rug, and the
    probability/cumulative curves from ``pdfcdf``; each figure is titled and
    shown separately.

    Parameters
    ----------
    x : pandas.Series
        Numeric column; ``x.max()`` sets the number of bins for ``pdfcdf``.
    """
    # Keyword form keeps the violin horizontal regardless of how the installed
    # seaborn release interprets a lone positional argument.
    sns.violinplot(x=x)
    plt.title('Violin plot of all the players')
    plt.show()
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept so
    # the notebook still runs on the seaborn version it was written against.
    sns.distplot(x, rug=True)
    plt.title('Distplot for all the players')
    plt.show()
    # Roughly one bin per whole unit, but never fewer than one: int() of a
    # maximum below 1 is 0, which np.histogram refuses as a bin count.
    pdfcdf(x, max(int(x.max()), 1))
    plt.title('Distribution functions for all players')
    plt.show()

# Function to get distplot of 90% of the players

In [None]:
def most_players(x, quantile=0.90):
    """Draw the distribution of ``x`` with the upper tail cut off.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to plot.
    quantile : float, optional
        Share of players to keep, as a fraction in [0, 1]. Defaults to 0.90,
        which reproduces the original fixed cut-off and title.
    """
    cutoff = x.quantile(quantile)
    # `<=` keeps players tied at the cut-off. With a strict `<` a heavily tied
    # column (e.g. mostly zeros) loses every tied row and can end up empty.
    sns.distplot(x[x <= cutoff], rug=True, color='orange')
    plt.title('Distplot for {:.0%} players'.format(quantile))

## Analysing the win percentage

In [None]:
# Box plot of the finishing percentile. The column is passed by keyword so the
# box is always drawn along the x axis, whichever seaborn release is installed.
sns.boxplot(x=data['winPlacePerc'])

##### Conclusion: Most players make it to slightly less than 50% of the game

## Analysing matchType

In [None]:
data['matchType'].value_counts()

In [None]:
data['matchType'].value_counts().plot(kind='pie', autopct='%0.2f')

## Analysing assists

In [None]:
assess_discreet(data['assists'])
# Derive one unit-wide bin per integer value from the data instead of
# hard-coding the bin count: when the count equals the column maximum, the two
# largest values end up sharing the last bin.
plt.hist(data['assists'], bins=np.arange(data['assists'].max() + 2) - 0.5)

## Assessing boosts

In [None]:
assess_discreet(data['boosts'])
# Derive one unit-wide bin per integer value from the data instead of
# hard-coding the bin count: when the count equals the column maximum, the two
# largest values end up sharing the last bin.
plt.hist(data['boosts'], bins=np.arange(data['boosts'].max() + 2) - 0.5)

## Assessing damage dealt

In [None]:
assess_continuous(data['damageDealt'])

In [None]:
most_players(data['damageDealt'])

## Assessing knockouts (DBNOs)

In [None]:
assess_discreet(data['DBNOs'])

In [None]:
most_players(data['DBNOs'])

## Assessing number of headshotKills

In [None]:
assess_discreet(data['headshotKills'])

## Assessing heals

In [None]:
data['heals'].unique()

In [None]:
assess_discreet(data['heals'])

In [None]:
most_players(data['heals'])

## Assessing killPlace

In [None]:
# Box plot followed by the probability/cumulative curves over 100 bins.
# The column is passed by keyword so the box is drawn along the x axis on every
# seaborn release.
sns.boxplot(x=data['killPlace'])
plt.show()
pdfcdf(data['killPlace'],100)

##### Conclusion: evenly distributed because there has to be one team/player at every rank

## Assessing killPoints

In [None]:
assess_continuous(data['killPoints'])

##### Conclusion: Every player starts with a minimum kill point

## Assessing kills

In [None]:
assess_discreet(data['kills'])

In [None]:
most_players(data['kills'])

In [None]:
print('Percentage of players who did not kill a single player={0:.03f}%'.format(len(data[data['kills']==0])/len(data)*100))
print('Maximum number of kills in a match=',data['kills'].max())

##### Conclusion: The lower the kill count, the more players have it, so most players finish a match with few kills

## Assessing killStreaks

In [None]:
data['killStreaks'].unique()

In [None]:
assess_discreet(data['killStreaks'])

##### Conclusion: Few players are experienced enough to have a good kill streak

## Assessing longestKill

In [None]:
assess_continuous(data['longestKill'])
most_players(data['longestKill'])

In [None]:
# Only players with at least one kill have a kill distance at all. Counting
# longestKill == 0 over every row would treat players who never killed anyone
# as melee attackers and drag the average distance towards zero.
# NOTE(review): assumes rows with kills == 0 carry longestKill == 0 — confirm against the data.
killers = data[data['kills'] > 0]
print('Percentage of melee/point-blank attacks={0:.3f}%'.format((killers['longestKill'] == 0).mean() * 100))
print('Average kill distance={0:.3f}m'.format(killers['longestKill'].mean()))
print('Longest kill distance={0:.3f}m'.format(data['longestKill'].max()))

##### Conclusion: Close-range kills are the most common kind of kill. There is a good number of melee/point-blank attacks as well

## Assessing matchDuration

In [None]:
assess_continuous(data['matchDuration'])

In [None]:
#Finding upper and lower quartiles
print('Most common range of match duration={}s to {}s'.format(np.percentile(data['matchDuration'],25),np.percentile(data['matchDuration'],75)))

##### Conclusion: There are two peaks, which indicates that a match most commonly ends with either a quick open fight or after a long duration of camping and hiding

## Assessing maxPlace

In [None]:
data['maxPlace'].unique()

In [None]:
assess_discreet(data['maxPlace'])

## Assessing numGroups

In [None]:
data['numGroups'].unique()

In [None]:
len(data['numGroups'].unique())

In [None]:
assess_discreet(data['numGroups'])

##### Observation: Highly correlated with maxPlace

In [None]:
data[['numGroups','maxPlace']].corr()

## Assessing rankPoints

In [None]:
assess_continuous(data['rankPoints'])

In [None]:
most_players(data['rankPoints'])

In [None]:
print('Range of ranks={} to {}'.format(data['rankPoints'].min(),data['rankPoints'].max()))
print('Percentage of new players={0:.4}%'.format(len(data[data['rankPoints']==-1])/len(data)*100))

## Assessing revives

In [None]:
data['revives'].unique()

In [None]:
assess_discreet(data['revives'])

##### Conclusion: most players do not get a revive/need a revive/die before they could be revived

## Assessing rideDistance

In [None]:
assess_continuous(data['rideDistance'])

In [None]:
#Finding upper and lower quartiles of players who have used a vehicle
print('Most common distance ridden in matches={}m to {}m'.format(np.percentile(data[data['rideDistance']!=0]['rideDistance'],25),np.percentile(data[data['rideDistance']!=0]['rideDistance'],75)))
#games where ride distance =0 (no vehicle was ridden)
print('Percentage of games where players do not use a vehicle={0:.03f}%'.format((len(data[data['rideDistance']==0])/len(data)*100)))

## Assessing roadKills

In [None]:
assess_discreet(data['roadKills'])

##### Conclusion: Most players do not use vehicles to kill enemies, it is a rare tactic

## Assessing swimDistance

In [None]:
# Violin plot of the distance swum. The column is passed by keyword so the
# violin is drawn along the x axis on every seaborn release.
sns.violinplot(x=data['swimDistance'])

In [None]:
pdfcdf(data['swimDistance'],40)

In [None]:
#Finding upper and lower quartiles of players who have swum at all
swam = data.loc[data['swimDistance'] != 0, 'swimDistance']
print('Most common distance swam in matches={}m to {}m'.format(np.percentile(swam,25),np.percentile(swam,75)))
#games where swim distance = 0 (the player never swam); this cell previously
#repeated the vehicle statistic computed from rideDistance
print('Percentage of games where players do not swim={0:.03f}%'.format((len(data[data['swimDistance']==0])/len(data)*100)))

## Assessing teamKills

In [None]:
data['teamKills'].unique()

In [None]:
assess_discreet(data['teamKills'])

##### Conclusion: most players do not kill team mates

## Assessing vehicleDestroys

In [None]:
assess_discreet(data['vehicleDestroys'])

In [None]:
print('Maximum vehicles destroyed by a player in a game=',data['vehicleDestroys'].max())

##### Conclusion: Most players do not destroy a vehicle in a game

## Assessing walkDistance

In [None]:
assess_continuous(data['walkDistance'])

In [None]:
most_players(data['walkDistance'])

In [None]:
#Finding upper and lower quartiles of the distance walked by all players
print('Most common distance walked in matches={}m to {}m'.format(np.percentile(data['walkDistance'],25),np.percentile(data['walkDistance'],75)))

In [None]:
print('Average ride distance=',np.mean(data['rideDistance']))
print('Median ride distance=',np.median(data['rideDistance']))
print('Average walk distance=',np.mean(data['walkDistance']))
print('Median walk distance=',np.median(data['walkDistance']))

##### Conclusion: on average, players move on foot more than in a vehicle

## Assessing weapons acquired

In [None]:
data['weaponsAcquired'].unique()

In [None]:
assess_discreet(data['weaponsAcquired'])

In [None]:
#Finding upper and lower quartiles of players picking up weapons
print('Most common number of weapons acquired={} to {}'.format(np.percentile(data['weaponsAcquired'],25),np.percentile(data['weaponsAcquired'],75)))

In [None]:
#Upper outlier fence: Q3 + 1.5 * IQR, with the quartiles taken from the data
#rather than typed in by hand (the original printed 1.5 * IQR only, without adding Q3)
q1, q3 = np.percentile(data['weaponsAcquired'], [25, 75])
print('Upper outlier=', q3 + 1.5 * (q3 - q1))

In [None]:
#Taking 11 as maximum by excluding outliers
#The values 0..11 are twelve distinct integers, so use twelve unit-wide bins
#centred on them; bins=11 would merge the counts for 10 and 11.
plt.hist(data[data['weaponsAcquired']<=11]['weaponsAcquired'], bins=np.arange(13) - 0.5)

## Assessing winPoints

In [None]:
len(data['winPoints'].unique())

In [None]:
# Smallest non-zero winPoints value. Series.min() is vectorised, whereas the
# builtin min() walks the whole column element by element in Python.
data.loc[data['winPoints']>0, 'winPoints'].min()

In [None]:
assess_continuous(data['winPoints'])

##### Conclusion: Around 60% of players have win points under 1500, and within that 60% most players have 0; the rest start at a minimum of around 300.

# Conclusion on Univariate analysis:
### Highly skewed data since most players are eliminated near the first half of the game

# Multivariate Analysis

In [None]:
f,ax = plt.subplots(figsize=(10, 10))
# Correlate numeric columns only: the frame also holds string ids and the
# categorical matchType, which current pandas refuses to pass through corr().
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, linewidths=.5, fmt= '.1f', ax=ax)
# The caption was previously passed as `label=`, which heatmap does not render.
ax.set_title('Correlation between variables')

In [None]:
f,ax = plt.subplots(figsize=(10, 10))
print('Heatmap of correlations where the players have won the match:')
# Correlate numeric columns only (string ids and the categorical matchType are
# rejected by corr() on current pandas). winPlacePerc is constant in this
# subset, so its own row and column carry no defined correlation.
winners = data[data['winPlacePerc']==1].select_dtypes(include='number')
sns.heatmap(winners.corr(), annot=True, linewidths=.5, fmt= '.1f', ax=ax)
# The caption was previously passed as `label=`, which heatmap does not render.
ax.set_title('Correlation between variables')

In [None]:
print('Correlations of all variables with winning position percentage')
# Numeric columns only: corr() on current pandas raises on the string id
# columns and the categorical matchType instead of silently dropping them.
print(data.select_dtypes(include='number').corr().loc['winPlacePerc',:])

# Goal: to determine what to do and what not to do in order to win a game
### This implies the variable we shall be comparing against shall be winPlacePerc
### Also, only those fields which may have at least some logical relevance has been considered

# Tactical moves
##### In this section we shall see how the various tactical moves affect the probability of winning the game

## Assessing assists

In [None]:
sns.jointplot(x='winPlacePerc',y='assists',data=data)

##### Conclusion: Assisting team mates helps, but has very low relevance, with a player winning the game

## Assessing damage dealt

In [None]:
sns.jointplot(x='winPlacePerc',y='damageDealt',data=data, color='r')

##### Conclusion: damage dealt by a player has a good correlation with winning matches.

## Assessing number of knock outs

In [None]:
sns.jointplot(x='winPlacePerc',y='DBNOs',data=data, color='orange')

##### Conclusion: The number of knockouts has a correlation with winning, but it is not very prominent

## Assessing kills

In [None]:
sns.jointplot(x='winPlacePerc',y='kills',data=data, color='grey')

##### Conclusion: Number of kills has a good correlation to winning a match

## Assessing kill rank

In [None]:
sns.jointplot(x='winPlacePerc',y='killPlace',data=data, color='lime')

##### Conclusion: Killing more than required enemies actually has a negative correlation with winning, thus showing in order to win a PUBG game one has to kill just as many as required, and not look for killing more players as this increases the risk of getting killed in turn

## Assessing team kills

In [None]:
sns.jointplot(x='winPlacePerc',y='teamKills',data=data, color='blue')

##### Conclusion: Team kills have virtually no relevance to a player winning the match. Interestingly, team mates are killed mostly either at the very beginning or the very end of the game.

## Concluding Tactical moves:

In [None]:
# Mean winPlacePerc against each tactical count, all on one shared axis.
f,ax = plt.subplots(figsize =(20,10))
sns.lineplot(x='assists', y='winPlacePerc', data=data, color='black',ax=ax)
sns.lineplot(x='DBNOs', y='winPlacePerc', data=data, color='orange', ax=ax)
sns.lineplot(x='kills', y='winPlacePerc', data=data, color='grey', ax=ax)
sns.lineplot(x='killPlace', y='winPlacePerc', data=data, color='lime', ax=ax)
#Selecting only team games for team kills. Filtering on numGroups>25 does not
#do that, since a match with many groups is not a team match, so the match type
#is used instead.
#NOTE(review): assumes every solo mode has 'solo' in its matchType name — confirm
#against data['matchType'].unique().
team_games = data[~data['matchType'].astype(str).str.contains('solo')]
sns.lineplot(x='teamKills', y='winPlacePerc', data=team_games, color='blue', ax=ax)
# Hand-placed colour key: one text label per line, stacked at x=80.
plt.text(80,0.8,'Assists',fontsize = 17,style = 'italic')
plt.text(80,0.75,'Knockouts',color='orange',fontsize = 17,style = 'italic')
plt.text(80,0.70,'Kills',color='grey',fontsize = 17,style = 'italic')
plt.text(80,0.65,'Kill rank',color='lime',fontsize = 17,style = 'italic')
plt.text(80,0.60,'Team kills',color='blue',fontsize = 17,style = 'italic')
plt.xlabel('Tactical moves')
plt.grid()
plt.show()

### Conclusion: Tactical and offensive moves generally increase the chances of winning the match greatly, but players should try to kill only as many as required as seeking to kill more enemies affect the win percentage negatively

# Health moves

##### In this section we shall see how making health moves affects the probability of winning the game

## Assessing boosts

In [None]:
sns.jointplot(x='winPlacePerc',y='boosts',data=data, color='red')

##### Conclusion: Boosting health frequently gives a better chance of winning the game. Boosting health shows a very strong correlation

## Assessing heals

In [None]:
sns.jointplot(x='winPlacePerc',y='heals',data=data, color='lime')

#### Conclusion: Healing shows a strong positive correlation to winning the game. The more a player heals and keeps at good health, the better chances the player has of winning

## Assessing revives

In [None]:
sns.jointplot(x='winPlacePerc',y='revives',data=data, color='green')

##### Conclusion: Revives have some correlation with winning percentage, but it is not crucial enough to determine the outcome. It is a good practice to revive ones team mates

# Concluding health moves

In [None]:
# Mean winPlacePerc against each health-related count, drawn on one shared axis.
f, ax = plt.subplots(figsize=(20, 10))
for column, shade in (('boosts', 'red'), ('heals', 'lime'), ('revives', 'green')):
    sns.lineplot(x=column, y='winPlacePerc', data=data, color=shade, ax=ax)
# Hand-placed colour key: one italic label per line, stacked at x=70.
for height, caption, shade in ((0.65, 'Revives', 'green'),
                               (0.60, 'Boosts', 'red'),
                               (0.55, 'Heals', 'lime')):
    plt.text(70, height, caption, color=shade, fontsize=17, style='italic')
plt.xlabel('Healing moves')
plt.grid()
plt.show()

### Conclusion: Health moves always helps increase the chances of winning the game. Personal health, by boosting and healing, are more important than the health of team mates like by giving revives.

# Weapons acquired
### This factor has not been clubbed with any other factor and we shall analyse it as a stand alone factor

In [None]:
sns.jointplot(x='winPlacePerc',y='weaponsAcquired',data=data, color='red')

### Conclusion: It is observed that the number of weapons acquired throughout the match has no particular relevance to winning a match

# Tactical skills
### This section is dedicated to finding how special tactical skills, which are not covered by the sections above, affect the chance of winning

## Assessing headshot kills

In [None]:
sns.jointplot(x='winPlacePerc',y='headshotKills',data=data, color='orange')

##### Conclusion: Headshot kills require considerable skills, and it shows that the more skilled the players are, the better chance of winning they have, since it shows a low, but positive correlation.

##### More than a general correlation, there is a sharp spike for the winning teams. These consist of sniping enemies using headshots to win matches near the end, showing headshot sniping is a preferred skill to have to make the final push to the win (domain knowledge 💪😎)

## Assessing kill ranges

In [None]:
sns.jointplot(x='winPlacePerc',y='longestKill',data=data, color='blue')

##### Conclusion: The greater the range of kills, the better the chance of winning. Long range kills show a strong correlation to winning percentage, showing that shooting enemies from a distance is a very preferred skill to have. The further away you can kill an enemy from,
1. the better skill you have
2. the safer you are and you survive more

##### This also shows that it is best not to engage in close combat with enemies

## Assessing kill points

In [None]:
sns.jointplot(x='winPlacePerc',y='killPoints',data=data, color='green')

##### Conclusion: kill points have a very weak correlation with winning a match and thus do not affect it. It is equally probable to win with a low or a high kill-point rating.

# Concluding tactical skills

In [None]:
# Overlay the three tactical-skill columns against the finishing percentile.
for column, shade, caption in (('longestKill', 'red', 'Longest kill'),
                               ('killPoints', 'green', 'Kill points'),
                               ('headshotKills', 'blue', 'Headshot kills')):
    plt.scatter(data['winPlacePerc'], data[column], color=shade, alpha=0.2, label=caption)
plt.xlabel('Win place percentage')
plt.legend(loc="upper right")
plt.ylabel('Tactical skills')
plt.grid()

### Conclusion: Weapon tactical skills such as the ability to kill from a distance and the ability to make headshot kills definitely boost the chances of winning the game, whereas increasing the kill rating has no relevance to winning games whatsoever

# Travelling
### This section includes how much players have travelled in different modes of travelling

## Assessing ride distance

In [None]:
sns.jointplot(x='winPlacePerc',y='rideDistance',data=data, color='black')

##### Conclusion: It shows a positive correlation with winning, showing that players who position themselves well by moving with the help of vehicles are better off for winning the match

## Assessing walk distance

In [None]:
sns.jointplot(x='winPlacePerc',y='walkDistance',data=data, color='pink')

##### Conclusion: Players who walk a lot have a very high chance of winning the matches. This shows that mostly walking to position themselves better have a far better chance of winning. This also shows that stealth gives a better advantage to winning the game since walking is stealthier than riding a vehicle since walking has a higher correlation with winning than riding

## Assessing swim distance

In [None]:
sns.jointplot(x='winPlacePerc',y='swimDistance',data=data, color='blue')

##### Conclusion: Swimming is an extreme measure in PUBG and it is rare; and those who do it to survive have a better chance of winning when need comes. The relevance is low because swimming also makes players very vulnerable to attacks and is a last ditch survival method.

# Concluding travelling

In [None]:
# Overlay the three travel-distance columns against the finishing percentile.
plt.scatter(data['winPlacePerc'], data['rideDistance'], color='black',alpha=0.7, label='Ride Distance')
plt.scatter(data['winPlacePerc'], data['walkDistance'],color='pink', alpha=0.7, label='Walk Distance')
plt.scatter(data['winPlacePerc'], data['swimDistance'],color='blue', alpha=0.7, label='Swim Distance')
plt.xlabel('Win place percentage')
plt.legend(loc="upper right")
# The y axis holds distances; the label was previously 'Tactical skills',
# copied from the tactical-skills figure.
plt.ylabel('Distance travelled')
plt.grid()

### Conclusion: Travelling to take proper position in the battlefield provides a greater probability to win.

# Match played
### This section discusses about the attributes of the match played such as the category of the match and match duration

## Assessing Match type

In [None]:
# Strip plot of winPlacePerc for every match type. No plt.subplots() call is
# made here: catplot is a figure-level function and creates its own figure.
sns.catplot(x='matchType', y='winPlacePerc', data=data, kind='strip')

##### Conclusion: Regardless of the mode the game is played in, all match modes provide to players an equal chance of winning.

## Assessing match duration

In [None]:
sns.jointplot(x='winPlacePerc',y='matchDuration',data=data, color='brown')

##### Conclusion: Match duration has no correlation whatsoever with the outcome of the match

### Concluding match types:

# Summing up the most influential points
### This section contains graphs of the factors that influence the chance of winning (or losing) a match the most

## positives:

### Discrete values

In [None]:
# Mean winPlacePerc against each of the strongest discrete factors, on one axis.
f, ax = plt.subplots(figsize=(20, 10))
for column, shade in (('headshotKills', 'red'),
                      ('boosts', 'lime'),
                      ('heals', 'brown'),
                      ('DBNOs', 'orange'),
                      ('kills', 'black')):
    sns.lineplot(x=column, y='winPlacePerc', data=data, color=shade, ax=ax)
# Hand-placed colour key: one italic label per line, stacked at x=70.
for height, caption, shade in ((0.75, 'Headshot Kills', 'red'),
                               (0.70, 'Heals', 'brown'),
                               (0.65, 'Boosts', 'lime'),
                               (0.60, 'Knockouts', 'orange'),
                               (0.55, 'Kills', 'black')):
    plt.text(70, height, caption, color=shade, fontsize=17, style='italic')
plt.xlabel('Winning factors')
plt.grid()
plt.show()

### Continuous values

In [None]:
# Overlay the strongest continuous factors against the finishing percentile.
for column, shade, caption in (('walkDistance', 'black', 'Walk Distance'),
                               ('damageDealt', 'yellow', 'Damage Dealt'),
                               ('longestKill', 'blue', 'Longest Kill')):
    plt.scatter(data['winPlacePerc'], data[column], color=shade, alpha=0.7, label=caption)
plt.xlabel('Win place percentage')
plt.legend(loc="upper right")
plt.ylabel('Winning factors')
plt.grid()
plt.show()

## Negative factor

In [None]:
sns.jointplot(x='winPlacePerc',y='killPlace',data=data, color='purple')

# Final synopsis
### To win a game in PUBG, we have to take care of the following factors:
1. Try to avoid engaging enemy from close quarters. Always try to shoot enemies from a distance.
2. Try to take proper position in the battle, especially by walking, since it is stealthier and harder to detect and hence reduces the chance of having to face an enemy
3. Fight enemies only when required, do not seek out extra enemies. Kill points make no difference to the outcome but a higher kill ranking (killPlace) results in a lower probability of winning.
4. When required, kill an enemy; do not just knock them out or damage them. There is very high correlation between damage dealt, players knocked out and kills, and all three have a high correlation with win percentage. This means it is best to make sure that the enemy is dead and not just injured.
5. It is best to take an enemy out with a headshot because it has a positive correlation with winning. Combining this point with point 1, it is understood it is best to use a sniper to take enemies down from a distance to increase chances of winning
6. Match duration has no correlation with winning. A match may be fought either quickly or by camping and hiding, and it would not have much effect on the outcome.
7. It is extremely helpful to take repeated boosts and keep healed all the time. Both these factors have a very high correlation with win percentage and are therefore recommended.
8. It does not make a difference on how many weapons a player has acquired, it has no correlation with win percentage. An average player uses 2-5 weapons in a match.