<center><img src = "https://media.giphy.com/media/XVbrX433vn6rqkexSj/giphy.gif"></center>


## Importing Libraries

In [None]:
# Handling warnings
import warnings
warnings.filterwarnings("ignore")

# Importing standard libraries
import numpy as np
import pandas as pd

# Visualisation
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (16,6)

import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import catboost as cb
from sklearn.metrics import mean_squared_error , r2_score

## Reading The Data

In [None]:
# Load the match statistics into a DataFrame and preview the first rows.
DATA_PATH = 'PUBG_Game_Prediction_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts).
df.info()

## Data Descriptions

<ul type='circle'>
    <li><b> DBNOs </b> - Number of enemy players knocked.
    <li><b> assists </b> - Number of enemy players this player damaged that were killed by teammates.
    <li><b> boosts  </b> - Number of boost items used.
    <li><b> damageDealt  </b> -Total damage dealt. Note: Self inflicted damage is subtracted.
    <li><b> headshotKills  </b> - Number of enemy players killed with headshots.
    <li><b> heals  </b> - Number of healing items used.
    <li><b> Id  </b> -  Player’s Id.
    <li><b> killPlace </b> -Ranking in match of number of enemy players killed.
    <li><b> killPoints  </b> -Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
    <li><b> killStreaks </b> - Max number of enemy players killed in a short amount of time.
    <li><b> kills  </b> - Number of enemy players killed.
    <li><b> longestKill  </b> -Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
    <li><b> matchDuration  </b> - Duration of match in seconds
    <li><b> matchId </b> -ID to identify match. There are no matches that are in both the training and testing set.
    <li><b> matchType  </b> - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
    <li><b> rankPoints  </b> - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
    <li><b> revives  </b> - Number of times this player revived teammates.
    <li><b> rideDistance </b> -  Total distance traveled in vehicles measured in meters.
    <li><b> roadKills </b> - Number of kills while in a vehicle.
    <li><b> swimDistance  </b> - Total distance traveled by swimming measured in meters.
    <li><b> teamKills  </b> - Number of times this player killed a teammate
    <li><b> vehicleDestroys  </b> -Number of vehicles destroyed
    <li><b> walkDistance  </b> - Total distance traveled on foot measured in meters.
    <li><b> weaponsAcquired  </b> - Number of weapons picked up.
    <li><b> winPoints  </b> - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
    <li><b> groupId  </b> -ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
    <li><b> numGroups  </b> -Number of groups we have data for in the match.
    <li><b> maxPlace  </b> - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
    <li><b> winPlacePerc  </b> -The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.
</ul>

## Data Wrangling

###### Checking the rows for missing winPlacePerc

<center><img src = "https://media.giphy.com/media/OPRbXcsGctvZC/giphy.gif"></center>

In [None]:
# Rows without a target value cannot be used for training; remove them in place.
df.dropna(subset=['winPlacePerc'], inplace=True)

Prepare a new feature that records the number of players who joined each match

In [None]:
# Count the rows sharing each matchId and broadcast that count back to every row.
players_per_match = df.groupby('matchId')['matchId'].transform('count')
df['PlayerJoined'] = players_per_match
df.head()

In [None]:
# Distribution of match sizes, restricted to matches with at least 75 rows.
# The values are passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, so a positional Series is not plotted
# as the counted variable.
crowded_matches = df.loc[df['PlayerJoined'] >= 75, 'PlayerJoined']
sns.countplot(x=crowded_matches)
plt.show()

## Analysing the data

###### Kill without moving?

In [None]:
# Total distance travelled = vehicle + swimming + walking distance.
distance_by_vehicle_and_swim = df['rideDistance'].add(df['swimDistance'])
df['totalDistance'] = distance_by_vehicle_and_swim.add(df['walkDistance'])

# Flag players that registered kills although they never moved.
never_moved = df['totalDistance'].eq(0)
has_kills = df['kills'].gt(0)
df['killWithoutMoving'] = never_moved & has_kills

In [None]:
# Remove every row flagged as "kill without moving".
stationary_killers = df[df['killWithoutMoving'] == True]
df.drop(index=stationary_killers.index, inplace=True)

###### Extra-ordinary Road kills

<center><img src = "https://media.giphy.com/media/3o7aD85usFbbbrCR3i/giphy.gif"></center>

In [None]:
# (rows, columns) of the players with more than 5 road kills.
many_road_kills = df['roadKills'].gt(5)
df.loc[many_road_kills].shape

In [None]:
# Discard players with more than 5 road kills.
excessive_road_kills = df['roadKills'] > 5
df.drop(index=df[excessive_road_kills].index, inplace=True)

###### Finding the number of kills made by each player

<center><img src = "https://media.giphy.com/media/3oxHQfzBhpKtgdDWtW/giphy.gif"></center>

In [None]:
# Bar chart of how many players achieved each kill count.
kills_ax = sns.countplot(data=df, x=df['kills'])
kills_ax.set_title("Distribution of kills by the person")
kills_ax.set_xlabel("kills")
kills_ax.set_ylabel("No of players done that kill")
plt.show()

In [None]:
# Zoom in on the tail of the kill distribution (12 kills or more).
# Only the filtered vector is handed to seaborn: combining it with the
# unfiltered `data=df` relied on implicit index alignment. The title now
# matches the `>= 12` filter that is actually applied.
high_kills = df.loc[df['kills'] >= 12, 'kills']
sns.countplot(x=high_kills).set_title("Distribution of kills by the person >= 12")
plt.xlabel("kills")
plt.ylabel("No of players done that kill")
plt.show()

In [None]:
# Discard players with more than 20 kills.
too_many_kills = df['kills'] > 20
df.drop(index=df[too_many_kills].index, inplace=True)

###### Head Shot

<center><img src = "https://media.giphy.com/media/l3mZrOajz5VCZf7Hy/giphy.gif"></center>

In [None]:
# Share of kills that were headshots; the 0/0 case (no kills) yields NaN,
# which is replaced by 0.
headshot_share = df['headshotKills'] / df['kills']
df['headshot_rate'] = headshot_share.fillna(0)

In [None]:
# Histogram of the headshot rate. `sns.distplot` is deprecated and plots a
# density, which contradicted the "Count of players" axis label; `histplot`
# shows real counts and keeps the KDE overlay via `kde=True`.
sns.histplot(df['headshot_rate'], bins=20, kde=True).set_title("Distribution of HeadShot rate")
plt.ylabel("Count of players")
plt.show()

In [None]:
# (rows, columns) of players with a headshot rate of at least 1 and more than 5 kills.
only_headshots = df['headshot_rate'].ge(1)
more_than_five_kills = df['kills'].gt(5)
df.loc[only_headshots & more_than_five_kills].shape

In [None]:
# Discard players with a headshot rate of at least 1 and more than 5 kills.
only_headshots = df['headshot_rate'].ge(1)
more_than_five_kills = df['kills'].gt(5)
df.drop(index=df[only_headshots & more_than_five_kills].index, inplace=True)

## Longest Shot

<center><img src = "https://media.giphy.com/media/3ohs7YomxqOz4GRHcQ/giphy.gif"></center>

In [None]:
# Histogram of the longest kill distance. `sns.distplot` is deprecated and
# plots a density, which contradicted the "Count of Players" axis label;
# `histplot` shows real counts and keeps the KDE overlay via `kde=True`.
sns.histplot(df['longestKill'], bins=50, kde=True).set_title('Histogram Showing Longest Kill')
plt.ylabel("Count of Players")
plt.show()

In [None]:
# (rows, columns) of players whose longest kill exceeds 500.
very_long_kill = df['longestKill'].gt(500)
df.loc[very_long_kill].shape

In [None]:
# Discard players whose longest kill exceeds 500.
very_long_kill = df['longestKill'] > 500
df.drop(index=df[very_long_kill].index, inplace=True)

## Weapon Change

In [None]:
# Histogram of the number of weapons picked up per player. `sns.distplot` is
# deprecated and plots a density; `histplot` shows real counts and keeps the
# KDE overlay via `kde=True`. The y axis counts players (the x axis already
# carries the weapon count), so the label is corrected accordingly.
sns.histplot(df['weaponsAcquired'], bins=100, kde=True).set_title('Histogram of weapon Acquired')
plt.ylabel("Count of Players")
plt.show()

In [None]:
# (rows, columns) of players who acquired 15 or more weapons.
many_weapons = df['weaponsAcquired'].ge(15)
df.loc[many_weapons].shape

In [None]:
# Discard players who acquired 15 or more weapons.
many_weapons = df['weaponsAcquired'] >= 15
df.drop(index=df[many_weapons].index, inplace=True)

## EDA

In [None]:
# (rows, columns) remaining after the filtering steps above.
df.shape

In [None]:
# Total number of missing cells in the whole frame (per-column counts, then summed).
missing_per_column = df.isna().sum()
missing_per_column.sum()

In [None]:
# Correlation of every feature with the others (including winPlacePerc).
# matchType is a string column (and presumably the ID columns are too), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict
# the frame to numeric/boolean columns before computing correlations.
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=[30,30])
sns.heatmap(numeric_df.corr() , annot = True)
plt.show()

In [None]:
# Rank features by their correlation with the target.
# Only numeric/boolean columns are correlated: DataFrame.corr() raises on
# string columns in pandas >= 2.0, and re-indexing the result with
# `columns=df.columns` only added all-NaN columns for them.
numeric_columns = df.select_dtypes(include=['number', 'bool'])
corr = numeric_columns.corr()
corr['winPlacePerc'].sort_values(ascending = False)

## Feature Engineering

In [None]:
# Scale factor per row: 1 for a match with 100 rows, growing by 0.01 for
# every player fewer than 100.
normalising_factor = 1 + (100 - df['PlayerJoined']) / 100

In [None]:
# List the columns currently present in df.
df.columns

In [None]:
# Scale match-size-dependent statistics by the normalising factor.
df['killsNorm'] = df['kills'] * normalising_factor
df['damageDealtNorm'] = df['damageDealt'] * normalising_factor
# Column name fixed: was misspelled 'macPlaceNorm'.
df['maxPlaceNorm'] = df['maxPlace'] * normalising_factor
df['matchDurationNorm'] = df['matchDuration'] * normalising_factor

# Aggregate related raw statistics into single features.
df['traveldistance'] = df['walkDistance'] + df['swimDistance'] + df['rideDistance']
df['healsnboosts'] = df['heals'] + df['boosts']
df['assist'] = df['assists'] + df['revives']

In [None]:
# List the columns again, now including the engineered features.
df.columns

In [None]:
# Identifier columns and raw statistics that are excluded from the modelling frame.
columns_to_drop = [
    'Id', 'groupId', 'matchId',
    'assists', 'boosts', 'damageDealt', 'DBNOs',
    'headshotKills', 'heals', 'kills',
    'matchDuration', 'maxPlace',
    'rideDistance', 'swimDistance', 'walkDistance',
]
data = df.drop(columns=columns_to_drop)

In [None]:
# Preview the modelling frame.
data.head()

## ML-Catboost Model

###### Handling categorical data

In [None]:
# Separate the target (winPlacePerc) from the feature matrix.
y = data['winPlacePerc']
x = data.drop(columns=['winPlacePerc'])

In [None]:
# One-hot encode the non-numeric feature columns.
categorical_columns = ['matchType', 'killWithoutMoving']
x = pd.get_dummies(x, columns=categorical_columns)

In [None]:
# Preview the encoded feature matrix.
x.head()

###### Splitting the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the feature/target shapes of the train split, then of the test split.
for features_part, target_part in ((x_train, y_train), (x_test, y_test)):
    print(features_part.shape, target_part.shape)

## CatBoost Model

In [None]:
import catboost as cb

In [None]:
# Wrap the train and test splits in CatBoost Pool objects (features + label).
train_dataset = cb.Pool(data=x_train, label=y_train)
test_dataset = cb.Pool(data=x_test, label=y_test)


In [None]:
# CatBoost regressor configured with the RMSE loss function.
model = cb.CatBoostRegressor(loss_function = 'RMSE')

In [None]:
# Hyper-parameter candidates explored by CatBoost's built-in grid search,
# which is run on the training pool.
grid = dict(
    iterations=[100, 200],
    learning_rate=[0.03, 0.01, 0.1],
    depth=[2, 4, 6, 8, 10],
)
model.grid_search(param_grid=grid, X=train_dataset)

In [None]:
# Table of feature importances, most important first.
# The original referenced an undefined name `features`; the feature names are
# the columns of the training matrix the model was fitted on.
feature_importance_df = pd.DataFrame({
    'features': x_train.columns,
    'importance': model.feature_importances_,
})

feature_importance_df = feature_importance_df.sort_values(by=['importance'] , ascending = False)

In [None]:
# Display the sorted feature-importance table.
feature_importance_df

In [None]:
# Bar chart of feature importances.
# `plt.bar` takes `color`, not `colors`; the misspelled keyword is rejected.
plt.bar(feature_importance_df.features , feature_importance_df.importance , color='turquoise')
plt.ylabel('Feature Importance')
plt.xlabel('Features')
plt.show()

## Prediction

In [None]:
# Predict the target for the held-out test features.
pred = model.predict(x_test)

In [None]:
# Evaluate the predictions on the held-out test set.
# Fixes: `nq` -> `np`, `mean_squred_error` -> `mean_squared_error`,
# `printf` -> `print`. The metrics take (y_true, y_pred) in that order;
# r2_score is not symmetric, so swapping them reports a wrong score.
rmse = np.sqrt(mean_squared_error(y_test , pred))
r2 = r2_score(y_test , pred)

print("Testing Performance")

print(f"RMSE = {rmse:0.2f}")
print(f"r2 Score = {r2:0.2f}")

<center><img src = "https://media.giphy.com/media/KB89dMAtH79VIvxNCW/giphy.gif"></center>