In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# PUBG Finish Placement Prediction Project
#### In this project we will work with a PUBG finish-placement data set that records the finishing placement of players. We will try to build a model that predicts a player's finishing placement in future matches based on the features of past matches.
#### This data set contains the following features:

> - DBNOs - Number of enemy players knocked.
- assists - Number of enemy players this player damaged that were killed by teammates.
- boosts - Number of boost items used.
- damageDealt - Total damage dealt. Note: Self inflicted damage is subtracted.
- headshotKills - Number of enemy players killed with headshots.
- heals - Number of healing items used.
- Id - Player’s Id
- killPlace - Ranking in match of number of enemy players killed.
- killPoints - Kills-based external ranking of player. (Think of this as an Elo ranking where only kills matter.) If there is a value other than -1 in rankPoints, then any 0 in killPoints should be treated as a “None”.
- killStreaks - Max number of enemy players killed in a short amount of time.
- kills - Number of enemy players killed.
- longestKill - Longest distance between player and player killed at time of death. This may be misleading, as downing a player and driving away may lead to a large longestKill stat.
- matchDuration - Duration of match in seconds.
- matchId - ID to identify match. There are no matches that are in both the training and testing set.
- matchType - String identifying the game mode that the data comes from. The standard modes are “solo”, “duo”, “squad”, “solo-fpp”, “duo-fpp”, and “squad-fpp”; other modes are from events or custom matches.
- rankPoints - Elo-like ranking of player. This ranking is inconsistent and is being deprecated in the API’s next version, so use with caution. Value of -1 takes place of “None”.
- revives - Number of times this player revived teammates.
- rideDistance - Total distance traveled in vehicles measured in meters.
- roadKills - Number of kills while in a vehicle.
- swimDistance - Total distance traveled by swimming measured in meters.
- teamKills - Number of times this player killed a teammate.
- vehicleDestroys - Number of vehicles destroyed.
- walkDistance - Total distance traveled on foot measured in meters.
- weaponsAcquired - Number of weapons picked up.
- winPoints - Win-based external ranking of player. (Think of this as an Elo ranking where only winning matters.) If there is a value other than -1 in rankPoints, then any 0 in winPoints should be treated as a “None”.
- groupId - ID to identify a group within a match. If the same group of players plays in different matches, they will have a different groupId each time.
- numGroups - Number of groups we have data for in the match.
- maxPlace - Worst placement we have data for in the match. This may not match with numGroups, as sometimes the data skips over placements.
- winPlacePerc - The target of prediction. This is a percentile winning placement, where 1 corresponds to 1st place, and 0 corresponds to last place in the match. It is calculated off of maxPlace, not numGroups, so it is possible to have missing chunks in a match.

In [None]:
# import all packages and set plots to be embedded inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Apply matplotlib's 'fivethirtyeight' style sheet to the plots that follow.
plt.style.use('fivethirtyeight')

# Gathering Data

In [None]:
# Load the competition training set into the working DataFrame `df`.
df = pd.read_csv(r'/kaggle/input/pubg-finish-placement-prediction/train_V2.csv')

# Assess Data

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# (rows, columns) of the raw training data.
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Re-count missing values per column after dropping the incomplete rows.
df.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Print each column's positional index next to its name.
for position, column_name in enumerate(df.columns):
    print(position, column_name)

# EDA

In [None]:
# Histogram (80 bins, no KDE curve) of the assists column.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot when the environment is upgraded.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.assists, bins=80, kde=False, ax=ax)
plt.show()

***The distribution of assists shows that almost all players have fewer than 5 assists and that the most common value is 0, which indicates that players tend to kill enemies without help from a teammate.***

In [None]:
# Histogram (80 bins, no KDE curve) of the boosts column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.boosts, bins=80, kde=False, color='#0000A0', ax=ax)
plt.show()

***The distribution of boosts shows that almost all players used fewer than 10 boosts and that the most common value is 0, which indicates that players usually did not use boosts.***

In [None]:
# Histogram (80 bins, no KDE curve) of the damageDealt column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.damageDealt, bins=80, kde=False, color='#800080', ax=ax)
plt.show()

***The distribution of damageDealt shows that almost all values are below 1000 and that the most common value is 0, which indicates that many players dealt no damage to enemies.***

In [None]:
# Bucket the knock counts into 0, 1, 2, 3, 4 and 5-or-more, then count the rows
# that fall into each bucket.
g = pd.cut(
    df['DBNOs'],
    [-1, 0, 1, 2, 3, 4, np.inf],
    labels=['0', '1', '2', '3', '4', '+5'],
).value_counts()

# Horizontal bar chart of the bucket counts.
ax = g.plot.barh(color='#007482', fontsize=15)
ax.set(title='The Most Common Number of Knocks')

# On a horizontal bar chart the buckets sit on the y-axis, so label that axis.
ax.set_ylabel('Number of knocks', color='g', fontsize='18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that the most common number of knocks is 0 and that only a few players have 5 or more.***

In [None]:
# Bucket the headshot kills into 0, 1, 2, 3, 4 and 5-or-more, then count the
# rows that fall into each bucket.
g = pd.cut(df['headshotKills'],[-1,0,1,2,3,4,np.inf],labels=['0','1','2','3','4','+5']).value_counts()

# Vertical bar chart of the bucket counts.
ax = g.plot.bar(color = '#800080', fontsize = 15)

ax.set(title = 'The Most Common Number of Headshots')

# On a vertical bar chart the buckets are on the x-axis and the counts on the
# y-axis, so the bucket label belongs on x (it was previously put on y).
ax.set_xlabel('Number of Headshots', color = 'g', fontsize = '18')
ax.set_ylabel('Count', color = 'g', fontsize = '18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that the most common number of headshot kills is 0 and that almost no players have more than 4, which indicates that players rarely kill enemies with headshots.***

In [None]:
# Bucket the heal counts into 0, 1, 2, 3, 4 and 5-or-more, then count the rows
# that fall into each bucket.
g = pd.cut(df['heals'],[-1,0,1,2,3,4,np.inf],labels=['0','1','2','3','4','+5']).value_counts()

# Vertical bar chart of the bucket counts.
ax = g.plot.bar(color = '#FF00FF', fontsize = 15)

ax.set(title = 'The Most Common Number of Heals')

# On a vertical bar chart the buckets are on the x-axis and the counts on the
# y-axis, so the bucket label belongs on x (it was previously put on y).
ax.set_xlabel('Number of Heals', color = 'g', fontsize = '18')
ax.set_ylabel('Count', color = 'g', fontsize = '18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that the most common number of heals is zero.***

In [None]:
# Bucket killPlace into 1, 2-3, 4-6, 7-10 and above 10, then count the rows
# that fall into each bucket.
g = pd.cut(
    df['killPlace'],
    [-1, 1, 3, 6, 10, np.inf],
    labels=['1', '2-3', '4-6', '7-10', '+10'],
).value_counts()

# Horizontal bar chart of the bucket counts.
ax = g.plot.barh(color='#808080', fontsize=15)
ax.set(title='The Most Common Number of killplace')

# On a horizontal bar chart the buckets sit on the y-axis, so label that axis.
ax.set_ylabel('Number of killplace', color='g', fontsize='18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that most killPlace values are above 10.***

In [None]:
# Histogram (80 bins, no KDE curve) of the killPoints column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.killPoints, bins=80, kde=False, color='#FF00FF', ax=ax)
plt.show()

***The plot shows that almost all killPoints values are zero and that a few lie between 1000 and 1500.***

In [None]:
# Bucket the kill counts into 0-1, 2-3, 4-6, 7-10 and above 10, then count the
# rows that fall into each bucket.
# pd.cut intervals are right-closed, so with a lowest edge of 0 the value 0 is
# outside the first interval; include_lowest=True makes the first bucket cover
# 0 as its '0-1' label says, instead of silently dropping every 0-kill row.
g = pd.cut(
    df['kills'],
    [0, 1, 3, 6, 10, np.inf],
    labels=['0-1', '2-3', '4-6', '7-10', '+10'],
    include_lowest=True,
).value_counts()

# Horizontal bar chart of the bucket counts.
ax = g.plot.barh(color = '#FFA500', fontsize = 15)

ax.set(title = 'The Most Common Number of Kills')

# On a horizontal bar chart the buckets sit on the y-axis, so label that axis.
ax.set_ylabel('Number of Kills', color = 'g', fontsize = '18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that most kill counts fall in the 0-1 and 2-3 buckets.***

In [None]:
# Histogram (80 bins, no KDE curve) of the matchDuration column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.matchDuration, bins=80, kde=False, color='#808000', ax=ax)
plt.show()

***The distribution shows that most matches lasted between 1300 s and 1450 s, or between 1750 s and 2000 s.***

In [None]:
# Share of rows per match type, drawn as an exploded pie chart.
# The counts are computed once and reused for both the wedges and the labels.
match_type_counts = df.matchType.value_counts()
label = match_type_counts.index

plt.figure(figsize=(22,10))
plt.pie(match_type_counts,explode=[0.1]*len(label),labels=label,autopct='%.1f%%',shadow=True)
plt.axis('equal')
# The chart shows matchType, so title it accordingly (it previously read 'User Type').
plt.title('Match Type')
plt.show()

***The plot shows that most players play in squad or duo modes, while few play solo.***

In [None]:
# Histogram (80 bins, no KDE curve) of the rankPoints column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.rankPoints, bins=80, kde=False, color='#000080', ax=ax)
plt.show()

***The plot shows that most rankPoints values are 0 and that many others lie between 1500 and 1700.***

In [None]:
# Bucket the revive counts into 0, 1, 2-3, 4-6, 7-10 and above 10, then count
# the rows that fall into each bucket. The (1, 3] bucket is labelled '2-3'
# (it previously carried the typo '2-30').
g = pd.cut(df['revives'],[-1,0,1,3,6,10,np.inf],labels=['0','1','2-3','4-6','7-10','+10']).value_counts()

# Horizontal bar chart of the bucket counts.
ax = g.plot.barh(color = '#00806A', fontsize = 15)

ax.set(title = 'The Most Common Number of Revives')

# On a horizontal bar chart the buckets sit on the y-axis, so label that axis.
ax.set_ylabel('Number of Revives', color = 'g', fontsize = '18')

# Figure size is given as (width, height) in inches.
ax.figure.set_size_inches(22, 12)

plt.show()

***The plot shows that most players never revived a teammate (revives counts the teammates a player revived), which suggests that knocked players were usually killed rather than revived.***

In [None]:
# Histogram (80 bins, no KDE curve) of the rideDistance column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.rideDistance, bins=80, kde=False, color='#00806A', ax=ax)
plt.show()

***The plot shows that most players did not travel far in vehicles and tended to walk instead.***

In [None]:
# Histogram (80 bins, no KDE curve) of the swimDistance column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.swimDistance, bins=80, kde=False, color='#158000', ax=ax)
plt.show()

***The plot shows that almost all players swam very little and tended to walk instead.***

In [None]:
# Histogram (80 bins, no KDE curve) of the walkDistance column.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.walkDistance, bins=80, kde=False, color='#006A80', ax=ax)
plt.show()

***The plot shows that most players did not walk very far, which may be because they were killed soon after being knocked down.***

In [None]:
# Histogram (80 bins, no KDE curve) of the target column winPlacePerc.
fig, ax = plt.subplots(figsize=(22, 10))
sns.distplot(df.winPlacePerc, bins=80, kde=False, color='#3CA6BC', ax=ax)
plt.show()

***The plot shows that many players finished the game in the last places, while few finished in first place.***

In [None]:
# Scatter plot of kills (y) against the target winPlacePerc (x).
fig, ax = plt.subplots(figsize=(22, 10))
sns.scatterplot(x='winPlacePerc', y='kills', data=df, ax=ax)
plt.show()

***The plot shows that players who finished in higher places killed more enemies than others, which is expected.***

In [None]:
# Scatter plot of walkDistance (y) against the target winPlacePerc (x).
fig, ax = plt.subplots(figsize=(22, 10))
sns.scatterplot(x='winPlacePerc', y='walkDistance', data=df, ax=ax)
plt.show()

***The plot shows that players who finished in higher places walked farther than others, which is expected.***

In [None]:
# Scatter plot of boosts (y) against the target winPlacePerc (x).
fig, ax = plt.subplots(figsize=(22, 10))
sns.scatterplot(data=df, x="winPlacePerc", y="boosts", ax=ax)
plt.show()

***The plot shows that players who finished in higher places used more boosts than others, which is expected.***

In [None]:
# Point plot of winPlacePerc (y) for each vehicleDestroys value (x), with the
# axis labels and title set on the Axes object.
fig, ax = plt.subplots(figsize=(20, 10))
sns.pointplot(x='vehicleDestroys', y='winPlacePerc', data=df, color='#606060', alpha=0.8, ax=ax)
ax.set_xlabel('Number of Vehicle Destroys', fontsize=15, color='blue')
ax.set_ylabel('Win Percentage', fontsize=15, color='blue')
ax.set_title('Vehicle Destroys/ Win Ratio', fontsize=20, color='blue')
plt.show()

***The plot shows that players who finished in higher places destroyed more vehicles than other players.***

In [None]:
# Encode matchType as integer category codes so it can take part in the
# correlation matrix and in the numeric steps further down.
df['matchType'] = df['matchType'].astype('category').cat.codes

# Correlate only the numeric columns: Id, groupId and matchId are still string
# columns at this point, and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 (older versions silently skipped them).
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(22, 15))
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt= '.1f')
plt.show()

In [None]:
# Drop the string identifier columns ('Id', 'groupId', 'matchId') together with
# the columns that had low correlation with winPlacePerc
# ('rankPoints', 'roadKills', 'vehicleDestroys').
df = df.drop(columns=['Id','groupId','matchId','rankPoints','roadKills','vehicleDestroys'])

In [None]:
# Drop extreme rows column by column. The fences are built from the 1st and
# 99th percentiles (not the quartiles), widened by 1.5x the spread between
# them. Each column's fences are computed on the rows that survived the
# filtering of the previous columns.
for col in df.columns:
    values = df[col]
    low_pct, high_pct = values.quantile(0.01), values.quantile(0.99)
    spread = high_pct - low_pct
    lower_fence = low_pct - 1.5 * spread
    upper_fence = high_pct + 1.5 * spread
    # between() is inclusive on both ends, matching the original >= / <= test.
    df = df[values.between(lower_fence, upper_fence)]

In [None]:
# (rows, columns) left after the outlier rows were removed.
df.shape

In [None]:
# Separate the target (winPlacePerc) from the feature columns.
y = df['winPlacePerc']
X = df.drop(columns=['winPlacePerc'])

## Feature Selection using SelectKBest with f_regression as the score function

In [None]:
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [None]:
# Score every feature against the target with the univariate f_regression test;
# k='all' keeps all features so the scores can be ranked afterwards.
# NOTE(review): the scores are computed on all rows, before the train/test split
# performed later in the notebook, so held-out rows influence the selection.
best_feature = SelectKBest(f_regression, k='all')
fit = best_feature.fit(X, y)

In [None]:
# Pair each feature name with its f_regression score and rank from the highest
# score to the lowest.
featureScores = (
    pd.DataFrame({'Feature': X.columns, 'Score': fit.scores_})
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
)

featureScores

In [None]:
# Keep only the 15 highest-scoring features, in ranking order.
top_features = featureScores['Feature'].head(15).tolist()
X = X[top_features]

## Scaling the data using StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features and rebuild a DataFrame with the same
# column names. `cols` and `scaler` are reused when the test file is scored.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook.
# The rebuilt frame gets a fresh 0..n-1 index, while `y` keeps the index of the
# filtered DataFrame.
cols = X.columns
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=cols)

## Split the data into train set and test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score

## Try Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares regression with default settings.
reg = LinearRegression()

In [None]:
# 3-fold cross-validation on the training split, scored by negative MSE.
model_reg = cross_val_score(reg,X_train,y_train,cv=3,scoring='neg_mean_squared_error')

In [None]:
# Flip the sign so each fold's score reads as a positive MSE.
-model_reg

## Tuning Linear Regression Parameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for LinearRegression.
# `normalize` is no longer a LinearRegression parameter (removed in scikit-learn
# 1.2), so including it makes the grid search fail. The features are already
# standardised with StandardScaler, so only the intercept option is searched.
param_grid={'fit_intercept':[True,False]}

In [None]:
# Exhaustive search over param_grid with 3-fold CV, scored by negative MSE.
grid= GridSearchCV(reg,param_grid,cv=3,scoring='neg_mean_squared_error')

In [None]:
# Run the grid search on the training split.
grid.fit(X_train,y_train)

In [None]:
# Estimator refitted with the best parameter combination found.
grid.best_estimator_

In [None]:
# Best mean cross-validated score, sign-flipped to read as a positive MSE.
-grid.best_score_

## Try Lasso Model

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso (L1-regularised) regression with scikit-learn's default settings.
lasso=Lasso()

In [None]:
# 3-fold cross-validation on the training split, scored by negative MSE.
model_lasso = cross_val_score(lasso,X_train,y_train,cv=3,scoring='neg_mean_squared_error')

In [None]:
# Flip the sign so each fold's score reads as a positive MSE.
-model_lasso

## Try Elastic Net Model

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic-net regression with scikit-learn's default settings.
elastic=ElasticNet()

In [None]:
# 3-fold cross-validation on the training split, scored by negative MSE.
model_elastic = cross_val_score(elastic,X_train,y_train,cv=3,scoring='neg_mean_squared_error')

In [None]:
# Flip the sign so each fold's score reads as a positive MSE.
-model_elastic

## Try Decision Tree Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision-tree regressor with default depth settings. The tree is seeded
# because scikit-learn permutes the features at each split, so an unseeded tree
# can differ between runs when several splits tie; 42 matches the seed already
# used for train_test_split.
tree = DecisionTreeRegressor(random_state=42)

In [None]:
# 3-fold cross-validation on the training split, scored by negative MSE.
model_tree = cross_val_score(tree,X_train,y_train,cv=3,scoring='neg_mean_squared_error')

In [None]:
# Flip the sign so each fold's score reads as a positive MSE.
- model_tree

## Finally using VotingRegressor Model

In [None]:
from sklearn.ensemble import VotingRegressor

In [None]:
# Members of the voting ensemble.
# `normalize` is no longer accepted by LinearRegression (removed in scikit-learn
# 1.2) and the features are already standardised with StandardScaler, so the
# flag is dropped; the remaining arguments are kept as they were.
reg=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None)
# Seed the tree so repeated runs build the same ensemble.
tree=DecisionTreeRegressor(random_state=42)
regressor=[('Linear Regression', reg), ('decision Tree', tree)]
# A random forest was left out of the vote because training it took too long.

In [None]:
# Ensemble that averages the predictions of the listed regressors.
vc = VotingRegressor(estimators=regressor)

In [None]:
# Fit every member of the ensemble on the training split.
vc.fit(X_train,y_train)

In [None]:
# Predict the held-out 10% split.
y_pred = vc.predict(X_test)

In [None]:
from sklearn.metrics import r2_score,mean_squared_error

In [None]:
# Mean squared error on the held-out split.
mean_squared_error(y_test,y_pred)

In [None]:
# R^2 (coefficient of determination) on the held-out split.
r2_score(y_test,y_pred)

# Predict Testing Data

In [None]:
# Load the competition test set (it has no winPlacePerc column yet; one is added below).
test = pd.read_csv(r'/kaggle/input/pubg-finish-placement-prediction/test_V2.csv')

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# Work on a copy so the raw test frame stays untouched.
test_pred=test.copy()

In [None]:
# Keep the same feature columns, in the same order, that the model was trained on.
# NOTE(review): only the training frame had matchType converted to category
# codes; if 'matchType' is among the selected features it is still a raw string
# here and the scaling step below will fail -- confirm against featureScores.
test_pred = test_pred[X.columns]

In [None]:
# Reuse the scaler fitted on the training features. Calling fit_transform here
# would re-estimate the mean and variance from the test rows, so the test
# features would no longer be on the scale the model was trained on.
test_pred=scaler.transform(test_pred)
test_pred=pd.DataFrame(test_pred,columns=cols)

In [None]:
# Predict winPlacePerc for every row of the scaled test features.
prediction = vc.predict(test_pred)

In [None]:
# winPlacePerc is a percentile between 0 and 1, but the linear member of the
# ensemble can extrapolate outside that range, so clamp the predictions before
# storing them on the test frame.
test['winPlacePerc'] = np.clip(prediction, 0, 1)

In [None]:
# Load the sample submission file to use as the output template.
sub = pd.read_csv(r'/kaggle/input/pubg-finish-placement-prediction/sample_submission_V2.csv')

In [None]:
# Copy the predictions into the submission frame. The assignment aligns on the
# row index, and both frames carry the default index from read_csv.
# NOTE(review): assumes sample_submission_V2.csv lists rows in the same order as
# test_V2.csv -- TODO confirm, or align on the Id column instead.
sub['winPlacePerc'] = test['winPlacePerc']

In [None]:
# Display the filled-in submission frame.
sub

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv',index=False)