<a href="https://colab.research.google.com/github/Shubhanshu1902/PUBG-ML-project/blob/main/pubg_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PUBG Project
We have the dataset of different stats of players and we have to predict the rank of the player in the game.

Team Name:- PUBG Specialist

Team members:-
- Shubhanshu Agrawal(IMT2020078)
- Pratham Dandale(IMT2020038)

In [None]:
# Mount Google Drive at /content/drive so the CSV files stored there can be
# read from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Importing required modules

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

import os
# Kaggle starter boilerplate: print the full path of every file found under
# /kaggle/input (os.walk yields nothing when that directory does not exist).
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

Importing dataset

In [None]:
# Load the training data from the mounted Drive folder, then display summary
# statistics for it.
train_df = pd.read_csv("./drive/MyDrive/PUbG/train_up.csv")
train_df.describe()

In [None]:
# Preview the first rows to sanity-check column names and raw values.
train_df.head()

# Pre-Processing the data

### Dealing with missing values

Calculating the missing values in each column

In [None]:
# Number of missing (NaN) entries in each column.
train_df.isna().sum()

There is only 1 missing value, in winPlacePerc.

Checking the row with missing value

In [None]:
# Show every row whose winPlacePerc (the prediction target) is missing.
train_df.loc[train_df['winPlacePerc'].isna(),:]

Dropping the row with the missing value

In [None]:
# Remove, in place, the rows whose winPlacePerc is NaN.
missing_target_rows = train_df.index[train_df["winPlacePerc"].isna()]
train_df.drop(index=missing_target_rows, inplace=True)

In [None]:
# Re-count missing values to confirm that none remain after the drop.
train_df.isna().sum()

### Dealing with duplicated rows

Checking number of duplicated rows

In [None]:
# Count rows that are exact duplicates of an earlier row.
train_df.duplicated().sum()

There are no duplicated rows

### Label Encoding


There is only 1 column which has categorical data,i.e., matchType. We will do label encoding in this column.

GroupID, MatchID and Id have object type. So we can label encode them to make them integer type

In [None]:
labelEncoder = LabelEncoder()
# Encode on a copy: train_df keeps the original matchType strings, which the
# exploratory cells further down still need.
train_copy = train_df.copy()
encoded = labelEncoder.fit_transform(train_copy["matchType"])
# Show the distinct labels and the distinct integer codes. The two arrays are
# not aligned element-wise: unique() keeps order of appearance, np.unique sorts.
train_copy["matchType"].unique(),np.unique(encoded)

In [None]:
# Replace the string labels with their integer codes, then list column dtypes.
train_copy["matchType"] = encoded
train_copy.info()

In [None]:
# Turn the three object-typed identifier columns into integer codes and mirror
# the encoded values into train_df. The shared encoder is re-fitted per column.
for id_column in ("Id", "matchId", "groupId"):
    train_copy[id_column] = labelEncoder.fit_transform(train_copy[id_column])
    train_df[id_column] = train_copy[id_column]

train_copy.info()

### Removing related columns

We will plot a heatmap of all the columns. If two columns have a correlation above 0.8 or below -0.8, we will remove one of them.

In [None]:
# Annotated correlation heatmap over every column of train_copy; the printed
# coefficients are rounded to one decimal place.
f,ax = plt.subplots(figsize=(15, 15))
sns.heatmap(train_copy.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
plt.show()

Relations:-
- killPlace and killStreaks : -0.8
- killPoints and rankPoints : -1.0
- killPoints and winPoints : 1.0
- maxPlace and numGroups : 1.0
- rankPoints and winPoints : -1.0

Removing killStreaks, rankPoints, killPoints, maxPlace

In [None]:
# Drop one column from each strongly correlated pair, then redraw the heatmap
# so the remaining correlations can be inspected.
redundant_columns = ["killStreaks", "rankPoints", "killPoints", "maxPlace"]
train_copy.drop(columns=redundant_columns, inplace=True)

f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data=train_copy.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
plt.show()

### Removing Outliers

Removing outliers from every column

In [None]:
class OutlierRemoval:
    """Clip values to the box-plot whiskers derived from two quartiles.

    The whiskers sit 1.5 * IQR below the lower quartile and 1.5 * IQR above
    the upper quartile. Despite the name, no value is dropped: anything
    outside a whisker is replaced by that whisker.
    """

    def __init__(self, lower_quartile, upper_quartile):
        """Compute and store the lower and upper whisker.

        Parameters
        ----------
        lower_quartile : number
            First quartile (25th percentile) of the column.
        upper_quartile : number
            Third quartile (75th percentile) of the column.
        """
        iqr = upper_quartile - lower_quartile
        self.lower_whisker = lower_quartile - 1.5 * iqr
        self.upper_whisker = upper_quartile + 1.5 * iqr

    def removeOutlier(self, x):
        """Return ``x`` limited to ``[lower_whisker, upper_whisker]``.

        NaN is returned unchanged. Without this guard a NaN fails every
        comparison and would be turned into the upper whisker.
        """
        if x != x:  # only NaN is unequal to itself
            return x
        if x < self.lower_whisker:
            return self.lower_whisker
        if x > self.upper_whisker:
            return self.upper_whisker
        return x

Box plot for all the columns

In [None]:
# Draw one box plot per column of train_copy, each in its own figure.
for column_name in train_copy.columns:
    sns.boxplot(x=train_copy[column_name])
    plt.show()

In [None]:
# List the remaining column names.
train_copy.columns

As we can see, some columns like assists, revives, etc. hold small discrete counts, so removing outliers from them would not be beneficial

In [None]:
# Columns whose values are clipped to their box-plot whiskers.
outliers = [
    'boosts', 'damageDealt', 'DBNOs', 'heals', 'killPlace', 'longestKill',
    'matchDuration', 'matchType', 'numGroups', 'rideDistance',
    'walkDistance', 'weaponsAcquired', 'winPoints',
]

# NOTE(review): the clipped series is only plotted here; it is never written
# back into train_copy, so later cells still see the unclipped values.
# Confirm whether that is intended.
for column_name in outliers:
    column = train_copy[column_name]
    clipper = OutlierRemoval(column.quantile(0.25), column.quantile(0.75))
    clipped = column.apply(clipper.removeOutlier)
    sns.boxplot(x=clipped)
    plt.show()

### Min max Normalization 

Formula:- 
$$x_i' = \frac{x_i - \min(x)}{\max(x) - \min(x)}$$

In [None]:
# Leave the identifier columns out of the modelling frame, then min-max scale
# every remaining column using that column's own minimum and maximum.
train = train_copy.drop(columns=["Id", "matchId", "groupId"])
column_min = train.min()
column_range = train.max() - column_min
train = (train - column_min) / column_range
train.head()

## Exploratory Data Analysis

Checking number of matches in each category

In [None]:
def seperator(i):
    """Collapse a raw matchType label into 'duo', 'solo' or 'squad'."""
    if 'duo' in i or 'crash' in i:
        return 'duo'
    if 'solo' in i:
        return 'solo'
    return 'squad'


fig, ax = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: number of matches per raw matchType (one vote per matchId).
matches_per_raw_type = train_df.groupby("matchId")["matchType"].first().value_counts()
matches_per_raw_type.plot.bar(ax=ax[0])

# Overwrite train_df's matchType with the three grouped labels.
train_df['matchType'] = train_df['matchType'].apply(seperator)

# Right panel: the same count after grouping.
train_df.groupby("matchId")["matchType"].first().value_counts().plot.bar(ax=ax[1])


In [None]:
# train_df's matchType now holds the grouped duo/solo/squad labels; encode
# them as integer codes (LabelEncoder codes start at 0).
match_type_codes = labelEncoder.fit_transform(train_df["matchType"])

# Rescale the codes to [0, 1]. Every other column of `train` is already
# min-max normalised, and fit_csv min-max normalises the test-set matchType
# too. Storing the raw 0/1/2 codes here would put the training and test
# values of this feature on different scales.
train["matchType"] = match_type_codes / max(match_type_codes.max(), 1)
train["matchType"].unique()

We will now explore the relations of different parameters on win position

In [None]:
# Joint distribution of kills against the target winPlacePerc.
sns.jointplot(x="winPlacePerc", y="kills", data=train_df, height=10, ratio=3, color="r")
plt.show()

- We can see most kills are scattered between 0-10
- Some people with more kills have a lower position in the match

In [None]:
# Joint distribution of assists against the target winPlacePerc.
sns.jointplot(x="winPlacePerc", y="assists", data=train_df, height=10, ratio=3, color="b")
plt.show()

- Most assists scattered between 0-5
- Some people with more assists have a lower position in the match

In [None]:
# Joint distribution of walkDistance against the target winPlacePerc.
sns.jointplot(x="winPlacePerc", y="walkDistance", data=train_df, height=10, ratio=3, color="r")
plt.show()

- High rank people have travelled more than low rank people

In [None]:
def fit_csv(model,name):
    """Predict on the test CSV and write the predictions to Drive.

    Reads ``test_up.csv``, drops the columns that were excluded from the
    training frame, groups and encodes ``matchType``, min-max scales the
    features, and saves ``Id`` together with the model's predictions.

    Parameters
    ----------
    model : object
        Already-fitted estimator exposing ``predict``.
    name : str
        File name of the CSV written under ``./drive/MyDrive/PUbG/``.
    """
    test_df = pd.read_csv("./drive/MyDrive/PUbG/test_up.csv")
    answer = test_df["Id"]

    # Same columns removed from the training frame: the highly correlated
    # features and the three identifier columns.
    test_df = test_df.drop(columns=[
        "killStreaks", "rankPoints", "killPoints", "maxPlace",
        "Id", "matchId", "groupId",
    ])

    def _group_match_type(label):
        """Collapse a raw matchType label into 'duo', 'solo' or 'squad'."""
        if 'duo' in label or 'crash' in label:
            return 'duo'
        if 'solo' in label:
            return 'solo'
        return 'squad'

    # Fixed codes in the alphabetical order LabelEncoder would assign. Using a
    # constant mapping avoids re-fitting the shared global encoder on test
    # data and keeps the codes stable even if a category is absent here.
    match_type_codes = {"duo": 0, "solo": 1, "squad": 2}
    test_df["matchType"] = test_df["matchType"].apply(_group_match_type).map(match_type_codes)

    # Min-max scale each feature. A constant column has a zero range, which
    # would give 0/0 = NaN; dividing by 1 instead maps such a column to 0.
    # NOTE(review): this uses the test file's own min/max, while the training
    # frame was scaled with its own. Consider reusing the training statistics
    # so both sets share one scale.
    column_min = test_df.min()
    column_range = (test_df.max() - column_min).replace(0, 1)
    test_df = (test_df - column_min) / column_range
    prediction = model.predict(test_df)

    submission = pd.DataFrame({"Id": answer, "winPlacePerc": prediction})
    submission.to_csv('./drive/MyDrive/PUbG/'+name,index=False)

In [None]:
from sklearn.metrics import mean_squared_error

def fit_model(model,X,Y,output_csv,test_size=0.1,random_state=None):
    """Fit ``model`` on a random split, report hold-out MSE and export predictions.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit`` and ``predict``.
    X, Y : feature matrix and target passed to ``train_test_split``.
    output_csv : str
        File name handed to ``fit_csv`` for the test-set predictions.
    test_size : float, default 0.1
        Fraction of the rows held out for the MSE estimate.
    random_state : int or None, default None
        Seed for the split; pass an int to make the reported MSE reproducible.
    """
    # Imported here so the helper does not rely on a later cell having
    # already imported train_test_split.
    from sklearn.model_selection import train_test_split

    train_x,test_x,train_y,test_y = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    model.fit(train_x,train_y)
    predict_y = model.predict(test_x)
    mse = mean_squared_error(test_y, predict_y)
    print("MEAN SQUARED ERROR :-", mse)
    fit_csv(model,output_csv)
    print("CSV created in drive")

In [None]:
from sklearn.model_selection import train_test_split

# train["Id"] = train_df["Id"]


# Separate the normalised frame into the regression target and the features.
y = train["winPlacePerc"]
x = train.drop(columns=["winPlacePerc"])

# Decision Tree Model

In [None]:
from sklearn import tree

# Decision tree with library-default settings: fit it, print the hold-out MSE
# and export test predictions to decision_tree.csv.
decision_tree = tree.DecisionTreeRegressor()
fit_model(decision_tree,x,y,"decision_tree.csv")

MEAN SQUARED ERROR :- 0.013886201662685713
CSV created in drive


# Random Forest Regression model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with trees limited to depth 6 and a fixed seed for the forest.
random_forest = RandomForestRegressor(max_depth=6, random_state=2)
fit_model(random_forest,x,y,"random_forest.csv")

# Gradient Boosting Regression

In [None]:
from sklearn import ensemble

# Keyword arguments forwarded unchanged to GradientBoostingRegressor.
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

# Fit the boosted ensemble, print the hold-out MSE and export predictions.
gbr = ensemble.GradientBoostingRegressor(**params)
fit_model(gbr,x,y,"gradient_boosting_regressor.csv")

# Elastic Net Regression

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic net with the library-default regularisation settings and a fixed seed.
elasticNet = ElasticNet(random_state=0)
fit_model(elasticNet,x,y,"elastic_net.csv") 

# Stochastic Gradient Descent Regression

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit a linear model by stochastic gradient
# descent; both steps live in one pipeline so the scaler is fitted with it.
sgdregressor = make_pipeline(
    StandardScaler(),
    SGDRegressor(max_iter=1000000, tol=1e-3),
)
reg = sgdregressor  # second name bound to the same pipeline object
fit_model(sgdregressor, x, y, "sgdRegressor.csv")

# Support Vector Machine

In [None]:
from sklearn.svm import SVR

# Support vector regression behind a StandardScaler, wrapped in one pipeline.
svm = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))

# Train and export the SVR pipeline itself. Passing sgdregressor here would
# just re-run the previous section's model under a different file name.
# NOTE(review): scikit-learn documents SVR fit time as more than quadratic in
# the number of samples; this may be impractical on the full training frame.
fit_model(svm,x,y,"svm.csv")

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with library-default settings.
linReg = LinearRegression()
fit_model(linReg,x,y,"linear_reg.csv")

# XG Boost Regressor

In [None]:
import xgboost

In [None]:
from xgboost import XGBRegressor

# Keyword arguments forwarded to XGBRegressor.
# - "eta" is XGBoost's alias for "learning_rate"; giving both with different
#   values (0.1 and 0.01) is ambiguous, so only "learning_rate" is kept.
#   NOTE(review): confirm 0.01 is the intended step size rather than 0.1.
# - "loss" is not an XGBoost parameter; the squared-error objective is
#   selected with objective="reg:squarederror".
params = {
    "n_estimators":1000,
    "max_depth":7,
    "subsample":0.7,
    "colsample_bytree":0.8,
    "learning_rate": 0.01,
    "objective": "reg:squarederror",
}

# Fit the booster, print the hold-out MSE and export test predictions.
xgBoost = XGBRegressor(**params)
fit_model(xgBoost,x,y,"xgboost.csv")