### **Importing the required packages**

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

### **Reading and Exploring the Data**

In [None]:
# Load the dataset from 'nba_final.csv' (relative path, resolved against the current working directory).
data = pd.read_csv('nba_final.csv')

In [None]:
data.head(5)  # preview the first five rows of the dataframe

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,,,42.8,No


In [None]:
data.shape  # (number of rows, number of columns) of the dataframe as loaded

(1408, 45)

In [None]:
data.dtypes # dtype of every column; 'object' columns (e.g. Player.x, Pos1, Tm) are the ones that need encoding before modelling

Unnamed: 0,0
Rk,int64
Player.x,object
Player_ID,object
Pos1,object
Pos2,object
Age,int64
Tm,object
G,int64
GS,int64
MP,float64


##### We check the datatypes to verify that each column has the expected datatype and to identify any object columns that need to be encoded.

In [None]:
data.isna().sum()  # number of missing values in each column

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,1396
Age,0
Tm,0
G,0
GS,0
MP,0


In [None]:
data.isna().sum(axis='columns')  # number of missing values in each row

Unnamed: 0,0
0,6
1,5
2,5
3,3
4,3
...,...
1403,1
1404,1
1405,4
1406,1


In [None]:
# Boolean mask: True for every row that has more than 4 missing values, False otherwise.
# Note that the row filter applied further down uses a threshold of 5, not 4.
data.isna().sum(axis=1).gt(4)

Unnamed: 0,0
0,True
1,True
2,True
3,False
4,False
...,...
1403,False
1404,False
1405,False
1406,False


In [None]:
data.loc[data.isna().sum(axis=1).gt(5)]  # show the rows that have more than 5 missing values

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
16,443,Anderson Varejão,varejan01,C,,34,GSW,14,1,6.6,...,West,Front,1724,40,,,,,42.0,No
27,341,Arinze Onuaku,onuakar01,C,,29,ORL,8,0,3.5,...,Est,Front,216,129,,,,,90.0,No
37,60,Bobby Brown,brownbo02,PG,,32,HOU,25,0,4.9,...,West,Back,206,70,,,,,52.0,No
48,378,Brian Roberts,roberbr01,PG,,31,CHO,41,2,10.1,...,Est,Back,484,91,,,,,61.8,No
51,67,Bruno Caboclo,cabocbr01,SF,,21,TOR,9,0,4.4,...,Est,Front,5185,66,,,,,58.5,No
54,302,C.J. Miles,milescj01,SF,,29,IND,76,29,23.4,...,Est,Front,1965,95,,,,,73.0,No
55,455,C.J. Watson,watsocj01,PG,,32,ORL,62,9,16.3,...,Est,Back,369,92,,,,,62.2,No
56,463,C.J. Wilcox,wilcocj01,SG,,26,ORL,22,0,4.9,...,Est,Back,287,93,,,,,62.8,No
62,109,Cheick Diallo,diallch01,PF,,20,NOP,17,0,11.7,...,West,Front,877,119,,,,,81.5,No


In [None]:
# Keep only the rows that have at most 5 missing values.
# The explicit .copy() makes `data` an independent dataframe rather than a
# filtered selection of the original, so later modifications of `data` do not
# raise SettingWithCopyWarning.
data = data[data.isnull().sum(axis = 1) <= 5].copy()

In [None]:
data.isna().sum()  # re-check the missing values per column after removing the sparse rows

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,1359
Age,0
Tm,0
G,0
GS,0
MP,0


In [None]:
data.shape  # shape after removing the rows with more than 5 missing values (column count is unchanged)

(1371, 45)

In [None]:
# Pos2 is missing in 1359 of the remaining 1371 rows (about 99%), so the column carries
# almost no information and is dropped.
# The result is reassigned instead of using inplace=True: `data` was produced by boolean
# filtering, and mutating such a frame in place triggers SettingWithCopyWarning.
data = data.drop(columns = 'Pos2')

In [None]:
# NOTE(review): this imputes 0 into every remaining missing cell, including rank-like
# columns such as PRank and MRank, where 0 would read as "better than rank 1" — confirm
# that this is the intended meaning for players without votes.
data = data.fillna(0) #fill the rest of missing values in the dataframe with 0

In [None]:
data.duplicated().sum() # number of rows that are exact duplicates (all columns equal) of an earlier row

np.int64(0)

In [None]:
data = data.drop_duplicates()  # remove exact duplicate rows (if any), keeping the first occurrence

In [None]:
# The player name and player ID only identify a row; they are not used as model inputs.
data = data.drop(columns=['Player.x', 'Player_ID'])

#### **Encode the categorical columns**

In [None]:
obj_cols = data.select_dtypes(include='object').columns  # names of all object-dtype columns

In [None]:
obj_cols  # display the object-dtype column names; these are the columns that get label-encoded below

Index(['Pos1', 'Tm', 'Season', 'Conference', 'Role', 'Play'], dtype='object')

In [None]:
# Replace every object column with integer codes.
# A separate LabelEncoder is fitted per column and stored in `label_encoders`, so each
# column's mapping stays available afterwards, e.g.
# label_encoders['Play'].classes_ shows which original label became 0 and which became 1.
# Refitting a single shared encoder would overwrite the mapping on every iteration.
label_encoders = {}

for col in obj_cols:
  le = LabelEncoder()  # after the loop, `le` is the encoder fitted on the last column
  data[col] = le.fit_transform(data[col])
  label_encoders[col] = le

In [None]:
#check for class imbalance: count how many rows fall into each class of the target column 'Play'
data['Play'].value_counts()

Unnamed: 0_level_0,count
Play,Unnamed: 1_level_1
0,1298
1,73


### **Machine Learning Process**

In [None]:
# Separate the target from the predictors.
y = data['Play']               # output: the encoded 'Play' column
X = data.drop('Play', axis=1)  # inputs: every remaining column

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on y keeps the class ratio of the
# imbalanced target the same in both subsets; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)

#### **Mean Centering/Scaling/Standardization of the Data**

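- Always perform this step after splitting the data into training and test sets: fit the scaler on the training set only, then apply the same transformation to the test set, so that no information from the test set leaks into training.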

In [None]:
# Learn each column's mean and standard deviation from the training set only,
# then apply that same transformation to both subsets.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### **Apply the Logistic Regression on the data**

In [None]:
# Fit a logistic regression with default hyperparameters on the standardized training data.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)
log_reg  # display the fitted estimator

In [None]:
log_reg.coef_   #print the coefficients (beta1 to beta41), one weight per input column of X

array([[ 0.55599738,  0.20563577,  0.68021607,  0.10722513,  1.57491189,
         1.07973303, -0.67160553,  0.41627396,  0.04806734,  0.02483851,
         0.58406804,  0.34452455, -0.55838499,  0.10080902, -0.18136329,
        -0.64537331,  0.20904553, -0.19200101,  0.148108  , -0.47414991,
        -0.10029033,  0.2153693 ,  0.23042837,  0.75398891,  0.06944723,
         0.19404955, -0.7295217 , -0.71186912,  0.32398553,  0.28350647,
         0.36893066,  0.05812181,  0.17763094,  0.44381383, -0.20531763,
        -0.87151468,  0.81503037, -0.39114073,  0.88706089,  0.01481096,
        -1.18597038]])

In [None]:
y_pred = log_reg.predict(X_test_scaled)  # hard class predictions (0 or 1) for the scaled test set

In [None]:
accuracy_score(y_test, y_pred)  #accuracy is misleading on imbalanced data: 1298 of the 1371 rows (~95%) are class 0, so always predicting 0 already scores about 0.95

0.96

In [None]:
# ROC AUC measures how well the model ranks positives above negatives, so it must be
# computed from continuous scores (here the predicted probability of class 1), not from
# the thresholded 0/1 predictions. Passing hard labels reduces the ROC curve to a single
# operating point and yields balanced accuracy instead of the actual AUC.
roc_auc_score(y_test, log_reg.predict_proba(X_test_scaled)[:, 1])

np.float64(0.7275641025641026)

### **Now we will transform the data with PCA and check the performance of the model using the PCA transformed data**