In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

#### 1. Reading the data and processing it

In [2]:
data = pd.read_csv('nba_final.csv')

In [3]:
data.head()

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,,,42.8,No


In [4]:
data.shape

(1408, 45)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rk          1408 non-null   int64  
 1   Player.x    1408 non-null   object 
 2   Player_ID   1408 non-null   object 
 3   Pos1        1408 non-null   object 
 4   Pos2        12 non-null     object 
 5   Age         1408 non-null   int64  
 6   Tm          1408 non-null   object 
 7   G           1408 non-null   int64  
 8   GS          1408 non-null   int64  
 9   MP          1408 non-null   float64
 10  FG          1408 non-null   float64
 11  FGA         1408 non-null   float64
 12  FG.         1404 non-null   float64
 13  X3P         1408 non-null   float64
 14  X3PA        1408 non-null   float64
 15  X3P.        1309 non-null   float64
 16  X2P         1408 non-null   float64
 17  X2PA        1408 non-null   float64
 18  X2P.        1393 non-null   float64
 19  eFG.        1404 non-null  

In [7]:
#check the number of missing values for each column
data.isnull().sum()

Rk               0
Player.x         0
Player_ID        0
Pos1             0
Pos2          1396
Age              0
Tm               0
G                0
GS               0
MP               0
FG               0
FGA              0
FG.              4
X3P              0
X3PA             0
X3P.            99
X2P              0
X2PA             0
X2P.            15
eFG.             4
FT               0
FTA              0
FT.             47
ORB              0
DRB              0
TRB              0
AST              0
STL              0
BLK              0
TOV              0
PF               0
PTS              0
Salary          62
mean_views     138
Season           0
Conference       0
Role             0
Fvot             0
FRank            0
Pvot           159
PRank          159
Mvot           404
MRank          404
Score            0
Play             0
dtype: int64

In [8]:
#check for the duplicate rows
data[data.duplicated()]

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play


In [9]:
data['Pos2'] = data['Pos2'].replace(np.nan, 'None')

In [10]:
data = data.fillna(0)

In [11]:
data.isnull().sum()

Rk            0
Player.x      0
Player_ID     0
Pos1          0
Pos2          0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG.           0
X3P           0
X3PA          0
X3P.          0
X2P           0
X2PA          0
X2P.          0
eFG.          0
FT            0
FTA           0
FT.           0
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Salary        0
mean_views    0
Season        0
Conference    0
Role          0
Fvot          0
FRank         0
Pvot          0
PRank         0
Mvot          0
MRank         0
Score         0
Play          0
dtype: int64

In [12]:
data.head()

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,...,West,Front,786,123,0.0,0.0,0.0,0.0,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,...,Est,Back,2474,64,0.0,0.0,0.0,0.0,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,...,Est,Front,22774,29,0.0,0.0,0.0,0.0,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,...,West,Front,861,120,1.0,52.0,0.0,0.0,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,...,West,Front,4971,69,7.0,23.0,0.0,0.0,42.8,No


In [13]:
#drop few columns from the dataset
data.drop(columns = ['Player.x', 'Player_ID'], inplace = True)

In [14]:
#check the columns which are object datatype
data.select_dtypes('object')

Unnamed: 0,Pos1,Pos2,Tm,Season,Conference,Role,Play
0,C,,DAL,2016-17,West,Front,No
1,PG,,IND,2016-17,Est,Back,No
2,SF,,ORL,2016-17,Est,Front,No
3,PF,,MIN,2016-17,West,Front,No
4,PF,,POR,2016-17,West,Front,No
...,...,...,...,...,...,...,...
1403,C,,POR,2018-19,West,Front,No
1404,SG,,CHI,2018-19,Est,Back,No
1405,SG,,DET,2018-19,Est,Back,No
1406,C,,DET,2018-19,Est,Front,No


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
Le = LabelEncoder()

In [17]:
data['Pos1'] = Le.fit_transform(data['Pos1'])
data['Pos2'] = Le.fit_transform(data['Pos2'])
data['Tm'] = Le.fit_transform(data['Tm'])
data['Season'] = Le.fit_transform(data['Season'])
data['Conference'] = Le.fit_transform(data['Conference'])
data['Role'] = Le.fit_transform(data['Role'])
data['Play'] = Le.fit_transform(data['Play'])

In [17]:
data.head()

Unnamed: 0,Rk,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,...,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,0,1,24,6,22,0,7.4,0.8,1.9,...,1,1,786,123,0.0,0.0,0.0,0.0,83.5,0
1,58,2,1,32,11,65,0,13.8,1.9,4.6,...,0,0,2474,64,0.0,0.0,0.0,0.0,48.2,0
2,157,3,1,21,21,80,72,28.7,4.9,10.8,...,0,1,22774,29,0.0,0.0,0.0,0.0,40.0,0
3,352,1,1,25,17,18,0,7.5,1.3,3.0,...,1,1,861,120,1.0,52.0,0.0,0.0,75.5,0
4,10,1,1,26,24,61,25,29.1,3.0,7.6,...,1,1,4971,69,7.0,23.0,0.0,0.0,42.8,0


In [18]:
data['Play'].value_counts()

Play
0    1335
1      73
Name: count, dtype: int64

### Machine Learning Process

In [19]:
X = data.drop(columns = 'Play')
y = data['Play']

#### Scaling the Data

In [20]:
scaler = StandardScaler()

In [21]:
X = scaler.fit_transform(X)

### Apply LDA on the Data

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [24]:
lda = LinearDiscriminantAnalysis()
X = lda.fit_transform(X, y)

#### Splitting the Data

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

### Apply the Logistic Regression

In [27]:
X.shape

(1408, 1)

In [28]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [29]:
y_pred = log_reg.predict(X_test)

In [30]:
roc_auc_score(y_test, y_pred)

0.8962546816479401