## Import packages and set up notebook

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [2]:
# Show up to 50 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 50)

### Load NBA dataset to df

In [3]:
# Read the raw training CSV (includes the TARGET_5Yrs column).
df_tr = pd.read_csv(filepath_or_buffer='../data/raw/2022_train.csv')
# Read the raw test CSV (same feature columns, no target column).
df_te = pd.read_csv(filepath_or_buffer='../data/raw/2022_test.csv')

### Explore raw data

In [4]:
# Preview the first five rows of the training set.
df_tr.head(n=5)

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [5]:
# Preview the first five rows of the test set.
df_te.head(n=5)

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,7.3,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,35.1,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,0.0,1.8
2,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,44.8,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,13.5,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,38.7,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9


In [6]:
# (rows, columns) of the training set, then of the test set, one per line.
print(df_tr.shape, df_te.shape, sep='\n')

(8000, 21)
(3799, 20)


In [7]:
# Column dtypes and non-null counts for each split, training set first.
for split_frame in (df_tr, df_te):
    split_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV   

In [8]:
# Summary statistics for the numeric training columns (quartiles spelled out;
# these are the pandas defaults).
df_tr.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,7798.5,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,19.5837,1.392525,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,2309.54541,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,16.003155,0.926153,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,-38.5,0.0,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,8.4,0.7,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,19.5,1.2,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,30.6,1.9,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,82.1,8.1,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


In [9]:
# Summary statistics for the numeric test columns (quartiles spelled out;
# these are the pandas defaults).
df_te.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
count,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0
mean,1899.0,62.853909,18.650224,7.328034,2.835404,6.30258,44.599079,0.255962,0.79692,19.234746,1.399842,1.953567,71.612924,1.096025,2.179495,3.275783,1.636483,0.653593,0.257726,1.25791
std,1096.821164,17.15174,8.727259,4.294724,1.688427,3.579221,6.040168,0.380987,1.052862,15.968989,0.92614,1.250376,10.457336,0.785678,1.371935,2.070646,1.335496,0.410573,0.63966,0.712449
min,0.0,6.0,3.7,0.7,0.3,0.8,25.1,-1.0,-2.7,-38.0,0.0,0.0,23.7,0.0,0.2,0.3,0.0,0.0,-7.1,0.1
25%,949.5,51.0,12.2,4.2,1.6,3.7,40.5,0.0,0.1,8.5,0.7,1.0,65.0,0.5,1.2,1.8,0.6,0.4,0.1,0.7
50%,1899.0,63.0,17.0,6.4,2.5,5.5,44.6,0.3,0.8,19.4,1.2,1.7,71.5,0.9,1.9,2.8,1.3,0.6,0.2,1.1
75%,2848.5,74.0,23.3,9.4,3.7,8.1,48.5,0.5,1.5,30.25,1.9,2.6,78.0,1.5,2.9,4.3,2.3,0.9,0.4,1.6
max,3798.0,126.0,68.0,33.0,13.4,26.2,74.6,1.6,4.3,73.8,7.8,9.8,127.1,6.9,12.0,18.5,9.0,2.7,14.8,5.2


In [10]:
# Count the rows in each TARGET_5Yrs class to gauge how imbalanced the target is.
rows_by_target = df_tr.groupby('TARGET_5Yrs')
rows_by_target.size()

TARGET_5Yrs
0    1331
1    6669
dtype: int64

## Prepare Data - notes from eda
1. Drop Id column from both train and test data set
2. Pop out TARGET_5Yrs column from training set
3. Fit scaling to training data and transform both training and testing data with the same scaling parameters <br>
    a. StandardScaler.fit(df_tr_cleaned)<br>
    b. StandardScaler.transform(df_tr_cleaned)<br>
    c. StandardScaler.transform(df_te_cleaned)<br>
4. Resample the training data (upsample, downsample, and SMOTE — synthetic sampling, see lec2 slide 36) and train the model on the resampled sets.