In [5]:
# Model prediction using LogisticRegressionCV with SMOTE and using liblinear with predict proba

# Import all necessary packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

In [21]:
# Loading the files as DataFrame and standardise
sample = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
target = train.pop("TARGET_5Yrs")
scaler = StandardScaler()
print(scaler.fit_transform(train), end="\n\n" + "-"*10*9 + "\n\n")
print(scaler.mean_)

[[ 0.93005882 -1.73183431  1.00610018 ...  1.1072419  -0.05507101
   0.47321012]
 [-0.38091406 -1.7314013   0.71400493 ... -0.36478721  0.43214835
   0.1966711 ]
 [-0.28687803 -1.73096829  1.29819543 ... -0.6101254  -0.05507101
  -0.90948499]
 ...
 [-0.33188458  1.73096829  1.29819543 ...  1.35258008 -0.05507101
   0.74974914]
 [-1.39921865  1.7314013  -1.38908087 ... -0.85546358  0.06673383
  -1.0477545 ]
 [-0.99491401  1.73183431 -0.80489037 ...  0.61656553 -0.66409522
   0.1966711 ]]

------------------------------------------------------------------------------------------

[6.85697100e+03 7.79850000e+03 6.27778750e+01 1.85766625e+01
 7.26708750e+00 2.80703750e+00 6.23121250e+00 4.46089000e+01
 2.64525000e-01 8.16562500e-01 1.95837000e+01 1.39252500e+00
 1.94778750e+00 7.13658250e+01 1.07783750e+00 2.16850000e+00
 3.24530000e+00 1.62451250e+00 6.48687500e-01 2.45212500e-01
 1.25776250e+00]


In [7]:
# check for any null values on train and test
print(train.isnull().sum(), end="\n\n" + "-"*10*9 + "\n\n")
print(test.isnull().sum())

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64

------------------------------------------------------------------------------------------

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64


In [8]:
# Prove that the target variable is imbalance - 83% is "1" with 6669 occurences out of 8000 entries
print(target.describe(), end="\n\n" + "-"*10*9 + "\n\n")
print(target.value_counts())

count    8000.000000
mean        0.833625
std         0.372440
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: TARGET_5Yrs, dtype: float64

------------------------------------------------------------------------------------------

1    6669
0    1331
Name: TARGET_5Yrs, dtype: int64


In [20]:
print(train.where(train < 0).count())

Id_old        0
Id            0
GP            2
MIN           0
PTS           0
FGM           0
FGA           0
FG%           0
3P Made    1629
3PA        1658
3P%         878
FTM           0
FTA           0
FT%           1
OREB          0
DREB          0
REB           0
AST           0
STL           0
BLK        1048
TOV           0
dtype: int64


In [10]:
print(test.where(train < 0).count())

Id_old       0
Id           0
GP           0
MIN          0
PTS          0
FGM          0
FGA          0
FG%          0
3P Made    780
3PA        801
3P%        435
FTM          0
FTA          0
FT%          1
OREB         0
DREB         0
REB          0
AST          0
STL          0
BLK        482
TOV          0
dtype: int64


In [26]:
# change negative values to 0 in train and test
train[train < 0] = 0
test[test < 0] = 0

In [27]:
train.head()

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,10556,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,...,2.0,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6
1,5342,3800,75,21.8,10.5,4.2,7.9,55.1,0.0,0.0,...,2.4,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4
2,5716,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,...,0.4,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6
3,13790,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,...,0.9,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9
4,5470,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,...,0.2,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7


In [28]:
# check negative values again
print(train.where(train < 0).count(), end="\n\n" + "-"*10*9 + "\n\n")
print(test.where(train < 0).count())

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64

------------------------------------------------------------------------------------------

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64


In [29]:
# resampling using SMOTE
sm = SMOTE(random_state=123)

train_resampled, target_resampled = sm.fit_resample(train, target)

In [30]:
# Check distribution after SMOTE
print(target_resampled.describe(), end="\n\n" + "-"*10*9 + "\n\n")
print(target_resampled.value_counts())

count    13338.000000
mean         0.500000
std          0.500019
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: TARGET_5Yrs, dtype: float64

------------------------------------------------------------------------------------------

1    6669
0    6669
Name: TARGET_5Yrs, dtype: int64


In [31]:
# split train into train and validation
X_train, X_val, y_train, y_val = train_test_split(train_resampled, target_resampled, test_size=0.2, random_state=8)

In [32]:
# Instantiate LogisticRegression Class into reg
reg = LogisticRegressionCV(solver='liblinear', max_iter=1000, random_state=123)

# fitting
reg.fit(X_train,y_train)

LogisticRegressionCV(max_iter=1000, random_state=123, solver='liblinear')

In [33]:
# predictions probability
y_train_preds = reg.predict_proba(X_train)[:,1]
y_val_preds = reg.predict_proba(X_val)[:,1]

In [34]:
# AUROC scores
print('Train AUROC score:',roc_auc_score(y_train,y_train_preds))
print('Validation AUROC score:', roc_auc_score(y_val,y_val_preds))

Train AUROC score: 0.7153827875405183
Validation AUROC score: 0.7342571167074814


In [35]:
# prepare submission
y_test_preds = reg.predict_proba(test)[:,1]

In [36]:
# check if there is out of binary values
print(list(y_test_preds[y_test_preds > 1]))
print(list(y_test_preds[y_test_preds < 0]))

[]
[]


In [37]:
#printout prediction into a file
submission = pd.DataFrame({'Id':test['Id'],'TARGET_5Yrs':y_test_preds})
submission.to_csv('submission_week_3_2.csv',index=False)

In [38]:
# save model
from joblib import dump  

dump(reg,  'LogisticRegressionCV_negative_values_to_zero_SMOTE_liblinear_predict_proba_1.joblib')

['LogisticRegressionCV_negative_values_to_zero_SMOTE_liblinear_predict_proba_1.joblib']