In [12]:
# Model prediction using LogisticRegressionCV with SMOTE and using liblinear with predict proba

# Import all necessary packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

In [13]:
# Loading the files as DataFrame and standardise
sample = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
target = train.pop("TARGET_5Yrs")
scaler = StandardScaler()
print(scaler.fit_transform(train), end="\n\n----\n\n")
print(scaler.mean_)

[[ 0.93005882 -1.73183431  1.00610018 ...  1.1072419  -0.05507101
   0.47321012]
 [-0.38091406 -1.7314013   0.71400493 ... -0.36478721  0.43214835
   0.1966711 ]
 [-0.28687803 -1.73096829  1.29819543 ... -0.6101254  -0.05507101
  -0.90948499]
 ...
 [-0.33188458  1.73096829  1.29819543 ...  1.35258008 -0.05507101
   0.74974914]
 [-1.39921865  1.7314013  -1.38908087 ... -0.85546358  0.06673383
  -1.0477545 ]
 [-0.99491401  1.73183431 -0.80489037 ...  0.61656553 -0.66409522
   0.1966711 ]]

----

[6.85697100e+03 7.79850000e+03 6.27778750e+01 1.85766625e+01
 7.26708750e+00 2.80703750e+00 6.23121250e+00 4.46089000e+01
 2.64525000e-01 8.16562500e-01 1.95837000e+01 1.39252500e+00
 1.94778750e+00 7.13658250e+01 1.07783750e+00 2.16850000e+00
 3.24530000e+00 1.62451250e+00 6.48687500e-01 2.45212500e-01
 1.25776250e+00]


In [14]:
# check for any null values on train
train.isnull().sum()

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64

In [15]:
# check for any null values on test
test.isnull().sum()

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64

In [16]:
# Prove that the target variable is imbalance - 83% is "1" with 6669 occurences out of 8000 entries
print(target.describe(),end="\n\n-------\n\n")
print(target.value_counts())

count    8000.000000
mean        0.833625
std         0.372440
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: TARGET_5Yrs, dtype: float64

-------

1    6669
0    1331
Name: TARGET_5Yrs, dtype: int64


In [17]:
# resampling using SMOTE
sm = SMOTE(random_state=123)

train_resampled, target_resampled = sm.fit_resample(train, target)

In [25]:
# Check distribution after SMOTE
print(target_resampled.describe(),end="\n\n-------\n\n")
print(target_resampled.value_counts())

count    13338.000000
mean         0.500000
std          0.500019
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          1.000000
Name: TARGET_5Yrs, dtype: float64

-------

1    6669
0    6669
Name: TARGET_5Yrs, dtype: int64


In [29]:
# split train into train and validation
X_train, X_val, y_train, y_val = train_test_split(train_resampled, target_resampled, test_size=0.2, random_state=8)

In [30]:
# Instantiate LogisticRegression Class into reg
reg = LogisticRegressionCV(solver='liblinear')

# fitting
reg.fit(X_train,y_train)

LogisticRegressionCV(solver='liblinear')

In [31]:
# predictions probability
y_train_preds = reg.predict_proba(X_train)[:,1]
y_val_preds = reg.predict_proba(X_val)[:,1]

In [32]:
#RMSE and MAE scores for this model on training set and validation set
print("Train RMSE - " + str(mse(y_train, y_train_preds, squared=False)))
print("Train MAE - " + str(mae(y_train, y_train_preds)))
print("Val RMSE - " + str(mse(y_val, y_val_preds, squared=False)))
print("Val MAE - " + str(mae(y_val, y_val_preds)))

# AUROC scores
print('Train AUROC score:',roc_auc_score(y_train,y_train_preds))
print('Validation AUROC score:', roc_auc_score(y_val,y_val_preds))

Train RMSE - 0.4617055382037367
Train MAE - 0.4277028840731858
Val RMSE - 0.45500206381688113
Val MAE - 0.42129689798222114
Train AUROC score: 0.716734651629199
Validation AUROC score: 0.7358329229378471


In [33]:
# prepare submission
y_test_preds = reg.predict_proba(test)[:,1]

In [34]:
# check if there is out of binary values
print(list(y_test_preds[y_test_preds > 1]))
print(list(y_test_preds[y_test_preds < 0]))

[]
[]


In [35]:
#printout prediction into a file
submission = pd.DataFrame({'Id':test['Id'],'TARGET_5Yrs':y_test_preds})
submission.to_csv('submission_week_2_14.csv',index=False)

In [36]:
# save model
from joblib import dump  

dump(reg,  'LogisticRegressionCV_SMOTE_liblinear_predict_proba_1.joblib')

['LogisticRegressionCV_SMOTE_liblinear_predict_proba_1.joblib']