In [1]:
# Model prediction using LogisticRegression with upsampling and using liblinear with predict proba

# Import all necessary packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [2]:
sample = pd.read_csv("sample_submission.csv")
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
#target = train.pop("TARGET_5Yrs")
scaler = StandardScaler()
print(scaler.fit_transform(train), end="\n\n----\n\n")
print(scaler.mean_)

[[ 0.93005882 -1.73183431  1.00610018 ... -0.05507101  0.47321012
   0.44674394]
 [-0.38091406 -1.7314013   0.71400493 ...  0.43214835  0.1966711
   0.44674394]
 [-0.28687803 -1.73096829  1.29819543 ... -0.05507101 -0.90948499
   0.44674394]
 ...
 [-0.33188458  1.73096829  1.29819543 ... -0.05507101  0.74974914
   0.44674394]
 [-1.39921865  1.7314013  -1.38908087 ...  0.06673383 -1.0477545
   0.44674394]
 [-0.99491401  1.73183431 -0.80489037 ... -0.66409522  0.1966711
   0.44674394]]

----

[6.85697100e+03 7.79850000e+03 6.27778750e+01 1.85766625e+01
 7.26708750e+00 2.80703750e+00 6.23121250e+00 4.46089000e+01
 2.64525000e-01 8.16562500e-01 1.95837000e+01 1.39252500e+00
 1.94778750e+00 7.13658250e+01 1.07783750e+00 2.16850000e+00
 3.24530000e+00 1.62451250e+00 6.48687500e-01 2.45212500e-01
 1.25776250e+00 8.33625000e-01]


In [3]:
train.isnull().sum()

Id_old         0
Id             0
GP             0
MIN            0
PTS            0
FGM            0
FGA            0
FG%            0
3P Made        0
3PA            0
3P%            0
FTM            0
FTA            0
FT%            0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
TARGET_5Yrs    0
dtype: int64

In [4]:
test.isnull().sum()

Id_old     0
Id         0
GP         0
MIN        0
PTS        0
FGM        0
FGA        0
FG%        0
3P Made    0
3PA        0
3P%        0
FTM        0
FTA        0
FT%        0
OREB       0
DREB       0
REB        0
AST        0
STL        0
BLK        0
TOV        0
dtype: int64

In [5]:
# Prove that the target variable is imbalance - 83% is "1" with 6669 occurences out of 8000 entries
print(train.TARGET_5Yrs.describe(),end="\n\n-------\n\n")
print(train.TARGET_5Yrs.value_counts())

count    8000.000000
mean        0.833625
std         0.372440
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: TARGET_5Yrs, dtype: float64

-------

1    6669
0    1331
Name: TARGET_5Yrs, dtype: int64


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 22 columns):
Id_old         8000 non-null int64
Id             8000 non-null int64
GP             8000 non-null int64
MIN            8000 non-null float64
PTS            8000 non-null float64
FGM            8000 non-null float64
FGA            8000 non-null float64
FG%            8000 non-null float64
3P Made        8000 non-null float64
3PA            8000 non-null float64
3P%            8000 non-null float64
FTM            8000 non-null float64
FTA            8000 non-null float64
FT%            8000 non-null float64
OREB           8000 non-null float64
DREB           8000 non-null float64
REB            8000 non-null float64
AST            8000 non-null float64
STL            8000 non-null float64
BLK            8000 non-null float64
TOV            8000 non-null float64
TARGET_5Yrs    8000 non-null int64
dtypes: float64(18), int64(4)
memory usage: 1.3 MB


In [7]:
from sklearn.utils import resample

# separate 1s and 0s

fiveyears = train[train.TARGET_5Yrs==1]
lessyears = train[train.TARGET_5Yrs==0]

# upsampling minority
lessyears_upsampled = resample(lessyears, replace=True, n_samples=len(fiveyears), random_state=123)

# combine fiveyears and lessyears_unsampled
up_sampling=pd.concat([fiveyears, lessyears_upsampled])

In [8]:
# check new class counts
up_sampling.TARGET_5Yrs.value_counts()

1    6669
0    6669
Name: TARGET_5Yrs, dtype: int64

In [9]:
# Create a target dataframe
target = up_sampling.pop("TARGET_5Yrs")

In [10]:
X_train, X_val, y_train, y_val = train_test_split(up_sampling, target, test_size=0.2, random_state=8)

In [11]:
# Instantiate LogisticRegression Class into reg
reg = LogisticRegression(solver='liblinear')

# fitting
reg.fit(X_train,y_train)

LogisticRegression(solver='liblinear')

In [33]:
# predictions probability
y_train_preds = reg.predict_proba(X_train)[:,1]
y_val_preds = reg.predict_proba(X_val)[:,1]

In [34]:
#RMSE and MAE scores for this model on training set and validation set
print("Train RMSE - " + str(mse(y_train, y_train_preds, squared=False)))
print("Train MAE - " + str(mae(y_train, y_train_preds)))
print("Val RMSE - " + str(mse(y_val, y_val_preds, squared=False)))
print("Val MAE - " + str(mae(y_val, y_val_preds)))

# AUROC scores
print('Train AUROC score:',roc_auc_score(y_train,y_train_preds))
print('Validation AUROC score:', roc_auc_score(y_val,y_val_preds))

Train RMSE - 0.4656553764838133
Train MAE - 0.43396152580674635
Val RMSE - 0.4665964404250694
Val MAE - 0.4345550671733113
Train AUROC score: 0.7066710289404331
Validation AUROC score: 0.7047796299293305


In [35]:
# prepare submission
y_test_preds = reg.predict_proba(test)[:,1]

In [37]:
# check if there is out of binary values
print(list(y_test_preds[y_test_preds > 1]))
print(list(y_test_preds[y_test_preds < 0]))

[]
[]


In [38]:
submission = pd.DataFrame({'Id':test['Id'],'TARGET_5Yrs':y_test_preds})
submission.to_csv('submission_week_2_9.csv',index=False)

In [43]:
from joblib import dump  

dump(reg,  'LogisticRegression_upsampling_liblinear_predict_proba_1.joblib')

['LogisticRegression_upsampling_liblinear_predict_proba_1.joblib']

In [40]:
# use predict proba 0
# predictions probability
y_train_preds = reg.predict_proba(X_train)[:,0]
y_val_preds = reg.predict_proba(X_val)[:,0]

#RMSE and MAE scores for this model on training set and validation set
print("Train RMSE - " + str(mse(y_train, y_train_preds, squared=False)))
print("Train MAE - " + str(mae(y_train, y_train_preds)))
print("Val RMSE - " + str(mse(y_val, y_val_preds, squared=False)))
print("Val MAE - " + str(mae(y_val, y_val_preds)))

# AUROC scores
print('Train AUROC score:',roc_auc_score(y_train,y_train_preds))
print('Validation AUROC score:', roc_auc_score(y_val,y_val_preds))

# prepare submission
y_test_preds = reg.predict_proba(test)[:,0]

# check if there is out of binary values
print(list(y_test_preds[y_test_preds > 1]))
print(list(y_test_preds[y_test_preds < 0]))

Train RMSE - 0.590687631523455
Train MAE - 0.5660384741932536
Val RMSE - 0.5904253584245199
Val MAE - 0.5654449328266887
Train AUROC score: 0.293328971059567
Validation AUROC score: 0.2952203700706695
[]
[]


In [41]:
submission = pd.DataFrame({'Id':test['Id'],'TARGET_5Yrs':y_test_preds})
submission.to_csv('submission_week_2_10.csv',index=False)

In [42]:
from joblib import dump  

dump(reg,  'LogisticRegression_upsampling_liblinear_predict_proba_0.joblib')

['LogisticRegression_upsampling_liblinear_predict_proba_0.joblib']