In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
train = pd.read_csv("../data/train.csv").sample(frac=1.0)
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

f_cols = [col for col in train.columns if col not in ["id", "target"]]

In [3]:
def get_predictions(x):
    return [1 if xi >= 0.5 else 0 for xi in x]

In [4]:
X, y = train[f_cols].values, train["target"].values

In [5]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7685416666666666
              precision    recall  f1-score   support

         0.0       0.63      0.52      0.57        90
         1.0       0.75      0.82      0.79       160

    accuracy                           0.72       250
   macro avg       0.69      0.67      0.68       250
weighted avg       0.71      0.72      0.71       250



In [6]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7690277777777778
              precision    recall  f1-score   support

         0.0       0.60      0.56      0.58        90
         1.0       0.76      0.79      0.78       160

    accuracy                           0.71       250
   macro avg       0.68      0.67      0.68       250
weighted avg       0.70      0.71      0.71       250



In [7]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X, y)

In [8]:
X_test = test[f_cols].values

In [9]:
y_pred = log_reg.predict_proba(X_test)[:,1]

In [10]:
submission["target"] = y_pred

In [11]:
submission.to_csv("../submissions/01_weigthed_logreg.csv", index=False)

In [12]:
# Score 0.740
submission.head()

Unnamed: 0,id,target
0,250,0.19309
1,251,0.048199
2,252,0.703955
3,253,0.999904
4,254,0.220861


In [13]:
sort_idx = np.argsort(np.abs(log_reg.coef_))
sort_coef_vals = np.abs(log_reg.coef_)[0][sort_idx[0]]

In [14]:
sort_coef_vals[-50:]

array([0.30505514, 0.30626714, 0.31393156, 0.31494067, 0.31710385,
       0.31729446, 0.31954889, 0.32098995, 0.3224138 , 0.326504  ,
       0.32794461, 0.3336388 , 0.3347309 , 0.33614598, 0.33668442,
       0.34154986, 0.34219336, 0.34337978, 0.3456093 , 0.34593102,
       0.35350673, 0.35435691, 0.35672601, 0.35806973, 0.35862476,
       0.36249289, 0.36580139, 0.36622405, 0.36806515, 0.37181101,
       0.37477274, 0.3815167 , 0.39074524, 0.40714677, 0.41621236,
       0.43716457, 0.45825316, 0.46220986, 0.46545007, 0.46797557,
       0.49355177, 0.49886881, 0.56463455, 0.56491636, 0.56590056,
       0.58287906, 0.59937679, 0.64393797, 0.89200031, 1.08009901])

In [15]:
feature_idx = sort_idx[0][-20:]

In [16]:
feature_idx

array([244, 258, 201, 183,   1, 194, 299, 199,  17, 101,  43,  24, 117,
        82, 295, 217,  73,  91,  65,  33])

In [17]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[:,feature_idx][train_index], X[:,feature_idx][test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.8890972222222222
              precision    recall  f1-score   support

         0.0       0.71      0.79      0.75        90
         1.0       0.87      0.82      0.85       160

    accuracy                           0.81       250
   macro avg       0.79      0.80      0.80       250
weighted avg       0.81      0.81      0.81       250



In [18]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X[:,feature_idx], y)

In [19]:
# 0.801
X_test = test[f_cols].values[:,feature_idx]
y_pred = log_reg.predict_proba(X_test)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/02_weigthed_logreg_20features.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.901263
1,251,0.36811
2,252,0.875366
3,253,0.988967
4,254,0.210552


## Scaled

In [20]:
from sklearn import preprocessing

In [21]:
scaler = preprocessing.StandardScaler().fit(X)

In [22]:
X_scaled = scaler.transform(X)

In [23]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.767638888888889
              precision    recall  f1-score   support

         0.0       0.62      0.57      0.59        90
         1.0       0.77      0.81      0.79       160

    accuracy                           0.72       250
   macro avg       0.69      0.69      0.69       250
weighted avg       0.72      0.72      0.72       250



In [24]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled, y)

In [25]:
X_test_scaled = scaler.transform(X_test)

In [26]:
# 0.737
X_test = test[f_cols].values
X_test_scaled = scaler.transform(X_test)
y_pred = log_reg.predict_proba(X_test_scaled)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/03_weigthed_logreg_stdscaled.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.228902
1,251,0.036039
2,252,0.677954
3,253,0.999898
4,254,0.249494


In [27]:
sort_idx = np.argsort(np.abs(log_reg.coef_))
sort_coef_vals = np.abs(log_reg.coef_)[0][sort_idx[0]]

In [28]:
feature_idx = sort_idx[0][-20:]

In [29]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[:,feature_idx][train_index], X_scaled[:,feature_idx][test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.8888888888888888
              precision    recall  f1-score   support

         0.0       0.71      0.79      0.75        90
         1.0       0.87      0.82      0.85       160

    accuracy                           0.81       250
   macro avg       0.79      0.80      0.80       250
weighted avg       0.81      0.81      0.81       250



In [30]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled[:,feature_idx], y)

In [31]:
# 0.801
X_test = test[f_cols].values
X_test_scaled = scaler.transform(X_test)[:,feature_idx]
y_pred = log_reg.predict_proba(X_test_scaled)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/04_weigthed_logreg_stdscaled_20features.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.900693
1,251,0.366339
2,252,0.874833
3,253,0.989062
4,254,0.212043


## Robost scaled

In [32]:
from sklearn import preprocessing

In [33]:
scaler = preprocessing.RobustScaler().fit(X)

In [34]:
X_scaled = scaler.transform(X)

In [35]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7657638888888889
              precision    recall  f1-score   support

         0.0       0.60      0.54      0.57        90
         1.0       0.76      0.80      0.78       160

    accuracy                           0.71       250
   macro avg       0.68      0.67      0.68       250
weighted avg       0.70      0.71      0.70       250



In [36]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled, y)

In [37]:
X_test_scaled = scaler.transform(X_test)

In [38]:
sort_idx = np.argsort(np.abs(log_reg.coef_))
sort_coef_vals = np.abs(log_reg.coef_)[0][sort_idx[0]]

In [39]:
feature_idx = sort_idx[0][-20:]

In [40]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[:,feature_idx][train_index], X_scaled[:,feature_idx][test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.8895833333333333
              precision    recall  f1-score   support

         0.0       0.72      0.79      0.75        90
         1.0       0.87      0.82      0.85       160

    accuracy                           0.81       250
   macro avg       0.80      0.81      0.80       250
weighted avg       0.82      0.81      0.81       250



In [41]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled[:,feature_idx], y)

In [42]:
# 0.802
X_test = test[f_cols].values
X_test_scaled = scaler.transform(X_test)[:,feature_idx]
y_pred = log_reg.predict_proba(X_test_scaled)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/05_weigthed_logreg_robostscaled_20features.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.882927
1,251,0.367765
2,252,0.85499
3,253,0.983947
4,254,0.245938


## Standard scaled with regularisation on C

In [43]:
from sklearn import preprocessing

In [44]:
scaler = preprocessing.StandardScaler().fit(X)

In [45]:
X_scaled = scaler.transform(X)

In [46]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced", C=0.1).fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7690277777777778
              precision    recall  f1-score   support

         0.0       0.60      0.53      0.56        90
         1.0       0.75      0.80      0.78       160

    accuracy                           0.70       250
   macro avg       0.68      0.67      0.67       250
weighted avg       0.70      0.70      0.70       250



In [47]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled, y)

In [48]:
X_test_scaled = scaler.transform(X_test)

In [49]:
# 0.737
X_test = test[f_cols].values
X_test_scaled = scaler.transform(X_test)
y_pred = log_reg.predict_proba(X_test_scaled)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/06_weigthed_logreg_stdscaled_c_01.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.228902
1,251,0.036039
2,252,0.677954
3,253,0.999898
4,254,0.249494


In [50]:
sort_idx = np.argsort(np.abs(log_reg.coef_))
sort_coef_vals = np.abs(log_reg.coef_)[0][sort_idx[0]]

In [51]:
feature_idx = sort_idx[0][-20:]

In [52]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X_scaled[:,feature_idx][train_index], X_scaled[:,feature_idx][test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, class_weight="balanced").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.8888888888888888
              precision    recall  f1-score   support

         0.0       0.71      0.79      0.75        90
         1.0       0.87      0.82      0.85       160

    accuracy                           0.81       250
   macro avg       0.79      0.80      0.80       250
weighted avg       0.81      0.81      0.81       250



In [53]:
log_reg = LogisticRegression(random_state=0, class_weight="balanced").fit(X_scaled[:,feature_idx], y)

In [54]:
# 0.801
X_test = test[f_cols].values
X_test_scaled = scaler.transform(X_test)[:,feature_idx]
y_pred = log_reg.predict_proba(X_test_scaled)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/07_weigthed_logreg_stdscaled_c_01_20features.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.900693
1,251,0.366339
2,252,0.874833
3,253,0.989062
4,254,0.212043
