In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
train = pd.read_csv("../data/train.csv").sample(frac=1.0)
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

f_cols = [col for col in train.columns if col not in ["id", "target"]]

In [3]:
def get_predictions(x):
    return [1 if xi >= 0.5 else 0 for xi in x]

In [4]:
X, y = train[f_cols].values, train["target"].values

In [5]:
X.shape

(250, 300)

In [6]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, C=1.).fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7685416666666666
              precision    recall  f1-score   support

         0.0       0.63      0.52      0.57        90
         1.0       0.75      0.82      0.79       160

    accuracy                           0.72       250
   macro avg       0.69      0.67      0.68       250
weighted avg       0.71      0.72      0.71       250



In [7]:
# Logreg MLE
log_reg = LogisticRegression(random_state=0, C=1.0).fit(X, y)

In [8]:
X_test = test[f_cols].values

In [9]:
y_pred = log_reg.predict_proba(X_test)[:,1]

In [10]:
submission["target"] = y_pred

In [11]:
# LB result 0.74
submission.to_csv("../submissions/09_logreg_MLE.csv", index=False)

In [12]:
# Score 0.740
submission.head()

Unnamed: 0,id,target
0,250,0.247957
1,251,0.065619
2,252,0.758937
3,253,0.999923
4,254,0.278511


# MAP l2 estimate C=0.3

In [13]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, C=.3, penalty="l2").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7696527777777779
              precision    recall  f1-score   support

         0.0       0.62      0.50      0.56        90
         1.0       0.75      0.83      0.79       160

    accuracy                           0.71       250
   macro avg       0.69      0.67      0.67       250
weighted avg       0.70      0.71      0.70       250



In [14]:
log_reg = LogisticRegression(random_state=0, C=.3, penalty="l2").fit(X_train, y_train)

In [15]:
# 0.741
X_test = test[f_cols].values
y_pred = log_reg.predict_proba(X_test)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/12_logreg_MAP_l2_c_05.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.362123
1,251,0.242299
2,252,0.776578
3,253,0.999183
4,254,0.644545


# Bayesian approach

In [None]:
import pymc3 as pm
import theano as tt
from scipy.special import expit

In [None]:
with pm.Model() as model:
    # Alpha is the interception
    alpha = pm.Normal("alpha", mu=0, sd=3)
    # The prior for the features varibles which are included
    beta = pm.Normal("beta", mu=0, sd=3, shape=X.shape[1])
    # Deterministic function
    p = pm.math.dot(X,beta)
    # Likelihood
    y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + alpha),  observed=y)
    

In [None]:
with model:
    trace = pm.sample(2000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)

In [None]:
results = pd.DataFrame({'var': np.arange(300), 
                       'beta':np.apply_along_axis(np.mean, 0, trace['beta']),
                        'alpha':np.apply_along_axis(np.mean, 0, trace['alpha'])
                       })

In [None]:
results.head(10).T

In [None]:
estimate = trace['beta']
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )

In [None]:
print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

In [None]:
0.729
X= test[f_cols].values
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )
submission["target"] = preds
submission.to_csv("../submissions/11_bayesian_logreg.csv", index=False)
submission.head()

In [None]:
results

In [None]:
log_reg.intercept_

In [None]:
log_reg.coef_

# MAP l2 estimate C=0.05

High regularization
http://www.natelemoine.com/pdfs/Lemoine%202019a.pdf

In [16]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, C=.1, penalty="l2").fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7708333333333334
              precision    recall  f1-score   support

         0.0       0.64      0.47      0.54        90
         1.0       0.74      0.85      0.79       160

    accuracy                           0.71       250
   macro avg       0.69      0.66      0.66       250
weighted avg       0.70      0.71      0.70       250



In [17]:
log_reg = LogisticRegression(random_state=0, C=.05, penalty="l2").fit(X_train, y_train)

In [18]:
# 0.746
X_test = test[f_cols].values
y_pred = log_reg.predict_proba(X_test)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/12_logreg_MAP_l2_c_05.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.571925
1,251,0.457926
2,252,0.646091
3,253,0.98324
4,254,0.632913


# Bayesian approach

In [19]:
import pymc3 as pm
import theano as tt
from scipy.special import expit

In [20]:
with pm.Model() as model:
    # Alpha is the interception
    alpha = pm.Normal("alpha", mu=0, sd=3)
    # The prior for the features varibles which are included
    beta = pm.Normal("beta", mu=0, sd=20, shape=X.shape[1])
    # Deterministic function
    p = pm.math.dot(X,beta)
    # Likelihood
    y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + alpha),  observed=y)
    

In [21]:
with model:
    trace = pm.sample(2000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)

  trace = pm.sample(2000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [beta, alpha]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 252 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [22]:
results = pd.DataFrame({'var': np.arange(300), 
                       'beta':np.apply_along_axis(np.mean, 0, trace['beta']),
                        'alpha':np.apply_along_axis(np.mean, 0, trace['alpha'])
                       })

In [23]:
results.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
var,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
beta,11.715648,-21.405001,6.904288,3.353643,-18.134023,-10.916444,-0.162802,3.718685,-13.468696,-20.899708
alpha,2.264032,2.264032,2.264032,2.264032,2.264032,2.264032,2.264032,2.264032,2.264032,2.264032


In [24]:
estimate = trace['beta']
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )

In [25]:
print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        90
         1.0       1.00      1.00      1.00       160

    accuracy                           1.00       250
   macro avg       1.00      1.00      1.00       250
weighted avg       1.00      1.00      1.00       250



In [26]:
0.714
X= test[f_cols].values
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )
submission["target"] = preds
submission.to_csv("../submissions/13_bayesian_logreg_s20.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.253239
1,251,0.139481
2,252,0.288882
3,253,0.910143
4,254,0.37894


In [27]:
results

Unnamed: 0,var,beta,alpha
0,0,11.715648,2.264032
1,1,-21.405001,2.264032
2,2,6.904288,2.264032
3,3,3.353643,2.264032
4,4,-18.134023,2.264032
5,5,-10.916444,2.264032
6,6,-0.162802,2.264032
7,7,3.718685,2.264032
8,8,-13.468696,2.264032
9,9,-20.899708,2.264032


In [28]:
log_reg.intercept_

array([1.00962123])

In [29]:
log_reg.coef_

array([[ 8.62333522e-02, -1.32457981e-01, -1.84989170e-03,
        -9.88221237e-05, -1.43798806e-01, -1.05239688e-01,
        -2.56458977e-02,  6.51421372e-03, -6.84273324e-02,
        -1.69388239e-01, -7.70970407e-03, -7.21240993e-02,
         3.78468875e-03,  1.67919972e-01,  7.72187010e-02,
        -1.05707967e-01, -1.43149044e-01,  1.84025268e-01,
         8.98385499e-02, -5.97453228e-03,  7.53141801e-02,
        -1.49262332e-02, -1.77155449e-02,  5.16991824e-03,
         1.94442474e-01,  7.31605159e-02, -5.56520968e-02,
         4.61571368e-02, -7.87562984e-02,  4.01156095e-02,
         1.27765472e-01,  4.75198406e-03, -1.32407765e-03,
         4.54007292e-01, -4.62177076e-03, -3.37444049e-02,
         2.64129015e-02,  1.36123155e-02, -6.94730867e-03,
        -3.78262117e-02,  2.06241750e-03, -3.64518672e-02,
         6.69864420e-02, -1.92844416e-01, -6.17029729e-02,
        -1.29880245e-01,  1.30024530e-01,  2.64485749e-02,
        -3.82977830e-02, -2.93852617e-02,  1.01727613e-0