In [1]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
train = pd.read_csv("../data/train.csv").sample(frac=1.0)
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

f_cols = [col for col in train.columns if col not in ["id", "target"]]

In [3]:
def get_predictions(x):
    return [1 if xi >= 0.5 else 0 for xi in x]

In [4]:
X, y = train[f_cols].values, train["target"].values

In [5]:
X.shape

(250, 300)

In [6]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, C=1.).fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7685416666666666
              precision    recall  f1-score   support

         0.0       0.63      0.52      0.57        90
         1.0       0.75      0.82      0.79       160

    accuracy                           0.72       250
   macro avg       0.69      0.67      0.68       250
weighted avg       0.71      0.72      0.71       250



In [7]:
# Logreg MLE
log_reg = LogisticRegression(random_state=0, C=1.0).fit(X, y)

In [8]:
X_test = test[f_cols].values
y_pred = log_reg.predict_proba(X_test)[:,1]
submission["target"] = y_pred

In [9]:
# LB result 0.74
submission.to_csv("../submissions/09_logreg_MLE.csv", index=False)

In [10]:
# Score 0.740
submission.head()

Unnamed: 0,id,target
0,250,0.247957
1,251,0.065619
2,252,0.758937
3,253,0.999923
4,254,0.278511


# Gaussian logreg

## MAP l2 estimate C=0.3

In [9]:
loo = LeaveOneOut()
preds = np.zeros(len(y))
for i, (train_index, test_index) in enumerate(loo.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    clf = LogisticRegression(random_state=0, C=.3, penalty="l2", solver='liblinear').fit(X_train, y_train)
    preds[test_index] = clf.predict_proba(X_test)[:,1]

print(f"Models AUC score: {roc_auc_score(y, preds)}")
print(classification_report(y, get_predictions(preds)))

Models AUC score: 0.7697916666666667
              precision    recall  f1-score   support

         0.0       0.58      0.62      0.60        90
         1.0       0.78      0.74      0.76       160

    accuracy                           0.70       250
   macro avg       0.68      0.68      0.68       250
weighted avg       0.71      0.70      0.70       250



In [12]:
log_reg = LogisticRegression(random_state=0, C=.3, penalty="l2").fit(X_train, y_train)

In [13]:
# 0.741
X_test = test[f_cols].values
y_pred = log_reg.predict_proba(X_test)[:,1]
submission["target"] = y_pred
submission.to_csv("../submissions/12_logreg_MAP_l2_c_05.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.373342
1,251,0.159244
2,252,0.685791
3,253,0.999216
4,254,0.380589


## Bayesian gaussian

In [14]:
import pymc3 as pm
import theano as tt
from scipy.special import expit

In [49]:
with pm.Model() as model:
    # Alpha is the interception
    alpha = pm.Normal("alpha", mu=0, sd=3)
    # The prior for the features varibles which are included
    beta = pm.Normal("beta", mu=0, sd=3, shape=X.shape[1])
    # Deterministic function
    p = pm.math.dot(X,beta)
    # Likelihood
    y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + alpha),  observed=y)
    

In [16]:
with model:
    trace = pm.sample(2000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)

  trace = pm.sample(2000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (1 chains in 1 job)
NUTS: [beta, alpha]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 58 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [17]:
results = pd.DataFrame({'var': np.arange(300), 
                       'beta':np.apply_along_axis(np.mean, 0, trace['beta']),
                        'alpha':np.apply_along_axis(np.mean, 0, trace['alpha'])
                       })

In [18]:
results.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
var,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
beta,1.79678,-3.162023,0.491135,0.457325,-2.651044,-1.85476,-0.082372,0.521617,-1.780452,-3.042158
alpha,7.904928,7.904928,7.904928,7.904928,7.904928,7.904928,7.904928,7.904928,7.904928,7.904928


In [19]:
estimate = trace['beta']
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )

In [20]:
estimate = trace['beta']
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )

In [21]:
0.729
X= test[f_cols].values
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )
submission["target"] = preds
#submission.to_csv("../submissions/11_bayesian_logreg.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.376991
1,251,0.217868
2,252,0.426083
3,253,0.955587
4,254,0.438808


# Slap-and-spike prior

In [88]:
X, y = train[f_cols].values, train["target"].values

In [89]:
with pm.Model() as model:
    # priors inclusion probability
    xi = pm.Bernoulli("xi", .1, shape=X.shape[1])
    # Alpha is the interception
    alpha = pm.Normal("alpha", mu=0, sd=3)
    # The prior for the features varibles which are included
    beta = pm.Normal("beta", mu=0, sd=1, shape=X.shape[1])
    # Deterministic function
    p = pm.math.dot(X,xi * beta) 
    # Likelihood
    y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + alpha),  observed=y)
 

In [90]:
with model:
    trace = pm.sample(4000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)
    

  trace = pm.sample(4000, random_seed = 4816, cores = 1, progressbar = True, chains = 1)
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [xi]
>NUTS: [beta, alpha]


Sampling 1 chain for 1_000 tune and 4_000 draw iterations (1_000 + 4_000 draws total) took 140 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [91]:
results = pd.DataFrame({'var': np.arange(300), 
                        'inclusion_probability':np.apply_along_axis(np.mean, 0, trace['xi']),
                       'beta':np.apply_along_axis(np.mean, 0, trace['beta']),
                       'beta_given_inclusion': np.apply_along_axis(np.sum, 0, trace['xi']*trace['beta'])
                            /np.apply_along_axis(np.sum, 0, trace['xi'])
                       })

In [92]:
results.sort_values('inclusion_probability', ascending = False).head(10)


Unnamed: 0,var,inclusion_probability,beta,beta_given_inclusion
33,33,1.0,2.169756,2.169756
65,65,1.0,1.871748,1.871748
217,217,0.99925,-1.330508,-1.331512
91,91,0.99725,-1.377645,-1.381338
199,199,0.9735,1.228635,1.256696
73,73,0.9605,-1.09512,-1.13514
295,295,0.8155,-0.800856,-0.97365
108,108,0.78525,-0.736136,-0.944836
189,189,0.73875,-0.700753,-0.939767
117,117,0.67875,-0.617163,-0.924473


## MAP estimate

In [93]:
from scipy.special import expit

In [94]:
map_estimate = results["inclusion_probability"] * results["beta"]

In [99]:
X = test[f_cols].values
map_preds = expit(trace["alpha"].mean() + np.dot(X, np.transpose(map_estimate)))

In [101]:
# 0.853
submission["target"] = map_preds
submission.to_csv("../submissions/14_sas_MAP_logreg.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.974777
1,251,0.883088
2,252,0.97557
3,253,0.997589
4,254,0.402101


## Bayesian

In [96]:
estimate = trace['beta'] * trace['xi'] 
X = test[f_cols].values
preds = np.apply_along_axis(np.mean, 1, expit(trace['alpha'] + np.dot(X, np.transpose(estimate) )) )


In [108]:
# 0.855
submission["target"] = preds
submission.to_csv("../submissions/15_sas_Bayesian_logreg.csv", index=False)
submission.head()

Unnamed: 0,id,target
0,250,0.837159
1,251,0.732443
2,252,0.840483
3,253,0.988602
4,254,0.452466


In [97]:
preds[:10]

array([0.83715891, 0.73244307, 0.84048265, 0.98860241, 0.4524659 ,
       0.36026956, 0.40309519, 0.22405392, 0.9329661 , 0.2476552 ])

In [100]:
map_preds[:10]

array([0.9747766 , 0.88308826, 0.97556966, 0.99758903, 0.40210115,
       0.27738468, 0.50479463, 0.04066577, 0.99641289, 0.10191702])

In [106]:
(map_preds[map_preds > 0.5]).mean(), (map_preds[map_preds <= 0.5]).mean()

(0.9084401431863095, 0.1648489675698241)

In [107]:
(preds[preds > 0.5]).mean(), (preds[preds <= 0.5]).mean()

(0.827885764432048, 0.25476991602511495)