In [14]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

import pymc3 as pm
import theano as tt
from scipy.special import expit


# Lift pandas' display truncation so wide/long frames render in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Load the competition files. The training rows are shuffled; the shuffle is
# seeded (same value as the sampler seed used when drawing the trace) so that
# Restart & Run All sees the rows in the same order on every run.
train = pd.read_csv("../data/train.csv").sample(frac=1.0, random_state=4816)
test = pd.read_csv("../data/test.csv")
submission = pd.read_csv("../data/sample_submission.csv")

# Feature columns: every column except the row identifier and the label.
f_cols = [col for col in train.columns if col not in ["id", "target"]]

In [4]:
def get_predictions(x, threshold=0.5):
    """Turn scores into hard 0/1 class labels.

    Parameters
    ----------
    x : iterable of float
        Scores to binarise (e.g. predicted probabilities).
    threshold : float, default 0.5
        A score greater than or equal to this value is labelled 1,
        anything below it is labelled 0.

    Returns
    -------
    list of int
        One label (0 or 1) per element of ``x``, in the same order.
    """
    return [1 if score >= threshold else 0 for score in x]

In [5]:
# Design matrix (feature columns only) and label vector as plain arrays.
X = train[f_cols].values
y = train["target"].values

In [7]:
# Logistic regression with a per-feature inclusion indicator (spike-and-slab),
# inspired by
# https://www.kaggle.com/melondonkey/bayesian-spike-and-slab-in-pymc3
with pm.Model() as model:
    # Inclusion indicator, one per feature column: Bernoulli with prior
    # inclusion probability 0.1.
    xi = pm.Bernoulli("xi", .1, shape=X.shape[1])
    # Alpha is the intercept, with a Normal(0, 3) prior.
    alpha = pm.Normal("alpha", mu=0, sd=3)
    # Prior for the feature coefficients, one per feature column; a coefficient
    # only contributes to the predictor when its matching xi is 1.
    beta = pm.Normal("beta", mu=0, sd=1, shape=X.shape[1])
    # Linear predictor without the intercept: multiplying by xi zeroes out
    # the coefficients of excluded features.
    p = pm.math.dot(X,xi * beta) 
    # Bernoulli likelihood of the observed targets; the inverse logit maps
    # the linear predictor plus intercept to a probability.
    y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + alpha),  observed=y)
    

In [10]:
# Draw posterior samples: one chain on one core, with a fixed seed.
with model:
    trace = pm.sample(
        draws=2000,
        chains=1,
        cores=1,
        random_seed=4816,
        progressbar=True,
    )

Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [xi]
>NUTS: [beta, alpha]
Sampling chain 0, 0 divergences: 100%|██████████| 2500/2500 [02:09<00:00, 19.38it/s]
Only one chain was sampled, this makes it impossible to run some convergence checks


In [11]:
# Per-feature posterior summary derived from the sampled trace.
inclusion_draws = trace['xi']    # sampled 0/1 inclusion indicators
beta_draws = trace['beta']       # sampled coefficients

# Column-wise reductions over the posterior draws (axis 0).
inclusion_counts = inclusion_draws.sum(axis=0)
included_beta_sum = (inclusion_draws * beta_draws).sum(axis=0)

results = pd.DataFrame({
    # Sized from the trace itself rather than a hard-coded feature count.
    'var': np.arange(inclusion_draws.shape[1]),
    # Share of draws in which the feature was switched on.
    'inclusion_probability': inclusion_draws.mean(axis=0),
    # Mean coefficient over all draws, whether or not the feature was included.
    'beta': beta_draws.mean(axis=0),
    # Mean coefficient over only the draws in which the feature was included.
    # A feature that was never included has no such draws: it is reported as
    # NaN explicitly instead of going through a 0/0 division warning.
    'beta_given_inclusion': np.divide(
        included_beta_sum,
        inclusion_counts,
        out=np.full(included_beta_sum.shape, np.nan),
        where=inclusion_counts > 0,
    ),
})

In [12]:
# Show the 20 features with the highest posterior inclusion probability.
ranked_results = results.sort_values(by='inclusion_probability', ascending=False)
ranked_results.head(20)


Unnamed: 0,var,inclusion_probability,beta,beta_given_inclusion
65,65,1.0,1.874077,1.874077
33,33,1.0,2.161439,2.161439
217,217,0.9985,-1.325436,-1.327343
91,91,0.9975,-1.39173,-1.3951
73,73,0.971,-1.127479,-1.154421
199,199,0.9695,1.218786,1.254068
108,108,0.8285,-0.794136,-0.948812
295,295,0.826,-0.767713,-0.947336
189,189,0.7595,-0.732797,-0.925824
117,117,0.672,-0.63635,-0.931888


In [16]:
# Scoring check: apply the scoring formula to X using a single posterior
# sample (the first draw in the trace).
draw_index = 0
test_beta = trace['beta'][draw_index]
test_inc = trace['xi'][draw_index]
# Intercept plus the contribution of the features included in this draw.
draw_logit = trace['alpha'][draw_index] + np.dot(X, test_inc * test_beta)
test_score = expit(draw_logit)
test_score

array([8.55726144e-01, 1.21945280e-01, 3.01745903e-01, 9.99995898e-01,
       7.35813887e-01, 9.99697011e-01, 9.83724074e-01, 9.99188452e-01,
       9.99979899e-01, 4.75191119e-01, 9.99010193e-01, 2.39217631e-01,
       7.24111381e-01, 9.39814680e-01, 8.89372564e-01, 9.93994448e-01,
       9.98637670e-01, 5.78340623e-01, 8.60340801e-02, 6.52719124e-03,
       9.95931114e-01, 4.02702995e-01, 6.41360580e-01, 9.46183944e-01,
       9.99804028e-01, 9.73472926e-01, 9.99089198e-01, 4.45668144e-02,
       9.46977293e-01, 9.94989579e-01, 6.57891130e-01, 9.61181584e-01,
       3.78533341e-03, 8.92306765e-01, 8.62427094e-01, 8.70015639e-01,
       8.72288603e-01, 4.73555850e-01, 9.92072898e-01, 9.99999970e-01,
       9.82647425e-01, 8.26863100e-03, 9.99979723e-01, 9.39802509e-01,
       1.68768444e-02, 5.88930032e-02, 5.05850075e-01, 2.94068522e-01,
       9.65704200e-01, 9.99972849e-01, 8.18723739e-01, 9.85006373e-01,
       1.74793608e-01, 9.97950111e-01, 9.97590056e-01, 6.47727617e-01,
      

In [24]:
# Posterior-mean probability for every row of X.
# Effective coefficients per draw: beta where the feature is included, else 0.
estimate = trace['beta'] * trace['xi']
# One column of probabilities per posterior draw; the per-draw intercept
# broadcasts across the rows of X.
draw_probs = expit(trace['alpha'] + np.dot(X, estimate.T))
# Average over the draws with a vectorised reduction instead of a
# row-by-row np.apply_along_axis loop.
preds = draw_probs.mean(axis=1)


In [25]:
# Metrics for preds against y.
auc_score = roc_auc_score(y, preds)
print(f"Models AUC score: {auc_score}")

# Hard labels come from get_predictions (0.5 cut-off).
hard_labels = get_predictions(preds)
print(classification_report(y, hard_labels))

Models AUC score: 0.9983333333333333
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97        90
         1.0       0.98      0.99      0.98       160

    accuracy                           0.98       250
   macro avg       0.98      0.97      0.97       250
weighted avg       0.98      0.98      0.98       250



In [26]:
# Posterior-mean probability for every row of the test set.
# Effective coefficients per draw: beta where the feature is included, else 0.
estimate = trace['beta'] * trace['xi']
# Keep the test features under their own name. Rebinding X here would make
# the earlier cells that score X against y operate on test rows if they
# were re-run after this cell.
X_test = test[f_cols].values
# One column of probabilities per posterior draw, averaged over the draws
# with a vectorised reduction instead of np.apply_along_axis.
preds = expit(trace['alpha'] + np.dot(X_test, estimate.T)).mean(axis=1)


In [29]:
# 0.854  (NOTE(review): presumably the leaderboard score of this submission - confirm)
y_pred = preds

# Fill the target column of the sample submission and write it out
# without the index column.
submission_path = "../submissions/08_bayesian_logreg_experiment_1.csv"
submission["target"] = y_pred
submission.to_csv(submission_path, index=False)

submission.head()

Unnamed: 0,id,target
0,250,0.842929
1,251,0.741758
2,252,0.859591
3,253,0.99076
4,254,0.487063


In [30]:
# Summary statistics of the submitted target column.
submission["target"].describe()

count    19750.000000
mean         0.653300
std          0.304124
min          0.000085
25%          0.413365
50%          0.737336
75%          0.930217
max          0.999998
Name: target, dtype: float64