In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# Lift pandas' display truncation so frames are rendered with every column and row.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [None]:
import pandas as pd

## New data

In [None]:
# Each split is stored as a space-separated, header-less feature file plus a
# label file; the labels are attached to the features as a "label" column.
y = pd.read_csv("y_train.txt", header=None).rename(columns={0: "label"})
X_train = pd.read_csv("X_train.txt", header=None, sep=" ").assign(label=y["label"])

# `y` is deliberately reused: after this cell it holds the *test* labels.
y = pd.read_csv("y_test.txt", header=None).rename(columns={0: "label"})
X_test = pd.read_csv("X_test.txt", header=None, sep=" ").assign(label=y["label"])

In [None]:
# Report the raw split sizes, then the class balance of the two labels (4 and 5)
# that the rest of the notebook keeps.
print(f'X_train and X_test shape: {X_train.shape, X_test.shape}')
train_idx = X_train["label"].isin([4, 5])
kept_labels = X_train.loc[train_idx, "label"]
print(np.unique(kept_labels, return_counts=True))

X_train and X_test shape: ((7767, 562), (3162, 562))
(array([4, 5]), array([1293, 1423]))


In [None]:
# Sitting vs standing: keep only labels 4 and 5 and re-encode them as 0 / 1.
# NOTE: this cell overwrites X_train / X_test, so it is not idempotent --
# running it a second time finds no 4/5 labels left and drops every row.
label_dict = {4: 0, 5: 1}

train_idx = X_train["label"].isin([4, 5])
X_train = (
    X_train.loc[train_idx]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame["label"].map(label_dict))
)

test_idx = X_test["label"].isin([4, 5])
X_test = (
    X_test.loc[test_idx]
    .reset_index(drop=True)
    .assign(label=lambda frame: frame["label"].map(label_dict))
)
y_test = X_test["label"]

print(f'X_train and X_test shape after filtering: {X_train.shape, X_test.shape}')

In [None]:
def get_predictions(x):
    """Turn an iterable of scores into hard 0/1 labels.

    A score greater than or equal to 0.5 maps to 1, anything else to 0.
    Returns a plain Python list with one label per input score.
    """
    labels = []
    for score in x:
        labels.append(1 if score >= 0.5 else 0)
    return labels

In [None]:
# Print the shapes of both splits after the sitting/standing filter.
print(f'X_train and X_test shape after filtering: {X_train.shape, X_test.shape}')

X_train and X_test shape after filtering: ((2716, 562), (1064, 562))


In [None]:
# Accumulators filled by every experiment below, keyed by model name + sample size.
result_dataframe_dict = dict()   # key -> per-class metrics DataFrame
weight_list_dict = dict()        # key -> fitted weights / posterior summaries

# Training-subset sizes to evaluate; the last entry (2716) matches the number of
# filtered training rows printed above, i.e. the full training set.
amount_of_data = [50, 100, 200, 400, 800, 1600, 2716]

## Logistic regression (LogReg)

In [None]:
# Logistic regression (C=1.0) fitted on random training subsets of increasing
# size and scored on the fixed test split.
_X_test = X_test.drop(columns="label").values  # loop-invariant, so built once

for n_samples in amount_of_data:
  # Same seed for every model, so all of them see identical subsets.
  X_train_l = X_train.sample(n_samples, random_state=55).reset_index(drop=True)
  X, y = X_train_l.drop(columns="label").values, X_train_l.label

  log_reg = LogisticRegression(random_state=0, C=1.0, max_iter=500).fit(X, y)  # fit train set
  y_pred = log_reg.predict_proba(_X_test)[:, 1]

  # ROC AUC is a ranking metric and needs the continuous scores. Feeding it the
  # thresholded 0/1 labels collapses the curve to a single point (the value
  # then equals the macro-average recall) and is not comparable with the
  # Bayesian models, which are scored on probabilities.
  roc_auc = roc_auc_score(y_test, y_pred)
  report = classification_report(y_test, get_predictions(y_pred), output_dict=True)

  # 'accuracy' is a scalar entry of the report; take it out before building the
  # per-class table and attach it as a constant column instead.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy

  name_ = 'LogReg_' + str(n_samples)
  result_dataframe_dict[name_] = report_df
  weight_list_dict[name_] = log_reg.coef_[0]

## Gaussian LogReg
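MAP estimate under a Gaussian (L2) prior on the weights, using inverse regularization strength C = 0.3 (stronger regularization than the C = 1.0 model above).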

In [None]:
# L2-regularised logistic regression with C=0.3, fitted on random training
# subsets of increasing size and scored on the fixed test split.
_X_test = X_test.drop(columns="label").values  # loop-invariant, so built once

for n_samples in amount_of_data:
  # Same seed for every model, so all of them see identical subsets.
  X_train_g = X_train.sample(n_samples, random_state=55).reset_index(drop=True)
  X, y = X_train_g.drop(columns="label").values, X_train_g.label

  log_reg = LogisticRegression(random_state=0, C=.3, penalty="l2", max_iter=500).fit(X, y)
  y_pred = log_reg.predict_proba(_X_test)[:, 1]

  # ROC AUC must be computed from the continuous scores; on thresholded 0/1
  # labels it degenerates to the macro-average recall and cannot be compared
  # with the Bayesian models, which are scored on probabilities.
  roc_auc = roc_auc_score(y_test, y_pred)
  report = classification_report(y_test, get_predictions(y_pred), output_dict=True)

  # 'accuracy' is a scalar entry of the report; take it out before building the
  # per-class table and attach it as a constant column instead.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy

  name_ = 'GaussianLogReg_' + str(n_samples)
  result_dataframe_dict[name_] = report_df
  weight_list_dict[name_] = log_reg.coef_[0]

## Bayesian MAP estimate with Spike-and-Slab prior and Bayesian LogReg

In [None]:
import pymc3 as pm
import theano as tt
from scipy.special import expit
from scipy.stats import norm, bernoulli

In [None]:
def spike_slab_log_prior(gamma: np.array, alpha: np.array, p, gamma_mu=0, sigma_beta=3):
    """Log-density of the spike-and-slab prior, summed over all coefficients.

    gamma      -- 0/1 inclusion indicators, scored as Bernoulli(p)
    alpha      -- coefficient values, scored as Normal(gamma_mu, sigma_beta)
    p          -- prior inclusion probability
    gamma_mu   -- mean of the Normal prior on alpha
    sigma_beta -- standard deviation (scipy `scale`) of the Normal prior on alpha

    The Normal term is applied to every entry of alpha, whatever its indicator.
    """
    inclusion_logp = bernoulli.logpmf(gamma, p=p)
    slab_logp = norm.logpdf(alpha, loc=gamma_mu, scale=sigma_beta)
    return (inclusion_logp + slab_logp).sum()

In [None]:
def log_likelihood(a, gamma, alpha, X, T):
    """Bernoulli log-likelihood of binary targets under the logistic model.

    a     -- intercept
    gamma -- 0/1 inclusion indicators, one per feature
    alpha -- coefficients, one per feature (effective weight is gamma * alpha)
    X     -- feature matrix, one row per observation
    T     -- binary targets aligned with the rows of X

    Returns the summed log-likelihood.

    The log-probabilities are taken straight from the logits:
        log(sigmoid(z))     = -logaddexp(0, -z)
        log(1 - sigmoid(z)) = -logaddexp(0,  z)
    Going through expit() and then log() saturates to log(0) = -inf for large
    |z|, and the 0 * -inf products then turn the whole sum into NaN.
    """
    logits = a + np.dot(X, np.transpose(gamma * alpha))
    log_p_one = -np.logaddexp(0, -logits)
    log_p_zero = -np.logaddexp(0, logits)
    return (T * log_p_one + (1 - T) * log_p_zero).sum()

In [None]:
# Module-level defaults for scoring posterior samples.
prob = 0.1
gamma_mu = 0
sigma_beta = 3

def find_spike_slab_MAP(trace, X, y, prob, gamma_mu, sigma_beta):
    """Return the sample of `trace` with the highest log-likelihood + log-prior.

    trace      -- indexable collection of samples; each sample exposes the keys
                  "gamma_i", "alpha" and "a"
    X, y       -- training features and binary targets used for the likelihood
    prob       -- prior inclusion probability passed to spike_slab_log_prior
    gamma_mu   -- prior mean passed to spike_slab_log_prior
    sigma_beta -- prior scale passed to spike_slab_log_prior

    Ties go to the later sample. A sample whose score is NaN never wins a
    comparison; if no sample ever wins (or the trace is empty) the function
    falls back to `trace[-1]`.
    """
    best_log_post = -np.inf
    best_idx = -1
    for idx in range(len(trace)):
        sample = trace[idx]
        included, coefs = sample["gamma_i"], sample["alpha"]
        log_prior = spike_slab_log_prior(included, coefs, p=prob, gamma_mu=gamma_mu, sigma_beta=sigma_beta)
        log_lik = log_likelihood(sample["a"], included, coefs, X, y)
        log_post = log_lik + log_prior
        if log_post >= best_log_post:
            best_log_post = log_post
            best_idx = idx
    return trace[best_idx]

In [None]:
# Spike-and-slab Bayesian logistic regression, refitted for every training-set
# size. Each fit is evaluated twice on the test split:
#   1. with the single best-scoring posterior sample ("MAP"), and
#   2. with the posterior-mean predicted probability over all samples.
_X_test = X_test[[col for col in X_test.columns if col != "label"]].values  # loop-invariant

for ii in range(len(amount_of_data)):
  X_train_s = X_train.sample(amount_of_data[ii], random_state=55).reset_index(drop=True)
  X, y = X_train_s[[col for col in X_train_s.columns if col != "label"]].values, X_train_s.label
  n_features = X.shape[1]

  # This model is inspired by the following notebook
  # https://www.kaggle.com/melondonkey/bayesian-spike-and-slab-in-pymc3
  prob = 0.1       # prior inclusion probability of each feature
  a_mu = 0         # prior mean of the intercept
  a_var = 3        # passed as `sd`: a standard deviation, despite the name
  alpha_mu = 0     # prior mean of the coefficients
  gamma_var = 1    # passed as `sd`: a standard deviation, despite the name

  with pm.Model() as model:
      # Prior inclusion indicator, one per feature
      gamma_i = pm.Bernoulli("gamma_i", prob, shape=n_features)
      # a is the intercept
      a = pm.Normal("a", mu=a_mu, sd=a_var)
      # Prior for the coefficients of the feature variables
      alpha = pm.Normal("alpha", mu=alpha_mu, sd=gamma_var, shape=n_features)
      # Linear predictor: only included features (gamma_i == 1) contribute
      p = pm.math.dot(X, gamma_i * alpha)
      # Likelihood
      y_obs = pm.Bernoulli("y_obs", pm.invlogit(p + a), observed=y)

  with model:
    trace = pm.sample(2000, random_seed=4816, cores=1, progressbar=True, chains=1)

  #################################################
  # (1) MAP-style point estimate.
  # Score the samples under the SAME coefficient prior the model was sampled
  # with (mean alpha_mu, sd gamma_var). The module-level gamma_mu / sigma_beta
  # defaults describe a different, wider N(0, 3) prior, and using them would
  # select the sample against a posterior this model never defined.
  map_trace = find_spike_slab_MAP(trace, X, y, prob, alpha_mu, gamma_var)
  map_estimate = map_trace["gamma_i"] * map_trace["alpha"]
  map_preds = expit(map_trace["a"] + np.dot(_X_test, np.transpose(map_estimate)))

  roc_auc = roc_auc_score(y_test, map_preds)
  report = classification_report(y_test, get_predictions(map_preds), output_dict=True)
  # 'accuracy' is a scalar entry of the report; attach it as a constant column.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy
  name_ = 'Spike-and-Slab_Bayesian_MAP' + str(amount_of_data[ii])
  result_dataframe_dict[name_] = report_df

  weight_list_dict[name_] = map_trace

  #################################################
  # (2) Posterior-mean prediction: average the predicted probability over all
  # samples. The logits are (n_test, n_samples); trace['a'] broadcasts per sample.
  gamma_samples, alpha_samples = trace['gamma_i'], trace['alpha']
  estimate = alpha_samples * gamma_samples
  preds = expit(trace['a'] + np.dot(_X_test, np.transpose(estimate))).mean(axis=1)

  roc_auc = roc_auc_score(y_test, preds)
  report = classification_report(y_test, get_predictions(preds), output_dict=True)
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy
  name_ = '_Bayesian_LogReg' + str(amount_of_data[ii])
  result_dataframe_dict[name_] = report_df

  # Per-feature posterior summaries. A feature that was never included has an
  # undefined conditional mean (0 / 0), which is left as NaN without a warning.
  inclusion_counts = gamma_samples.sum(axis=0)
  with np.errstate(divide='ignore', invalid='ignore'):
    alpha_given_inclusion = estimate.sum(axis=0) / inclusion_counts
  results = pd.DataFrame({'var': np.arange(n_features),  # follows the data, not a fixed 561
                          'inclusion_probability': gamma_samples.mean(axis=0),
                          'alpha': alpha_samples.mean(axis=0),
                          'alpha_given_inclusion': alpha_given_inclusion
                          })
  mean_trace_a = np.mean(trace['a'])

  weight_list_dict[name_] = results
  weight_list_dict[name_+'mean_trace_a'] = mean_trace_a



Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 314 seconds.
There were 10 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 325 seconds.
There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 426 seconds.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 670 seconds.
There were 17 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 913 seconds.
There were 46 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 1509 seconds.
There were 212 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks
Sequential sampling (1 chains in 1 job)
CompoundStep
>BinaryGibbsMetropolis: [gamma_i]
>NUTS: [alpha, a]


Sampling 1 chain for 1_000 tune and 2_000 draw iterations (1_000 + 2_000 draws total) took 2937 seconds.
There were 251 divergences after tuning. Increase `target_accept` or reparameterize.
Only one chain was sampled, this makes it impossible to run some convergence checks


In [None]:
# Display every metrics table collected so far (one DataFrame per model/sample-size key).
result_dataframe_dict

{'GaussianLogReg_100':               precision    recall  f1-score  support       AUC  accuracy
 0              0.893478  0.809055  0.849174    508.0  0.860463  0.862782
 1              0.839404  0.911871  0.874138    556.0  0.860463  0.862782
 macro avg      0.866441  0.860463  0.861656   1064.0  0.860463  0.862782
 weighted avg   0.865221  0.862782  0.862219   1064.0  0.860463  0.862782,
 'GaussianLogReg_1600':               precision    recall  f1-score  support       AUC  accuracy
 0              0.956250  0.903543  0.929150    508.0  0.932887  0.934211
 1              0.916096  0.962230  0.938596    556.0  0.932887  0.934211
 macro avg      0.936173  0.932887  0.933873   1064.0  0.932887  0.934211
 weighted avg   0.935267  0.934211  0.934086   1064.0  0.932887  0.934211,
 'GaussianLogReg_200':               precision    recall  f1-score  support       AUC  accuracy
 0              0.858779  0.885827  0.872093    508.0  0.876367   0.87594
 1              0.892593  0.866906  0.87956

In [None]:
# Display every stored weight artefact collected so far, keyed by model/sample size.
weight_list_dict

{'GaussianLogReg_100': array([ 6.57863335e-04, -3.97863572e-03, -3.29121978e-02,  1.17209050e-02,
         4.99981625e-04, -3.08634497e-02,  1.09644615e-02, -1.73375429e-03,
        -2.35189220e-02,  1.85170097e-03,  1.24503142e-02, -5.81669954e-02,
        -1.73814678e-02, -7.35229276e-05, -1.10276995e-02, -6.22458421e-03,
         1.53277408e-03, -8.78371153e-04, -9.65506673e-03,  9.93810486e-03,
        -4.43933256e-03, -1.65472341e-02, -9.90915763e-02, -8.50978842e-02,
        -2.23426155e-01,  2.21197349e-01, -1.03924962e-01,  6.50981777e-02,
        -9.38253871e-02, -8.99892406e-02,  1.30274955e-02,  1.60954702e-02,
        -1.22513845e-01, -5.76179227e-02,  9.04129507e-02, -9.14636425e-02,
        -2.14686842e-01,  6.68914936e-02,  1.13008389e-01, -9.48712054e-02,
         9.82326596e-02, -5.80390237e-01, -1.87496851e-01,  2.38734437e-02,
        -2.82065702e-03,  4.78764200e-03,  2.26928148e-02, -3.10344025e-03,
         5.44110344e-03,  1.01529301e-01, -5.71451948e-01, -2.0009

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes fitted on random training subsets of increasing size and
# scored on the fixed test split.
_X_test = X_test.drop(columns="label").values  # loop-invariant, so built once

for n_samples in amount_of_data:
  # Same seed for every model, so all of them see identical subsets.
  X_train_l = X_train.sample(n_samples, random_state=55).reset_index(drop=True)
  X, y = X_train_l.drop(columns="label").values, X_train_l.label

  clf = GaussianNB().fit(X, y)  # fit train set
  y_pred = clf.predict_proba(_X_test)[:, 1]

  # ROC AUC must be computed from the continuous scores; on thresholded 0/1
  # labels it degenerates to the macro-average recall and cannot be compared
  # with the Bayesian models, which are scored on probabilities.
  roc_auc = roc_auc_score(y_test, y_pred)
  report = classification_report(y_test, get_predictions(y_pred), output_dict=True)

  # 'accuracy' is a scalar entry of the report; take it out before building the
  # per-class table and attach it as a constant column instead.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy

  name_ = 'Naive_Bayes_' + str(n_samples)
  result_dataframe_dict[name_] = report_df

  #print(report_df)

## Random Forest/ Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Cross-validation splitter shared by both grid searches. It shuffles, so it is
# seeded: otherwise the folds, and with them the selected hyper-parameters,
# change on every run.
n_fold = 5
folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=0)

In [None]:
# Hyper-parameter search for the decision tree.
# The search is fitted on the TRAINING split with the "label" column removed.
# Fitting it on X_test / y_test would (a) tune on the data later used for
# evaluation and (b) hand the target to the tree as a feature, because X_test
# still carries its "label" column.
dtc = DecisionTreeClassifier(random_state=0)  # seeded: max_features subsamples features

parameter_grid = {'max_features': [25, 50, 75, 100, 150, 200],
                  'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15]
                 }

grid_search_tree = GridSearchCV(dtc, param_grid=parameter_grid, cv=folds, scoring='roc_auc', n_jobs=-1)
_X_tune = X_train.drop(columns="label").values
grid_search_tree.fit(_X_tune, X_train["label"])



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15],
                         'max_features': [25, 50, 75, 100, 150, 200]},
             scoring='roc_auc')

In [None]:
# Hyper-parameter search for the random forest.
# The search is fitted on the TRAINING split with the "label" column removed.
# Fitting it on X_test / y_test would (a) tune on the data later used for
# evaluation and (b) hand the target to the forest as a feature, because X_test
# still carries its "label" column.
rfc = RandomForestClassifier(random_state=0)  # seeded so the search is reproducible

parameter_grid = {'max_features': [25, 50, 75, 100, 150, 200],
                  'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15]
                 }

grid_search_forest = GridSearchCV(rfc, param_grid=parameter_grid, cv=folds, scoring='roc_auc', n_jobs=-1)
_X_tune = X_train.drop(columns="label").values
grid_search_forest.fit(_X_tune, X_train["label"])



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
             estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15],
                         'max_features': [25, 50, 75, 100, 150, 200]},
             scoring='roc_auc')

In [None]:
# Random forest, using the hyper-parameters selected by the grid search, fitted
# on random training subsets of increasing size and scored on the test split.
_X_test = X_test.drop(columns="label").values  # loop-invariant, so built once

for n_samples in amount_of_data:
  # Same seed for every model, so all of them see identical subsets.
  X_train_l = X_train.sample(n_samples, random_state=55).reset_index(drop=True)
  X, y = X_train_l.drop(columns="label").values, X_train_l.label

  # Seeded so repeated runs produce the same forest and the same metrics.
  rfc = RandomForestClassifier(random_state=0, **grid_search_forest.best_params_).fit(X, y)
  y_pred = rfc.predict_proba(_X_test)[:, 1]

  # ROC AUC must be computed from the continuous scores; on thresholded 0/1
  # labels it degenerates to the macro-average recall and cannot be compared
  # with the Bayesian models, which are scored on probabilities.
  roc_auc = roc_auc_score(y_test, y_pred)
  report = classification_report(y_test, get_predictions(y_pred), output_dict=True)

  # 'accuracy' is a scalar entry of the report; take it out before building the
  # per-class table and attach it as a constant column instead.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy

  name_ = 'Random_Forest' + str(n_samples)
  result_dataframe_dict[name_] = report_df

In [None]:
# Decision tree, using the hyper-parameters selected by the grid search, fitted
# on random training subsets of increasing size and scored on the test split.
_X_test = X_test.drop(columns="label").values  # loop-invariant, so built once

for n_samples in amount_of_data:
  # Same seed for every model, so all of them see identical subsets.
  X_train_l = X_train.sample(n_samples, random_state=55).reset_index(drop=True)
  X, y = X_train_l.drop(columns="label").values, X_train_l.label

  # Seeded: max_features < n_features makes the split search random.
  dtc = DecisionTreeClassifier(random_state=0, **grid_search_tree.best_params_).fit(X, y)
  y_pred = dtc.predict_proba(_X_test)[:, 1]

  # ROC AUC must be computed from the continuous scores; on thresholded 0/1
  # labels it degenerates to the macro-average recall and cannot be compared
  # with the Bayesian models, which are scored on probabilities.
  roc_auc = roc_auc_score(y_test, y_pred)
  report = classification_report(y_test, get_predictions(y_pred), output_dict=True)

  # 'accuracy' is a scalar entry of the report; take it out before building the
  # per-class table and attach it as a constant column instead.
  accuracy = report.pop('accuracy')
  report_df = pd.DataFrame(report).transpose()
  report_df['AUC'] = roc_auc
  report_df['accuracy'] = accuracy

  name_ = 'Decision_Tree' + str(n_samples)
  result_dataframe_dict[name_] = report_df

## Save tables to external file

In [None]:
import csv

In [None]:
# Tag each metrics table with its model key (written into the stored frames
# themselves), then stack them into a single long table.
for model_key in result_dataframe_dict:
  result_dataframe_dict[model_key]['model'] = model_key

frames = [metrics for metrics in result_dataframe_dict.values()]
result_dataframe = pd.concat(frames)

In [None]:
# Convert every stored weight artefact into a DataFrame tagged with its model
# key, so that all of them can be concatenated into one table.
temp_df_list = []
for key, value in weight_list_dict.items():
  if isinstance(value, np.ndarray):
    # Coefficient vector of a fitted model
    temp_value = pd.DataFrame(value, columns=['weights'])
  elif isinstance(value, dict):
    # Single posterior sample: one column per entry of the dict
    temp_value = pd.DataFrame(data=value)
  elif isinstance(value, pd.DataFrame):
    # Copy, so tagging does not add a 'model' column to the stored frame
    temp_value = value.copy()
  elif isinstance(value, (float, np.floating)):
    # Scalar summary; np.floating also covers NumPy floats that are not `float`
    temp_value = pd.DataFrame(data=[value], columns=['mean_trace_a'])
  else:
    # Name the offending entry instead of printing an anonymous marker
    print(f'error?? unsupported weight entry {key!r} of type {type(value).__name__}')
    continue
  temp_value['model'] = key
  temp_df_list.append(temp_value)


In [None]:
# Stack all per-model weight frames; columns missing from a frame are filled with NaN.
# The original row indices are kept, so index values repeat across models.
weight_dataframe = pd.concat(temp_df_list)

In [None]:
# Display the combined weight table.
weight_dataframe

Unnamed: 0,weights,model,gamma_i,a,alpha,var,inclusion_probability,alpha_given_inclusion,mean_trace_a
0,0.0006398125,LogReg_50,,,,,,,
1,0.01112609,LogReg_50,,,,,,,
2,-0.009261175,LogReg_50,,,,,,,
3,0.01015276,LogReg_50,,,,,,,
4,-2.064688e-06,LogReg_50,,,,,,,
5,-0.01162121,LogReg_50,,,,,,,
6,0.01010091,LogReg_50,,,,,,,
7,0.002584499,LogReg_50,,,,,,,
8,0.002496618,LogReg_50,,,,,,,
9,0.008145971,LogReg_50,,,,,,,


In [None]:
# with open('mycsvfile.csv', 'w') as f:
#     w = csv.DictWriter(f, result_dataframe_dict.keys())
#     w.writeheader()
#     w.writerow(result_dataframe_dict)

In [None]:
# Write the combined weight table to CSV in the working directory (index included).
weight_dataframe.to_csv('weight_dataframe.csv')

In [None]:
# Write the combined metrics table to CSV in the working directory (index included).
result_dataframe.to_csv('result_dataframe.csv')

In [None]:
#