### Todo

#### This cell defines the various OPTIONS used in this notebook (working directory, how many rows and columns pandas displays for a dataframe, etc.).

#### Preferably this cell is also where we do important imports (for example pandas and numpy)

In [20]:
import os

# Folder where the csv file of the merged data is to be placed. In this setup it is the
# same folder that holds joined_data.csv, so it is reused as the working directory below.
output_filepath = 'C:/Users/Briggstone/Documents/Master 2020/Processed data'
#output_filepath = 'C:/Users/MyPC/Documents/Andrijana/UiS/DATMAS Master oppgave/Processed data'

# Work from the directory where joined_data.csv is located.
# Alternative locations used on other machines:
#os.chdir('C:/Users/Trond/Documents/Master 2020/Processed data')
#os.chdir('C:/Users/MyPC/Documents/Andrijana/UiS/DATMAS Master oppgave/Processed data')
os.chdir(output_filepath)

import pandas as pd 
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import svm

# Show up to 1000 rows when a DataFrame is displayed.
# The full option name is spelled out: abbreviated names are resolved by pattern
# matching and can become ambiguous if pandas adds similarly named options.
pd.set_option('display.max_rows', 1000)

# Show up to 50 columns when a DataFrame is displayed (a column count, not a width).
pd.set_option('display.max_columns', 50)


#### In this cell we import our training data, convert HALL into HALL_EVER and select only BL observations.

In [21]:
train = pd.read_csv('train.csv')

# Per-patient flag: 1 if the patient has HALL == 1 in at least one row, 0 otherwise.
# One groupby replaces a full-frame scan per patient (and no longer rebinds the builtin `id`).
# sort=False keeps patients in order of first appearance, i.e. the order of train.PATNO.unique().
# Assumes PATNO has no missing values (groupby skips NaN keys) -- TODO confirm.
temptrain = (
    train["HALL"].eq(1)
    .groupby(train["PATNO"], sort=False)
    .any()
    .astype("int64")
    .rename("HALL_EVER")
    .reset_index()
)

# Kept as a plain list of 0/1 values, one per patient, for any cell that still refers to it.
HALL_EVER = temptrain["HALL_EVER"].tolist()

# Replace the per-row HALL column by the per-patient HALL_EVER flag.
train = train.drop(columns="HALL").merge(temptrain, how="inner", on="PATNO")

# Selecting only Baseline observations; EVENT_ID is constant afterwards, so it is dropped.
train = train.loc[train.EVENT_ID == "BL"].drop(columns="EVENT_ID")

# NOTE(review): PATNO is not dropped here, so the patient identifier stays in `train`
# and is passed to the models as a feature -- confirm this is intended.

# We form Y
Y = train.pop("HALL_EVER")

In [22]:
# Dimensions (rows, columns) of the baseline feature frame after HALL_EVER was popped into Y
train.shape

(293, 22)

#### In this cell we apply a random forest and boosted trees from XGBoost, each evaluated with 10-fold cross-validation

In [23]:
# Wrap the baseline features and the HALL_EVER labels in an XGBoost DMatrix
dtrain = xgb.DMatrix(train, label=Y)

# NOTE(review): xgb.cv is called without `stratified=True`, whereas cross_val_score with an
# integer cv presumably uses stratified folds for classifiers -- confirm whether the folds
# of the XGBoost and SVM cells are meant to be comparable.
# NOTE(review): 'gpu_hist' presumably requires a GPU-enabled XGBoost build -- confirm.

# Random Forest: a single boosting round that grows 100 parallel trees, each on an 80 %
# row subsample with 80 % of the columns sampled at every node.
params = dict(
    colsample_bynode=0.8,
    learning_rate=1,
    max_depth=5,
    num_parallel_tree=100,
    objective='binary:logistic',
    subsample=0.8,
    tree_method='gpu_hist',
)
cv_results_RF = xgb.cv(
    params,
    dtrain=dtrain,
    num_boost_round=1,
    nfold=10,
    as_pandas=True,
    seed=1,
    metrics=["error", "auc"],
)
print(cv_results_RF)

# Boosted trees: 10 boosting rounds; only the objective and tree method are set explicitly.
params = dict(
    objective='binary:logistic',
    tree_method='gpu_hist',
)
cv_results_BT = xgb.cv(
    params,
    dtrain=dtrain,
    num_boost_round=10,
    nfold=10,
    as_pandas=True,
    seed=1,
    metrics=["error", "auc"],
)
print(cv_results_BT)


   train-error-mean  train-error-std  train-auc-mean  train-auc-std  \
0          0.152051         0.017769        0.946971       0.011634   

   test-error-mean  test-error-std  test-auc-mean  test-auc-std  
0         0.262529         0.08896       0.615074      0.163178  
   train-error-mean  train-error-std  train-auc-mean  train-auc-std  \
0          0.153959         0.012317        0.861563       0.030197   
1          0.122865         0.012582        0.927259       0.022643   
2          0.105042         0.012446        0.961868       0.010231   
3          0.086846         0.014462        0.979356       0.007951   
4          0.068273         0.012770        0.990426       0.004074   
5          0.053104         0.015115        0.994661       0.002900   
6          0.040589         0.012393        0.998135       0.001181   
7          0.030343         0.007793        0.999218       0.000688   
8          0.021623         0.007803        0.999648       0.000392   
9          0.01

#### In this cell we apply SVM from scikit-learn

In [24]:
# RBF-kernel support vector classifier, scored by ROC AUC with cv=10.
# NOTE(review): no feature-scaling step is visible in this notebook and RBF kernels are
# sensitive to feature scale -- confirm that train.csv is already standardised.
clf1 = svm.SVC(kernel='rbf', gamma='auto', random_state=0)
roc_auc1 = cross_val_score(estimator=clf1, X=train, y=Y, scoring='roc_auc', cv=10)
roc_auc1

array([0.59090909, 0.49431818, 0.40340909, 0.45238095, 0.49404762,
       0.46428571, 0.60714286, 0.45833333, 0.33333333, 0.29761905])

In [25]:
# Accuracy of the RBF-kernel classifier (clf1) on the same data, again with cv=10
accuracy1 = cross_val_score(estimator=clf1, X=train, y=Y, scoring='accuracy', cv=10)
accuracy1

array([0.73333333, 0.73333333, 0.63333333, 0.68965517, 0.72413793,
       0.68965517, 0.72413793, 0.72413793, 0.68965517, 0.72413793])

In [26]:
# Linear-kernel support vector classifier, scored by ROC AUC with cv=10.
# NOTE(review): per the scikit-learn docs gamma applies to the 'rbf', 'poly' and 'sigmoid'
# kernels, so gamma='auto' presumably has no effect here -- confirm.
clf2 = svm.SVC(kernel='linear', gamma='auto', random_state=0)
roc_auc2 = cross_val_score(estimator=clf2, X=train, y=Y, scoring='roc_auc', cv=10)
roc_auc2

array([0.65909091, 0.63068182, 0.57386364, 0.5       , 0.57738095,
       0.70238095, 0.72619048, 0.57738095, 0.76190476, 0.66666667])

In [27]:
# Accuracy of the linear-kernel classifier (clf2) on the same data, again with cv=10
accuracy2 = cross_val_score(estimator=clf2, X=train, y=Y, scoring='accuracy', cv=10)
accuracy2

array([0.66666667, 0.73333333, 0.7       , 0.65517241, 0.75862069,
       0.75862069, 0.79310345, 0.68965517, 0.79310345, 0.75862069])