# Model Fitting  
---

In [1]:
# -- import packages --
import pandas as pd
import numpy as np 
import time
from matplotlib import pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn import preprocessing
from sklearn.metrics import classification_report, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold,  cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from numpy import mean,std
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Data  
---

In [2]:
# sucrose infusions csv dataframe, locally set path
path="C:\\Users\\19802\\Documents\\nibl\\pilot_connectivity\\mouse_data\\Sucrose_infusions_full_df.csv" 
data=pd.read_csv(path) # read in as dataframe

In [3]:
# -- print data info -- 
print("\n[INFO] data columns: \n\n%s \
    \n\n[INFO] reward: %s \
      \n\n[INFO] mouse IDs: %s \n \
      \n\n[INFO] treatment: %s \n \
      \n\n "%(data.columns.values,
                data["Reward"].unique(), 
                data['MouseId'].unique(),
                data['Treatment'].unique()))



[INFO] data columns: 

['Time(s)' 'PFC_delta' 'PFC_theta' 'PFC_alpha' 'PFC_beta' 'PFC_low_gamma'
 'PFC_high_gamma' 'BLA_delta' 'BLA_theta' 'BLA_alpha' 'BLA_beta'
 'BLA_low_gamma' 'BLA_high_gamma' 'NAc_delta' 'NAc_theta' 'NAc_alpha'
 'NAc_beta' 'NAc_low_gamma' 'NAc_high_gamma' 'vHip_delta' 'vHip_theta'
 'vHip_alpha' 'vHip_beta' 'vHip_low_gamma' 'vHip_high_gamma' 'Reward'
 'MouseId' 'Treatment']     

[INFO] reward: ['None' 'Water' 'Sucrose_5' 'Sucrose_15']       

[INFO] mouse IDs: ['E_A1' 'E_A3' 'E_A7' 'E_A8'] 
       

[INFO] treatment: ['Post_HFD' 'Post_removal' 'Pre_HFD'] 
       

 


## Prepare Data  
--- 

**Dummy variables**  
We use pandas package `get_dummies()` , passing the dataframe and the columns to convert: "MouseId" and "Treatment".   


In [4]:
data['MouseId'].unique() # View unique mouse ID labels in columns

array(['E_A1', 'E_A3', 'E_A7', 'E_A8'], dtype=object)

In [5]:
# -- label encoder on our data -- 
#le = preprocessing.LabelEncoder() # initialize encoder obj
#data['MouseId'] = le.fit_transform(data['MouseId']) # fit and transform the 5% data


In [6]:
data=pd.get_dummies(data, columns=["MouseId"]) # convert to dummy/indicator variables

In [8]:
data.filter(like="MouseId", axis=1).head() # view the new dummy/indicator variables

Unnamed: 0,MouseId_E_A1,MouseId_E_A3,MouseId_E_A7,MouseId_E_A8
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [9]:
data['Treatment'].unique()  # View unique treatment labels in columns

array(['Post_HFD', 'Post_removal', 'Pre_HFD'], dtype=object)

In [10]:
# -- label encoder on our data -- 
#le = preprocessing.LabelEncoder() # initialize encoder obj
#data['Treatment'] = le.fit_transform(data['Treatment']) # fit and transform the 5% data


In [11]:
data['Treatment'].unique()

array(['Post_HFD', 'Post_removal', 'Pre_HFD'], dtype=object)

In [12]:
data=pd.get_dummies(data, columns=["Treatment"]) # convert to dummy variables

In [13]:
data.filter(like="Treatment", axis=1).head() # view the new dummy/indicator variables

Unnamed: 0,Treatment_Post_HFD,Treatment_Post_removal,Treatment_Pre_HFD
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [14]:
data.columns.values # view new columns 

array(['Time(s)', 'PFC_delta', 'PFC_theta', 'PFC_alpha', 'PFC_beta',
       'PFC_low_gamma', 'PFC_high_gamma', 'BLA_delta', 'BLA_theta',
       'BLA_alpha', 'BLA_beta', 'BLA_low_gamma', 'BLA_high_gamma',
       'NAc_delta', 'NAc_theta', 'NAc_alpha', 'NAc_beta', 'NAc_low_gamma',
       'NAc_high_gamma', 'vHip_delta', 'vHip_theta', 'vHip_alpha',
       'vHip_beta', 'vHip_low_gamma', 'vHip_high_gamma', 'Reward',
       'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
       'Treatment_Post_HFD', 'Treatment_Post_removal',
       'Treatment_Pre_HFD'], dtype=object)

In [15]:
print("\n[INFO] new dummy data: \n\n")
display(data.head(3))


[INFO] new dummy data: 




Unnamed: 0,Time(s),PFC_delta,PFC_theta,PFC_alpha,PFC_beta,PFC_low_gamma,PFC_high_gamma,BLA_delta,BLA_theta,BLA_alpha,BLA_beta,BLA_low_gamma,BLA_high_gamma,NAc_delta,NAc_theta,NAc_alpha,NAc_beta,NAc_low_gamma,NAc_high_gamma,vHip_delta,vHip_theta,vHip_alpha,vHip_beta,vHip_low_gamma,vHip_high_gamma,Reward,MouseId_E_A1,MouseId_E_A3,MouseId_E_A7,MouseId_E_A8,Treatment_Post_HFD,Treatment_Post_removal,Treatment_Pre_HFD
0,0.0,2e-06,7e-06,2.757425e-07,2e-06,1e-06,2.524725e-07,2e-06,6e-06,5e-06,5e-06,7e-06,4.023675e-07,0.000107,2.5e-05,9e-06,1.1e-05,8e-06,1e-06,6.8e-05,9.4e-05,1.4e-05,1.9e-05,8e-06,7.75185e-07,,1,0,0,0,1,0,0
1,0.1,2e-06,7e-06,2.334475e-07,2e-06,1e-06,2.507175e-07,1e-06,6e-06,4e-06,5e-06,7e-06,4.345025e-07,0.000102,2.1e-05,9e-06,1.1e-05,7e-06,1e-06,4.6e-05,9.3e-05,1.8e-05,2.1e-05,9e-06,9.978e-07,,1,0,0,0,1,0,0
2,0.2,3e-06,7e-06,2.912275e-07,1e-06,1e-06,2.59645e-07,2e-06,6e-06,4e-06,5e-06,7e-06,4.725375e-07,0.0001,2.3e-05,1e-05,1.2e-05,8e-06,1e-06,1.8e-05,0.000104,3.1e-05,3.6e-05,1.4e-05,2.241688e-06,,1,0,0,0,1,0,0


---

### Setup target variables Y    
  
Currently we will set up two target conditions:    
- Water vs. Sucrose 5%, y_s5  
- Water vs. Sucrose 15%, y_s15

In [16]:
# -- Setup the target variables  --

Y = data.loc[:, "Reward"] # set copy of target variable to Y 

# make condition masks for Water vs. Sucrose %5 and Water vs. Sucorse vs 15%
s5_cond_mask=Y.isin(['Water', "Sucrose_5"])
s15_cond_mask=Y.isin(['Water', "Sucrose_15"])

# filter data for target variabes w/ condition masks
y_s5=Y[s5_cond_mask]
y_s15=Y[s15_cond_mask]


# prepare feature dataframe with filtered masks
s5_data = data[s5_cond_mask]
s15_data = data[s15_cond_mask]

y_s5.unique(), y_s15.unique()


(array(['Water', 'Sucrose_5'], dtype=object),
 array(['Water', 'Sucrose_15'], dtype=object))

In [21]:
y_s5.head(), y_s15.head()

(3727    Water
 3728    Water
 3729    Water
 3730    Water
 3731    Water
 Name: Reward, dtype: object,
 3727    Water
 3728    Water
 3729    Water
 3730    Water
 3731    Water
 Name: Reward, dtype: object)

In [22]:
# -- label encoder on our data -- 
le = preprocessing.LabelEncoder() # initialize encoder obj
y_s5_enc = le.fit_transform(y_s5) # fit and transform the 5% data
y_s15_enc = le.fit_transform(y_s15) # fit and transform the 15% data

y_s5_enc.shape, y_s15_enc.shape 

((12291,), (12342,))

In [23]:
y_s5_enc[:5], y_s15_enc[:5]

(array([1, 1, 1, 1, 1]), array([1, 1, 1, 1, 1]))

---

## Setup Model

In [None]:

# Initializing Classifiers
clf1 = LogisticRegression(solver='liblinear', random_state=1)
clf2 = KNeighborsClassifier()
clf3 = DecisionTreeClassifier(random_state=1)
clf4 = SVC(kernel='rbf', random_state=1)
clf5 = SVC(kernel='linear', random_state=1)

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('clf1', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('clf2', clf2)])

pipe4 = Pipeline([('std', StandardScaler()),
                  ('clf4', clf4)])

pipe5 = Pipeline([('std', StandardScaler()),
                  ('clf5', clf5)])

# Setting up the parameter grids
param_grid1 = [{'clf1__penalty': ['l1', 'l2'],
                'clf1__C': np.power(10., np.arange(-4, 4))}]

param_grid2 = [{'clf2__n_neighbors': list(range(1, 10)),
                'clf2__p': [1, 2]}]

param_grid3 = [{'max_depth': list(range(1, 10)) + [None],
                'criterion': ['gini', 'entropy']}]

param_grid4 = [{'clf4__C': np.power(10., np.arange(-4, 4)),
                'clf4__gamma': np.power(10., np.arange(-5, 0))}]

param_grid5 = [{'clf5__C': np.power(10., np.arange(-4, 4))}]

# Setting up multiple GridSearchCV objects as inner CV, 1 for each algorithm
gridcvs = {}
inner_cv = KFold(n_splits=10)

for pgrid, est, name in zip((param_grid1, param_grid2,
                             param_grid3, param_grid4, param_grid5),
                            (pipe1, pipe2, pipe3, pipe4, pipe5),
                            ('Logit', 'KNN', 'DTree', 'SVMRBF', 'SVMLINEAR')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=1,
                       cv=inner_cv,
                       verbose=0,
                       refit=True)
    gridcvs[name] = gcv
    

# Making an outer CV
outer_cv = KFold(n_splits=10)



----

### Prepare Input Features X  
We are analyzing the data by region and bands.  
Currently we focus here on `low gamma` and `theta`

In [34]:
data.filter(like="low_gamma", axis=1).head()

Unnamed: 0,PFC_low_gamma,BLA_low_gamma,NAc_low_gamma,vHip_low_gamma
0,1e-06,7e-06,8e-06,8e-06
1,1e-06,7e-06,7e-06,9e-06
2,1e-06,7e-06,8e-06,1.4e-05
3,1e-06,7e-06,8e-06,8e-06
4,2e-06,7e-06,9e-06,9e-06


In [None]:
# initialize scaler object
scaler = preprocessing.StandardScaler()

Low Gamma 
> NAc

In [34]:
# -- Nac -- 
s5_lgnac_data=s5_data[["NAc_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                       'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']]# 'MouseId','Treatment']] 
s15_lgnac_data=s15_data[["NAc_low_gamma",'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                       'Trzeatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 

X_s5_lgnac = scaler.fit_transform(s5_lgnac_data) # fit and transform 
X_s15_lgnac = scaler.fit_transform(s15_lgnac_data) # fit and transform


# Making train set for Nested CV and test set for final model evaluation
X_train_s5_lgpfc, X_test_s5_lgpfc, y_train_s5_lgpfc, y_test_s15_lgpfc = train_test_split(X_s5_lgpfc, y_s5_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1, shuffle=True)
                                                                                         
                                                    #stratify= y_s15_enc)
X_train_s15_lgpfc, X_test_s15_lgpfc, y_train_s15_lgpfc, y_test_s15_lgpfc = train_test_split(X_s15_lgpfc, y_s15_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1,
                                                    shuffle=True)

In [None]:
for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est, 
                                   X=X_train_s15_lgpfc, 
                                   y=y_train_s15_lgpfc, 
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' % 
          (name, nested_score.mean() * 100, nested_score.std() * 100))

In [None]:

# Fitting a model to the whole training set using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train_s5, y_train_s5)

train_acc = accuracy_score(y_true=y_train_s5, y_pred=best_algo.predict(X_train_s5))
test_acc = accuracy_score(y_true=y_test_s5, y_pred=best_algo.predict(X_test_s5))

print('Accuracy %.2f%% (average over CV train folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Low Gamma  
> All Regions: vHip, BLA, PFC, NAc

---

In [26]:

# -- setup low gamma --



# -- vHip -- 

s5_lghip_data=s5_data[["vHip_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                       'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 
s15_lghip_data=s15_data[["vHip_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                       'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 


X_s5_lghip = scaler.fit_transform(s5_lghip_data) # fit and transform 
X_s15_lghip = scaler.fit_transform(s15_lghip_data) # fit and transform 


# Making train set for Nested CV and test set for final model evaluation
X_train_s15_lgpfc, X_test_s15_lgpfc, y_train_s15_lgpfc, y_test_s15_lgpfc = train_test_split(X_s15_lgpfc, y_s15_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify= y_s15_enc)

# -- BLA -- 
s5_lgbla_data=s5_data[["BLA_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                       'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 
s15_lgbla_data=s15_data[["BLA_low_gamma",'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8', 
                         'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']]  

X_s5_lgbla = scaler.fit_transform(s5_lgbla_data) # fit and transform 
X_s15_lgbla = scaler.fit_transform(s15_lgbla_data) # fit and transform 


# Making train set for Nested CV and test set for final model evaluation
X_train_s5, X_test_s5, y_train_s5, y_test_s5 = train_test_split(X_s5_lgbla, y_s5_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify= y_s5_enc)

# Making train set for Nested CV and test set for final model evaluation
X_train_s15, X_test_s15, y_train_s15, y_test_s15 = train_test_split(X_s15_lgbla, y_s15_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify= y_s15_enc)

# -- PFC  -- 
s5_lgpfc_data=s5_data[["PFC_low_gamma",  'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                   'Treatment_Post_HFD', 'Treatment_Post_removal','Treatment_Pre_HFD']] #'MouseId','Treatment']] 
s15_lgpfc_data=s15_data[["PFC_low_gamma",'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                   'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 

X_s5_lgpfc = scaler.fit_transform(s5_lgpfc_data) # fit and transform 
X_s15_lgpfc = scaler.fit_transform(s15_lgpfc_data) # fit and transform


# Making train set for Nested CV and test set for final model evaluation
X_train_s15_lgpfc, X_test_s15_lgpfc, y_train_s15_lgpfc, y_test_s15_lgpfc = train_test_split(X_s15_lgpfc, y_s15_enc,
                                                    train_size=0.8, 
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify= y_s15_enc)


# -- ALL REGIONS -- 
s5_lgall_data=s5_data[["PFC_low_gamma", "NAc_low_gamma","vHip_low_gamma", "BLA_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                   'Treatment_Post_HFD', 'Treatment_Post_removal','Treatment_Pre_HFD']] #'MouseId','Treatment']] 
s15_lgall_data=s15_data[["PFC_low_gamma", "NAc_low_gamma","vHip_low_gamma", "BLA_low_gamma", 'MouseId_E_A1', 'MouseId_E_A3', 'MouseId_E_A7', 'MouseId_E_A8',
                   'Treatment_Post_HFD', 'Treatment_Post_removal', 'Treatment_Pre_HFD']] #'MouseId','Treatment']] 

X_s5_lgall = scaler.fit_transform(s5_lgall_data) # fit and transform 
X_s15_lgall = scaler.fit_transform(s15_lgall_data) # fit and transform 

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

---

> Low Gamma PFC, Water vs. 15%

In [None]:

for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est, 
                                   X=X_train_s15_lgpfc, 
                                   y=y_train_s15_lgpfc, 
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' % 
          (name, nested_score.mean() * 100, nested_score.std() * 100))

In [None]:

# Fitting a model to the whole training set using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train_s5, y_train_s5)

train_acc = accuracy_score(y_true=y_train_s5, y_pred=best_algo.predict(X_train_s5))
test_acc = accuracy_score(y_true=y_test_s5, y_pred=best_algo.predict(X_test_s5))

print('Accuracy %.2f%% (average over CV train folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

---

In [45]:

# Making an outer CV
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est, 
                                   X=X_train_s5, 
                                   y=y_train_s5, 
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' % 
          (name, nested_score.mean() * 100, nested_score.std() * 100))

DTree | outer ACC 58.40% +/- 0.55
KNN | outer ACC 58.52% +/- 0.58
Logit | outer ACC 51.17% +/- 0.48
SVM | outer ACC 58.63% +/- 1.07


In [51]:

# Fitting a model to the whole training set using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train_s5, y_train_s5)

train_acc = accuracy_score(y_true=y_train_s5, y_pred=best_algo.predict(X_train_s5))
test_acc = accuracy_score(y_true=y_test_s5, y_pred=best_algo.predict(X_test_s5))

print('Accuracy %.2f%% (average over CV train folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 58.08% (average over CV train folds)
Best Parameters: {'clf4__C': 1000.0, 'clf4__gamma': 0.1}
Training Accuracy: 60.32%
Test Accuracy: 58.24%


---

> Low Gamma NAc Water vs. 15%

In [53]:

# Making an outer CV
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

for name, gs_est in sorted(gridcvs.items()):
    nested_score = cross_val_score(gs_est, 
                                   X=X_train_s15, 
                                   y=y_train_s15, 
                                   cv=outer_cv,
                                   n_jobs=1)
    print('%s | outer ACC %.2f%% +/- %.2f' % 
          (name, nested_score.mean() * 100, nested_score.std() * 100))

DTree | outer ACC 56.57% +/- 0.76
KNN | outer ACC 55.93% +/- 0.70
Logit | outer ACC 51.43% +/- 0.87
SVM | outer ACC 56.50% +/- 0.70


In [54]:

# Fitting a model to the whole training set using the "best" algorithm
best_algo = gridcvs['SVM']

best_algo.fit(X_train_s15, y_train_s15)


GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('std', StandardScaler()),
                                       ('clf4', SVC(random_state=1))]),
             n_jobs=1,
             param_grid=[{'clf4__C': array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                          'clf4__gamma': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01])}],
             scoring='accuracy')

In [55]:
train_acc = accuracy_score(y_true=y_train_s15, y_pred=best_algo.predict(X_train_s15))
test_acc = accuracy_score(y_true=y_test_s15, y_pred=best_algo.predict(X_test_s15))

print('Accuracy %.2f%% (average over CV train folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['SVM'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 56.86% (average over CV train folds)
Best Parameters: {'clf4__C': 1000.0, 'clf4__gamma': 0.1}
Training Accuracy: 57.88%
Test Accuracy: 57.63%


---

In [36]:
data.filter(like="theta", axis=1).head()

Unnamed: 0,PFC_theta,BLA_theta,NAc_theta,vHip_theta
0,7e-06,6e-06,2.5e-05,9.4e-05
1,7e-06,6e-06,2.1e-05,9.3e-05
2,7e-06,6e-06,2.3e-05,0.000104
3,7e-06,6e-06,2.5e-05,0.000109
4,8e-06,7e-06,4.6e-05,0.000117


## Model Setup

In [28]:
from sklearn.metrics import classification_report, accuracy_score, make_scorer

def classification_report_with_accuracy_score(y_true, y_pred):

    print(classification_report(y_true, y_pred)) # print classification report
    return accuracy_score(y_true, y_pred); # return accuracy score


# configure the cross-validation procedure
cv_inner = KFold(n_splits=10, shuffle=True, random_state=1)
cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)

# define the classifiers
svc_linear = SVC(kernel='linear', max_iter=100000)
svc_rbf = SVC(kernel='rbf', max_iter=10000)
svc_poly = SVC(kernel="poly", max_iter=100000)

# define the hyperparameter grid
C_range = np.logspace(-3, 2, 6)
gamma_range = np.logspace(-3, 2, 6)
poly_range = [3, 5, 10, 15]

param_grid_linear = dict(C=C_range)
param_grid_rbf = dict(gamma=gamma_range, C=C_range)
param_grid_poly = dict(gamma=gamma_range, degree=poly_range )

# set grid object
grid_linear = GridSearchCV(svc_linear, param_grid_linear, scoring="accuracy", verbose=1, n_jobs=3, cv=cv_inner)
grid_rbf = GridSearchCV(svc_rbf,param_grid_rbf , scoring='accuracy', n_jobs=3, cv=cv_inner)
grid_poly = GridSearchCV(svc_poly, param_grid_poly, scoring="accuracy" , n_jobs=3, cv=cv_inner)

grid_linear, grid_rbf, grid_poly

(GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
              estimator=SVC(kernel='linear', max_iter=100000), n_jobs=3,
              param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
              scoring='accuracy', verbose=1),
 GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
              estimator=SVC(max_iter=10000), n_jobs=3,
              param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
              scoring='accuracy'),
 GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
              estimator=SVC(kernel='poly', max_iter=100000), n_jobs=3,
              param_grid={'degree': [3, 5, 10, 15],
                          'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
              scoring='accuracy'))

In [32]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score


In [33]:
# split the dataset in two equal part respecting label proportions
train_s5, test_s5 = iter(StratifiedKFold(y_s5_enc, 2)).next()



ValueError: The number of folds must be of Integral type. [1 1 1 ... 0 0 0] of type <class 'numpy.ndarray'> was passed.

In [None]:
tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_range ,
                     'C': C_range},
                    {'kernel': ['linear'], 'C': C_range} ]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, n_jobs=2,
                       score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_points_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality

## Run Models

### Low Gamma 

In [63]:
# execute the nested cross-validation
start_time = time.time()
scores_s5_lgnac_linear = cross_val_score(grid_linear, X_s5_lgnac, y_s5_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("--- linear kernel, 5%: %s seconds ---" % (time.time() - start_time))

# execute the nested cross-validation
start_time = time.time()
scores_s15_lgnac_linear = cross_val_score(grid_linear, X_s15_lgnac, y_s15_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("---  linear kernel, 15%: %s seconds ---" % (time.time() - start_time))

print('[NAc] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s5_lgnac_linear), std(scores_s5_lgnac_linear)))
print('[NAc] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s15_lgnac_linear), std(scores_s15_lgnac_linear)))


--- 161.87079310417175 seconds ---


In [18]:
# execute the nested cross-validation
start_time = time.time()
scores_s5_lgnac_rbf = cross_val_score(grid_rbf, X_s5_lgnac, y_s5_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("---  rbf kernel, %s seconds ---" % (time.time() - start_time))

# execute the nested cross-validation
start_time = time.time()
scores_s15_lgnac_rbf = cross_val_score(grid_rbf, X_s15_lgnac, y_s15_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("---  rbf kernel, %s seconds ---" % (time.time() - start_time))

print('[NAc] SVM w/ RBF kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s5_lgnac_rbf), std(scores_s5_lgnac_rbf)))
print('[NAc] SVM w/ RBF kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s15_lgnac_rbf), std(scores_s15_lgnac_rbf)))


---  rbf kernel, 921.988338470459 seconds ---
---  rbf kernel, 971.9893679618835 seconds ---
[NAc] SVM w/ RBF kernel, Accuracy: 0.592 (0.006)
[NAc] SVM w/ RBF kernel, Accuracy: 0.566 (0.010)


In [19]:
print(scores_s5_lgnac_rbf)

[0.59739732 0.60170871 0.58787632 0.58543531 0.58706265]


In [20]:
print(scores_s15_lgnac_rbf)

[0.57148643 0.56824625 0.55388979 0.57982172 0.55551053]


---

---

### Decision Tree 

In [24]:
tree = DecisionTreeClassifier()

In [25]:
tree.fit(X_s15_lgnac, y_s15_enc)

DecisionTreeClassifier()

In [26]:
tree.predict(X_s15_lgnac)

array([1, 1, 1, ..., 0, 0, 0])

In [27]:
sum(abs(tree.predict(X_s15_lgnac) - y_s15_enc))

2

In [28]:
tree.feature_importances_

array([0.92502588, 0.01745531, 0.05751881])

In [29]:
tree.decision_path(X_s15_lgnac)

<12342x11589 sparse matrix of type '<class 'numpy.int64'>'
	with 298776 stored elements in Compressed Sparse Row format>

In [30]:
tree.predict_proba(X_s15_lgnac)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [31]:
tree.apply(X_s15_lgnac)

array([ 6921,  7069,  5827, ..., 10313, 10777, 10817], dtype=int64)

In [32]:
tree.get_depth(), tree.get_n_leaves()

(62, 5795)

In [86]:
tree.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [92]:
data.columns.values

array(['Time(s)', 'PFC_delta', 'PFC_theta', 'PFC_alpha', 'PFC_beta',
       'PFC_low_gamma', 'PFC_high_gamma', 'BLA_delta', 'BLA_theta',
       'BLA_alpha', 'BLA_beta', 'BLA_low_gamma', 'BLA_high_gamma',
       'NAc_delta', 'NAc_theta', 'NAc_alpha', 'NAc_beta', 'NAc_low_gamma',
       'NAc_high_gamma', 'vHip_delta', 'vHip_theta', 'vHip_alpha',
       'vHip_beta', 'vHip_low_gamma', 'vHip_high_gamma', 'Reward',
       'MouseId', 'Treatment'], dtype=object)

In [35]:
# execute the nested cross-validation
start_time = time.time()
scores_s5_lghip_linear = cross_val_score(grid_linear, X_s5_lghip, y_s5_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("--- %s seconds ---" % (time.time() - start_time))
# execute the nested cross-validation
start_time = time.time()
scores_s15_lghip_linear = cross_val_score(grid_linear, X_s15_lghip, y_s15_enc, scoring='accuracy', cv=cv_outer, n_jobs=3)
print("--- %s seconds ---" % (time.time() - start_time))


--- 179.26610684394836 seconds ---
--- 158.3305902481079 seconds ---


In [37]:

# Low gamma
print("\n\n[INFO] Results for, LOW GAMMA ")

print("\n[INFO] Water vs. Sucrose 5%: ")
print('[NAc] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s5_lgnac_linear), std(scores_s5_lgnac_linear)))
print('[NAc] SVM w/ RBF kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s5_lgnac_rbf), std(scores_s5_lgnac_rbf)))
print('[vHIP] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s5_lghip_linear), std(scores_s5_lghip_linear)))

print("\n[INFO] Water vs. Sucrose 15%: ")
print('[NAc] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s15_lgnac_linear), std(scores_s15_lgnac_linear)))
print('[vHIP] SVM w/ Linear kernel, Accuracy: %.3f (%.3f)' % (mean(scores_s15_lghip_linear), std(scores_s15_lghip_linear)))




[INFO] Results for, LOW GAMMA 

[INFO] Water vs. Sucrose 5%: 
[NAc] SVM w/ Linear kernel, Accuracy: 0.539 (0.046)
[NAc] SVM w/ RBF kernel, Accuracy: 1.000 (0.000)
[vHIP] SVM w/ Linear kernel, Accuracy: 0.564 (0.054)

[INFO] Water vs. Sucrose 15%: 
[NAc] SVM w/ Linear kernel, Accuracy: 0.658 (0.014)
[vHIP] SVM w/ Linear kernel, Accuracy: 0.656 (0.008)


---