# Project M2 - Cell Type Classification with Morphology features

## Getting started with the Allen's data set

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("MorphFeatures.csv",index_col=0)
print(df.shape)
df.head(5) #show some lines on the data from the first record.

(670, 31)


Unnamed: 0,average_bifurcation_angle_local,average_bifurcation_angle_remote,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,hausdorff_dimension,id,max_branch_order,max_euclidean_distance,...,scale_factor_y,scale_factor_z,soma_surface,specimen_id,superseded,tags,total_length,total_surface,total_volume,dendrite_type
0,82.727781,,0.864267,0.345092,20.723077,0.96451,,491119743,6.0,99.779724,...,0.1144,0.28,435.74027,478107198,False,3D Neuron Reconstruction morphology,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,,0.90389,0.634047,105.277778,0.862183,,546781359,3.0,432.38311,...,0.1144,0.28,1446.587725,502367941,False,3D Neuron Reconstruction morphology,2277.259374,4543.139073,921.571895,spiny
2,77.536678,,0.863104,0.417929,73.666667,0.926633,,537042261,6.0,373.630444,...,0.1144,0.28,287.118123,515771244,False,3D Neuron Reconstruction morphology,3589.339062,4704.910407,582.285423,spiny
3,76.583222,,0.900537,0.400396,95.979167,0.942049,,689123605,11.0,943.382549,...,0.1144,0.28,180.994813,561435279,False,3D Neuron Reconstruction morphology,5416.228778,6814.93329,740.722806,spiny
4,72.01925,,0.873518,0.227626,47.535714,1.0,,657879305,5.0,186.218009,...,0.1144,0.28,55.055236,591268268,False,3D Neuron Reconstruction morphology,1659.465869,1185.773462,69.144146,aspiny


The cell type is determined by the dendrite type in the last column of the data set. Ignore the samples of minority type called "sparsely spiny".

In [2]:
df.dropna(axis=1,inplace=True) # Drop columns with Nan values
df = df.drop_duplicates(subset=['specimen_id']) # drop duplicated of specimen_id

# Get rid of sparsely spiny cells
df = df[df.dendrite_type!='sparsely spiny'] #keep all the data that 'dendrite_type' is not 'sparsely spiny'
print(df.shape)
print(df.columns)
df.head(5)

(619, 29)
Index(['average_bifurcation_angle_local', 'average_contraction',
       'average_diameter', 'average_fragmentation',
       'average_parent_daughter_ratio', 'id', 'max_branch_order',
       'max_euclidean_distance', 'max_path_distance',
       'neuron_reconstruction_type', 'number_bifurcations', 'number_branches',
       'number_nodes', 'number_stems', 'number_tips', 'overall_depth',
       'overall_height', 'overall_width', 'scale_factor_x', 'scale_factor_y',
       'scale_factor_z', 'soma_surface', 'specimen_id', 'superseded', 'tags',
       'total_length', 'total_surface', 'total_volume', 'dendrite_type'],
      dtype='object')


Unnamed: 0,average_bifurcation_angle_local,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,id,max_branch_order,max_euclidean_distance,max_path_distance,neuron_reconstruction_type,...,scale_factor_y,scale_factor_z,soma_surface,specimen_id,superseded,tags,total_length,total_surface,total_volume,dendrite_type
0,82.727781,0.864267,0.345092,20.723077,0.96451,491119743,6.0,99.779724,126.59379,dendrite-only,...,0.1144,0.28,435.74027,478107198,False,3D Neuron Reconstruction morphology,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,0.90389,0.634047,105.277778,0.862183,546781359,3.0,432.38311,496.831994,dendrite-only,...,0.1144,0.28,1446.587725,502367941,False,3D Neuron Reconstruction morphology,2277.259374,4543.139073,921.571895,spiny
2,77.536678,0.863104,0.417929,73.666667,0.926633,537042261,6.0,373.630444,436.958952,dendrite-only,...,0.1144,0.28,287.118123,515771244,False,3D Neuron Reconstruction morphology,3589.339062,4704.910407,582.285423,spiny
3,76.583222,0.900537,0.400396,95.979167,0.942049,689123605,11.0,943.382549,989.448317,full,...,0.1144,0.28,180.994813,561435279,False,3D Neuron Reconstruction morphology,5416.228778,6814.93329,740.722806,spiny
4,72.01925,0.873518,0.227626,47.535714,1.0,657879305,5.0,186.218009,221.639502,full,...,0.1144,0.28,55.055236,591268268,False,3D Neuron Reconstruction morphology,1659.465869,1185.773462,69.144146,aspiny


## Feature Engineering

In [3]:
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

df_full = df.drop(columns=['id','neuron_reconstruction_type','scale_factor_x', 'scale_factor_y','scale_factor_z','specimen_id','superseded','tags'])
print(df_full.shape)
df_full.head()

(619, 21)


Unnamed: 0,average_bifurcation_angle_local,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,max_branch_order,max_euclidean_distance,max_path_distance,number_bifurcations,number_branches,...,number_stems,number_tips,overall_depth,overall_height,overall_width,soma_surface,total_length,total_surface,total_volume,dendrite_type
0,82.727781,0.864267,0.345092,20.723077,0.96451,6.0,99.779724,126.59379,33,73,...,7,40,51.4886,140.506829,136.267522,435.74027,1666.082926,1803.875644,167.343086,aspiny
1,82.50668,0.90389,0.634047,105.277778,0.862183,3.0,432.38311,496.831994,9,23,...,5,14,92.6671,566.70122,370.170045,1446.587725,2277.259374,4543.139073,921.571895,spiny
2,77.536678,0.863104,0.417929,73.666667,0.926633,6.0,373.630444,436.958952,21,46,...,4,25,65.4696,425.897625,381.015114,287.118123,3589.339062,4704.910407,582.285423,spiny
3,76.583222,0.900537,0.400396,95.979167,0.942049,11.0,943.382549,989.448317,24,52,...,4,28,99.9139,1217.694976,524.550156,180.994813,5416.228778,6814.93329,740.722806,spiny
4,72.01925,0.873518,0.227626,47.535714,1.0,5.0,186.218009,221.639502,14,32,...,4,18,54.3718,172.075941,261.459057,55.055236,1659.465869,1185.773462,69.144146,aspiny


In [4]:
X = abs(df_full.iloc[:,:-1]) # Need to take absolute value for SelectKBest to work
y = df_full.iloc[:,-1]       # iloc Purely integer-location based indexing for selection by position.

In [5]:
bestfeatures = SelectKBest(score_func=chi2)
fit = bestfeatures.fit(X,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(df_full.columns)

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

print(featureScores.nlargest(20,'Score'))  #print 20 best features

                              Specs          Score
18                    total_surface  539939.690284
17                     total_length  264251.289979
10                     number_nodes  220597.718099
19                     total_volume  115805.409932
7                 max_path_distance   30418.233712
6            max_euclidean_distance   27774.577274
14                   overall_height   23066.856685
9                   number_branches    2410.349432
15                    overall_width    2214.133645
13                    overall_depth    1994.393610
12                      number_tips    1213.162125
8               number_bifurcations    1201.211153
16                     soma_surface     905.144986
5                  max_branch_order     556.265637
11                     number_stems      64.412472
3             average_fragmentation      55.236331
0   average_bifurcation_angle_local      16.120053
2                  average_diameter       0.834203
4     average_parent_daughter_r

In [6]:
pick_feats = list(featureScores.nlargest(20,'Score').Specs) # make a list of the ten best features
pick_feats.append('dendrite_type') # add dendrite_type to the list

df_small = df[pick_feats] # Make a new DataFrame with our selected features
df_small.head(5)

Unnamed: 0,total_surface,total_length,number_nodes,total_volume,max_path_distance,max_euclidean_distance,overall_height,number_branches,overall_width,overall_depth,...,number_bifurcations,soma_surface,max_branch_order,number_stems,average_fragmentation,average_bifurcation_angle_local,average_diameter,average_parent_daughter_ratio,average_contraction,dendrite_type
0,1803.875644,1666.082926,1470,167.343086,126.59379,99.779724,140.506829,73,136.267522,51.4886,...,33,435.74027,6.0,7,20.723077,82.727781,0.345092,0.96451,0.864267,aspiny
1,4543.139073,2277.259374,2011,921.571895,496.831994,432.38311,566.70122,23,370.170045,92.6671,...,9,1446.587725,3.0,5,105.277778,82.50668,0.634047,0.862183,0.90389,spiny
2,4704.910407,3589.339062,3137,582.285423,436.958952,373.630444,425.897625,46,381.015114,65.4696,...,21,287.118123,6.0,4,73.666667,77.536678,0.417929,0.926633,0.863104,spiny
3,6814.93329,5416.228778,4652,740.722806,989.448317,943.382549,1217.694976,52,524.550156,99.9139,...,24,180.994813,11.0,4,95.979167,76.583222,0.400396,0.942049,0.900537,spiny
4,1185.773462,1659.465869,1406,69.144146,221.639502,186.218009,172.075941,32,261.459057,54.3718,...,14,55.055236,5.0,4,47.535714,72.01925,0.227626,1.0,0.873518,aspiny


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, scale
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


Split the dataset into two groups for training and testing.

In [8]:
train = df_small.sample(frac=0.8,random_state=111) # random pick 80% for training
test = df_small.drop(train.index)
print(train.shape)
print(test.shape)

(495, 21)
(124, 21)


Train and test in different groups.

In [9]:
X = train.values[:,:-1] # put all columns from first to last but not including the last one in X
Y = train.values[:,-1]  # put last columns in y
x = test.values[:,:-1]  # put all columns from first to last but not including the last one in X
y = test.values[:,-1]   # put last columns in y

lb = LabelBinarizer()  # Binarize labels in a one-vs-all fashion
Y_b = lb.fit_transform(Y) # convert values in y into binary labels
y_b = lb.fit_transform(y)

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
training_y = encoder.transform(Y)
testing_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_training = np_utils.to_categorical(training_y)
dummy_testing = np_utils.to_categorical(testing_y)

log_reg = LogisticRegression(penalty="l2")
log_reg.fit(X,Y_b)

y_pred = log_reg.predict(x)  #Predict class labels for samples in X.
print("Model accuracy:", accuracy_score(y_b,y_pred))  # Calculate the accuracy comparing the predicted labels with the ground truth

Model accuracy: 0.8951612903225806


  y = column_or_1d(y, warn=True)


# Neural Network for Morphology

In [10]:
def baseline_model():
    model = Sequential()
    model.add(Dense(22, input_dim=20, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model 
X = scale(X)
model = baseline_model();
model.fit(X,dummy_training,epochs=100,verbose=1)
y_pred = model.predict_classes(x)
print("Model accuracy:", accuracy_score(y_b,y_pred)) 



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Model accuracy: 0.5161290322580645


In [11]:
df_ef = pd.read_csv("ElecPhyFeatures.csv",index_col=0)       # Read electrophysiology data
print(df.shape)
print(df_ef.shape)

df_cb = pd.merge(df,df_ef,how='inner',left_on='specimen_id', right_on='specimen_id',suffixes=('_mp','')) # Combine two data frames
print(df_cb.shape)

df_cb = df_cb.drop(columns='dendrite_type_mp') # dendrite_type was duplicated while merging two dataframes, drop out one of them.
print(df_cb.shape)
print(df_cb.columns)
df_cb.head(5)

(619, 29)
(2333, 57)
(619, 85)
(619, 84)
Index(['average_bifurcation_angle_local', 'average_contraction',
       'average_diameter', 'average_fragmentation',
       'average_parent_daughter_ratio', 'id_mp', 'max_branch_order',
       'max_euclidean_distance', 'max_path_distance',
       'neuron_reconstruction_type', 'number_bifurcations', 'number_branches',
       'number_nodes', 'number_stems', 'number_tips', 'overall_depth',
       'overall_height', 'overall_width', 'scale_factor_x', 'scale_factor_y',
       'scale_factor_z', 'soma_surface', 'specimen_id', 'superseded', 'tags',
       'total_length', 'total_surface', 'total_volume', 'adaptation',
       'avg_isi', 'electrode_0_pa', 'f_i_curve_slope',
       'fast_trough_t_long_square', 'fast_trough_t_ramp',
       'fast_trough_t_short_square', 'fast_trough_v_long_square',
       'fast_trough_v_ramp', 'fast_trough_v_short_square', 'has_burst',
       'has_delay', 'has_pause', 'id', 'input_resistance_mohm', 'latency',
       'peak_t_lo

Unnamed: 0,average_bifurcation_angle_local,average_contraction,average_diameter,average_fragmentation,average_parent_daughter_ratio,id_mp,max_branch_order,max_euclidean_distance,max_path_distance,neuron_reconstruction_type,...,trough_t_short_square,trough_v_long_square,trough_v_ramp,trough_v_short_square,upstroke_downstroke_ratio_long_square,upstroke_downstroke_ratio_ramp,upstroke_downstroke_ratio_short_square,vm_for_sag,vrest,dendrite_type
0,82.727781,0.864267,0.345092,20.723077,0.96451,491119743,6.0,99.779724,126.59379,dendrite-only,...,1.04764,-59.8125,-59.437504,-66.71875,2.625872,2.969025,1.909216,-77.718758,-63.952812,aspiny
1,82.50668,0.90389,0.634047,105.277778,0.862183,546781359,3.0,432.38311,496.831994,dendrite-only,...,1.114696,-58.125004,-57.864586,-66.443753,4.322536,4.272167,3.659369,-98.937508,-66.457214,spiny
2,77.536678,0.863104,0.417929,73.666667,0.926633,537042261,6.0,373.630444,436.958952,dendrite-only,...,1.236256,-56.0,-57.97917,-69.356253,5.076478,5.069481,3.992479,-85.3125,-68.378876,spiny
3,76.583222,0.900537,0.400396,95.979167,0.942049,689123605,11.0,943.382549,989.448317,full,...,1.09665,-55.03125,-55.989587,-63.726564,3.445772,3.264711,3.118708,-83.625,-62.545727,spiny
4,72.01925,0.873518,0.227626,47.535714,1.0,657879305,5.0,186.218009,221.639502,full,...,1.377193,-60.656254,-54.5625,-72.192708,1.255817,1.051984,1.371654,-85.09375,-71.524811,aspiny


## Combined Features Logistic

In [12]:
df_cb = df_cb.drop(columns='neuron_reconstruction_type')
df_cb = df_cb.drop(columns='tags')
df_cb = df_cb.drop(columns='superseded')
df_cb.dropna(inplace=True)
combined_X = df_cb.iloc[:,:-1] # Need to take absolute value for SelectKBest to work
combined_y = df_cb.iloc[:,-1]       # iloc Purely integer-location based indexing for selection by position.

combined_train = df_cb.sample(frac=0.8,random_state=111) # random pick 80% for training
combined_test = df_cb.drop(combined_train.index)

combined_X = combined_train.values[:,:-1] # put all columns from first to last but not including the last one in X
combined_Y = combined_train.values[:,-1]  # put last columns in y
combined_x = combined_test.values[:,:-1]  # put all columns from first to last but not including the last one in X
combined_y = combined_test.values[:,-1]   # put last columns in y

combined_lb = LabelBinarizer()  # Binarize labels in a one-vs-all fashion
combined_Y_b = combined_lb.fit_transform(combined_Y) # convert values in y into binary labels
combined_y_b = combined_lb.fit_transform(combined_y)

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(combined_y)
combined_training_y = encoder.transform(combined_Y)
combined_testing_y = encoder.transform(combined_y)
# convert integers to dummy variables (i.e. one hot encoded)
combined_dummy_training = np_utils.to_categorical(combined_training_y)
combined_dummy_testing = np_utils.to_categorical(combined_testing_y)

combined_log_reg = LogisticRegression(penalty="l2")
combined_log_reg.fit(combined_X,combined_Y_b)

combined_y_pred = combined_log_reg.predict(combined_x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(combined_y_b,combined_y_pred)) 

Model accuracy: 0.7368421052631579


  y = column_or_1d(y, warn=True)


## Combined Features Neural Network, For Accuracy measurement I use the built in Keras validation_split which splits the dataset according to the fraction you implement. In this case I use .2. It gets me an accuracy of 96.77%

In [13]:
warnings.filterwarnings('ignore')
def baseline_model():
    model = Sequential()
    model.add(Dense(83, input_dim=80, activation='relu'))
    model.add(Dense(83, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model 
combined_X = scale(combined_X)
model = baseline_model();
model.fit(combined_X,combined_dummy_training, validation_split = .2, epochs=200,verbose=1)
combined_y_pred = model.predict(combined_x)

Train on 122 samples, validate on 31 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200


Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200


Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200


Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


# Additional Methods (Ensemble, etc) 

In [14]:
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC

In [15]:
X = train.values[:,:-1] # put all columns from first to last but not including the last one in X
Y = train.values[:,-1]  # put last columns in y
x = test.values[:,:-1]  # put all columns from first to last but not including the last one in X
y = test.values[:,-1]   # put last columns in y

lb = LabelBinarizer()  # Binarize labels in a one-vs-all fashion
Y_b = lb.fit_transform(Y) # convert values in y into binary labels
y_b = lb.fit_transform(y)

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
training_y = encoder.transform(Y)
testing_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_training = np_utils.to_categorical(training_y)
dummy_testing = np_utils.to_categorical(testing_y)

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X,Y_b)

y_pred = rf.predict(x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(y_b,y_pred)) 

Model accuracy: 0.9112903225806451


In [16]:
et = ExtraTreesClassifier()
et.fit(X,Y_b)

y_pred = et.predict(x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(y_b,y_pred))

Model accuracy: 0.9112903225806451


In [37]:
ada = AdaBoostClassifier(n_estimators = 27)
ada.fit(X, Y_b)

y_pred = ada.predict(x)

print("Model accuracy:", accuracy_score(y_b,y_pred))

Model accuracy: 0.9354838709677419


In [18]:
knn = KNeighborsClassifier()
knn.fit(X,Y_b)

y_pred = knn.predict(x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(y_b,y_pred))

Model accuracy: 0.8145161290322581


In [19]:
svc = SVC()
svc.fit(X,Y_b)

y_pred = svc.predict(x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(y_b,y_pred))

Model accuracy: 0.5564516129032258


In [20]:
rg = RidgeClassifier()
rg.fit(X,Y_b)

y_pred = rg.predict(x)  #Predict class labels for samples in X.

print("Model accuracy:", accuracy_score(y_b,y_pred))

Model accuracy: 0.9112903225806451


# Extra Credit - Voting 

In [21]:
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
seed = 12345
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
rf = RandomForestClassifier(n_estimators=10)
rf.fit(X,Y_b)
y_pred1 = rf.predict(x)  #Predict class labels for samples in X.
print("Model accuracy:", accuracy_score(y_b,y_pred1)) 

et = ExtraTreesClassifier()
et.fit(X,Y_b)
y_pred2 = et.predict(x)  #Predict class labels for samples in X.
print("Model accuracy:", accuracy_score(y_b,y_pred2))

knn = KNeighborsClassifier()
knn.fit(X,Y_b)
y_pred3 = knn.predict(x)  #Predict class labels for samples in X.
print("Model accuracy:", accuracy_score(y_b,y_pred3))

clf_array = [rf, et, knn]

for clf in clf_array:
    vanilla_scores = cross_val_score(clf, X, Y_b, cv=kfold, n_jobs=-1)
    bagging_clf = BaggingClassifier(clf, max_samples=0.25, max_features=1.0, random_state=seed)
    # max_samples: The proportion of samples to draw from X to train each base estimator.
    # max_features: The proportion of features to draw from X to train each base estimator.
    bagging_scores = cross_val_score(bagging_clf, X, Y_b, cv=10, n_jobs=-1)
    
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]".format(clf.__class__.__name__, vanilla_scores.mean()*100, vanilla_scores.std()*100))
    print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n".format(clf.__class__.__name__, bagging_scores.mean()*100, bagging_scores.std()*100))
    
# Set up voting
eclf = VotingClassifier(estimators=[('Random Forests',rf), ('Extra Trees',et), ('KNeighbors',knn)],
                        voting='hard')

for clf, label in zip([rf, et, knn], ['Random Forest','Extra Trees','KNeighbors']):
    scores = cross_val_score(clf, X, Y_b, cv=10, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean()*100, scores.std()*100, label))

Model accuracy: 0.9112903225806451
Model accuracy: 0.8951612903225806
Model accuracy: 0.8145161290322581
Mean of: 90.101, std: (+/-) 2.157 [RandomForestClassifier]
Mean of: 87.664, std: (+/-) 3.084 [Bagging RandomForestClassifier]

Mean of: 88.283, std: (+/-) 3.037 [ExtraTreesClassifier]
Mean of: 88.685, std: (+/-) 2.067 [Bagging ExtraTreesClassifier]

Mean of: 79.596, std: (+/-) 1.616 [KNeighborsClassifier]
Mean of: 79.019, std: (+/-) 4.384 [Bagging KNeighborsClassifier]

Mean: 88.693, std: (+/-) 3.133 [Random Forest]
Mean: 90.926, std: (+/-) 4.194 [Extra Trees]
Mean: 80.203, std: (+/-) 4.910 [KNeighbors]


In [22]:
warnings.filterwarnings('ignore')
# Set up ensemble voting for bagging
ebclf_array = []

for clf in clf_array:
    ebclf_array.append(BaggingClassifier(clf, max_samples=0.25, max_features=1.0, random_state=seed))

v_eclf = VotingClassifier(estimators=list(zip(['Bagging Random Forest','Bagging Extra Trees','Bagging KNeighbors'],ebclf_array)),  voting='hard')
ebclf_array.append(v_eclf)

for clf, label in zip(ebclf_array, ['Bagging Random Forest', 'Bagging Extra Trees', 'Bagging KNeighbors']):
    scores = cross_val_score(clf, X, Y_b, cv=kfold, scoring='accuracy')
    print("Mean: {0:.3f}, std: (+/-) {1:.3f} [{2}]".format(scores.mean(), scores.std(), label))

Mean: 0.875, std: (+/-) 0.015 [Bagging Random Forest]
Mean: 0.873, std: (+/-) 0.031 [Bagging Extra Trees]
Mean: 0.776, std: (+/-) 0.022 [Bagging KNeighbors]
