**Import libraries**

In [60]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
matplotlib.style.use('ggplot')
import seaborn

# SK-learn libraries for learning.
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier

# SK-Learn Libraries for feature tuning
from sklearn.feature_selection import SelectKBest

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

**Load Data**

In [2]:
# Load full training data set
full_data = np.loadtxt("train.csv", dtype = "int", delimiter = ",", skiprows=1)
feature_names = np.loadtxt("train.csv", dtype = "str", delimiter = ",")[0,:]

# Split into data and labels
full_data_labels = full_data[:,full_data.shape[1]-1]
full_data = full_data[:,:full_data.shape[1]-1]

print ("full data shape: ", full_data.shape)
print ("full label shape:", full_data_labels.shape)

full data shape:  (15120, 55)
full label shape: (15120,)


**Define and Review Train/Dev Data**

In [3]:
# Split into smaller training set and a dev set for us to use
# Shuffle the input so that we get a random subset in training vs dev
# Test set provided separately from kaggle where we do not know the labels
np.random.seed(58230)
shuffle = np.random.permutation(np.arange(full_data.shape[0]))
full_data, full_data_labels = full_data[shuffle], full_data_labels[shuffle]

train_data, train_labels = full_data[:14120], full_data_labels[:14120]
dev_data, dev_labels = full_data[14120:], full_data_labels[14120:]

print ("\ntrain data shape: ", train_data.shape)
print ("train label shape:", train_labels.shape)
print ("\ndev data shape: ", dev_data.shape)
print ("dev label shape:", dev_labels.shape)


# Print some basic info looking at a row of data
print("\nFeature names are:")
print(feature_names)

print("\nAn example row of training data:")
print(train_data[0])


train data shape:  (14120, 55)
train label shape: (14120,)

dev data shape:  (1000, 55)
dev label shape: (1000,)

Feature names are:
["b'Id'" "b'Elevation'" "b'Aspect'" "b'Slope'"
 "b'Horizontal_Distance_To_Hydrology'" "b'Vertical_Distance_To_Hydrology'"
 "b'Horizontal_Distance_To_Roadways'" "b'Hillshade_9am'"
 "b'Hillshade_Noon'" "b'Hillshade_3pm'"
 "b'Horizontal_Distance_To_Fire_Points'" "b'Wilderness_Area1'"
 "b'Wilderness_Area2'" "b'Wilderness_Area3'" "b'Wilderness_Area4'"
 "b'Soil_Type1'" "b'Soil_Type2'" "b'Soil_Type3'" "b'Soil_Type4'"
 "b'Soil_Type5'" "b'Soil_Type6'" "b'Soil_Type7'" "b'Soil_Type8'"
 "b'Soil_Type9'" "b'Soil_Type10'" "b'Soil_Type11'" "b'Soil_Type12'"
 "b'Soil_Type13'" "b'Soil_Type14'" "b'Soil_Type15'" "b'Soil_Type16'"
 "b'Soil_Type17'" "b'Soil_Type18'" "b'Soil_Type19'" "b'Soil_Type20'"
 "b'Soil_Type21'" "b'Soil_Type22'" "b'Soil_Type23'" "b'Soil_Type24'"
 "b'Soil_Type25'" "b'Soil_Type26'" "b'Soil_Type27'" "b'Soil_Type28'"
 "b'Soil_Type29'" "b'Soil_Type30'" "b'Soil_

**Preprocessing**

In [4]:
#Scaling
def my_scaler(data):    
    scaler = preprocessing.StandardScaler()
    continuous = scaler.fit_transform(data[:,:10])
    binary = data[:,10:]
    return np.concatenate((continuous, binary),axis=1)

In [5]:
#Feature selection
def my_featureselection(num_features, fit_data, fit_labels, transform_data):
    selection = SelectKBest(k=num_features)
    top_train = selection.fit_transform(fit_data,fit_labels)
    top_dev = selection.transform(transform_data)
    return [top_train, top_dev]

In [6]:
scaled_train_data = my_scaler(train_data)
scaled_dev_data = my_scaler(dev_data)
top25_train_data,top25_dev_data = my_featureselection(25,train_data,train_labels,dev_data)
top25_scaled_train_data,top25_scaled_dev_data = my_featureselection(25,scaled_train_data,train_labels,scaled_dev_data)
scaled_top25_train_data = my_scaler(my_featureselection(25,train_data,train_labels,dev_data)[0])
scaled_top25_dev_data = my_scaler(my_featureselection(25,train_data,train_labels,dev_data)[1])

  f = msb / msw


Define Model Assessment

In [7]:
def assess_model(model,train_data,train_labels,dev_data):
    model = model
    model.fit(train_data,train_labels)
    dev_preds = model.predict(dev_data)
    
    accuracy = metrics.accuracy_score(dev_labels,dev_preds)
    f1score = metrics.f1_score(dev_labels,dev_preds,average='weighted')
    confusion = confusion_matrix(dev_labels,dev_preds) 
    report = classification_report(dev_labels,dev_preds)
    
    print('Accuracy: ', accuracy)
    print('F1 Score: ', f1score)
    print('Confusion Matrix: \n', confusion)
    print('Classification Report: \n',report)
    
    return [accuracy,f1score,confusion,report]

**K Nearest Neighbor**

In [8]:
model_results = assess_model(KNeighborsClassifier(),train_data,train_labels,dev_data)

Accuracy:  0.82
F1 Score:  0.815523351995
Confusion Matrix: 
 [[ 97  25   0   0   3   1  20]
 [ 23  67   5   0  10   6   2]
 [  0   1 123  19   1  13   0]
 [  0   0   3 127   0   4   0]
 [  0   2   7   0 133   2   0]
 [  0   2  18   6   4 126   0]
 [  3   0   0   0   0   0 147]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.79      0.66      0.72       146
          2       0.69      0.59      0.64       113
          3       0.79      0.78      0.79       157
          4       0.84      0.95      0.89       134
          5       0.88      0.92      0.90       144
          6       0.83      0.81      0.82       156
          7       0.87      0.98      0.92       150

avg / total       0.82      0.82      0.82      1000



In [9]:
model_results = assess_model(KNeighborsClassifier(),scaled_train_data,train_labels,scaled_dev_data)

Accuracy:  0.37
F1 Score:  0.368345380673
Confusion Matrix: 
 [[54 33  6  6 17  5 25]
 [33 31  8  6 18  8  9]
 [10 16 55 25 11 39  1]
 [ 3  2 22 90  6 11  0]
 [22 26 13 18 50  8  7]
 [16  8 48 23 13 47  1]
 [49 29  7  1 17  4 43]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.29      0.37      0.32       146
          2       0.21      0.27      0.24       113
          3       0.35      0.35      0.35       157
          4       0.53      0.67      0.59       134
          5       0.38      0.35      0.36       144
          6       0.39      0.30      0.34       156
          7       0.50      0.29      0.36       150

avg / total       0.38      0.37      0.37      1000



In [10]:
model_results = assess_model(KNeighborsClassifier(),top25_train_data,train_labels,top25_dev_data)

Accuracy:  0.829
F1 Score:  0.82498536594
Confusion Matrix: 
 [[102  23   0   0   4   1  16]
 [ 22  70   5   0   8   6   2]
 [  0   1 117  21   2  16   0]
 [  0   0   5 127   0   2   0]
 [  0   2   6   0 135   1   0]
 [  0   1  14   7   3 131   0]
 [  3   0   0   0   0   0 147]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.80      0.70      0.75       146
          2       0.72      0.62      0.67       113
          3       0.80      0.75      0.77       157
          4       0.82      0.95      0.88       134
          5       0.89      0.94      0.91       144
          6       0.83      0.84      0.84       156
          7       0.89      0.98      0.93       150

avg / total       0.83      0.83      0.82      1000



In [11]:
model_results = assess_model(KNeighborsClassifier(),scaled_top25_train_data,train_labels,scaled_top25_dev_data)

Accuracy:  0.795
F1 Score:  0.791635918999
Confusion Matrix: 
 [[ 88  32   0   0  10   0  16]
 [ 16  77   4   1   8   5   2]
 [  0   4 106  10   4  33   0]
 [  0   0   3 127   0   4   0]
 [  1   8   5   0 126   4   0]
 [  0   3  22   5   2 124   0]
 [  3   0   0   0   0   0 147]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.81      0.60      0.69       146
          2       0.62      0.68      0.65       113
          3       0.76      0.68      0.71       157
          4       0.89      0.95      0.92       134
          5       0.84      0.88      0.86       144
          6       0.73      0.79      0.76       156
          7       0.89      0.98      0.93       150

avg / total       0.80      0.80      0.79      1000



In [12]:
model_results = assess_model(KNeighborsClassifier(),top25_scaled_train_data,train_labels,top25_scaled_dev_data)

Accuracy:  0.371
F1 Score:  0.372629439163
Confusion Matrix: 
 [[46 40  8  3 15  6 28]
 [27 32 12  6 17  8 11]
 [13 13 54 31 12 34  0]
 [ 1  0 30 89  3 11  0]
 [27 27 15  8 54 12  1]
 [12 10 49 23 16 44  2]
 [58 20  8  1 10  1 52]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.25      0.32      0.28       146
          2       0.23      0.28      0.25       113
          3       0.31      0.34      0.32       157
          4       0.55      0.66      0.60       134
          5       0.43      0.38      0.40       144
          6       0.38      0.28      0.32       156
          7       0.55      0.35      0.43       150

avg / total       0.39      0.37      0.37      1000



In [13]:
params = {'n_neighbors': [1,2,3,5,7,10,13,16,20,25],'weights':['uniform','distance']
         ,'metric':['minkowski','canberra','braycurtis'], 'p':[2,3,4]}

In [14]:
params = {'n_neighbors': [1,2,5],'metric':['braycurtis']}

model_search = GridSearchCV(KNeighborsClassifier(),param_grid=params)
model_search.fit(train_data,train_labels)
print(model_search.best_params_)
print(model_search.best_score_)

{'metric': 'braycurtis', 'n_neighbors': 1}
0.8509206798866855


Best K Nearest Neighbor

In [15]:
model_final = KNeighborsClassifier(n_neighbors=1,metric='braycurtis')
model_final_results = assess_model(model_final,train_data,train_labels,dev_data)

Accuracy:  0.872
F1 Score:  0.871447820648
Confusion Matrix: 
 [[115  16   0   0   3   1  11]
 [ 13  88   4   0   2   5   1]
 [  0   1 133  11   0  12   0]
 [  0   0   2 127   0   5   0]
 [  0   6   3   0 133   2   0]
 [  0   2  17   4   2 131   0]
 [  4   1   0   0   0   0 145]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.87      0.79      0.83       146
          2       0.77      0.78      0.78       113
          3       0.84      0.85      0.84       157
          4       0.89      0.95      0.92       134
          5       0.95      0.92      0.94       144
          6       0.84      0.84      0.84       156
          7       0.92      0.97      0.94       150

avg / total       0.87      0.87      0.87      1000



**Decision Tree**

In [16]:
model_results = assess_model(DecisionTreeClassifier(),train_data,train_labels,dev_data)

Accuracy:  0.817
F1 Score:  0.817126613195
Confusion Matrix: 
 [[ 95  38   0   0   4   1   8]
 [ 24  74   5   0   5   5   0]
 [  2   6 127   7   1  14   0]
 [  0   0   5 127   0   2   0]
 [  2   5   2   0 133   1   1]
 [  1   4  28   4   1 118   0]
 [  6   1   0   0   0   0 143]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.73      0.65      0.69       146
          2       0.58      0.65      0.61       113
          3       0.76      0.81      0.78       157
          4       0.92      0.95      0.93       134
          5       0.92      0.92      0.92       144
          6       0.84      0.76      0.79       156
          7       0.94      0.95      0.95       150

avg / total       0.82      0.82      0.82      1000



In [17]:
params = {'criterion': ['gini','entropy'],'max_depth':[2,4,7,10,15,50]
         ,'min_samples_split':[2,4,7,10,15],'max_features':[20,26]}

In [18]:
#Grid Search
params = {'criterion': ['entropy']}

model_search = GridSearchCV(DecisionTreeClassifier(),param_grid=params)
model_search.fit(train_data,train_labels)
print(model_search.best_params_)
print(model_search.best_score_)

{'criterion': 'entropy'}
0.7822237960339944


Best Decision Tree

In [19]:
model_final = DecisionTreeClassifier()
model_final_results = assess_model(model_final,train_data,train_labels,dev_data)

Accuracy:  0.806
F1 Score:  0.805777887162
Confusion Matrix: 
 [[ 93  40   0   0   3   1   9]
 [ 28  68   5   0   8   4   0]
 [  2   4 126   7   1  17   0]
 [  0   0   4 127   0   3   0]
 [  2   8   2   0 131   1   0]
 [  1   5  28   5   1 116   0]
 [  4   1   0   0   0   0 145]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.72      0.64      0.67       146
          2       0.54      0.60      0.57       113
          3       0.76      0.80      0.78       157
          4       0.91      0.95      0.93       134
          5       0.91      0.91      0.91       144
          6       0.82      0.74      0.78       156
          7       0.94      0.97      0.95       150

avg / total       0.81      0.81      0.81      1000



Attempting to visualize decision tree

In [23]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

dtree = DecisionTreeClassifier(max_depth = 5)
dtree.fit(train_data,train_labels)

export_graphviz(dtree,out_file = 'graph.dot')
#pydotplus.graph_from_dot_file('graph.dot').write_png('tree.png')

**Random Forest**

In [28]:
model_results = assess_model(RandomForestClassifier(),train_data,train_labels,dev_data)

Accuracy:  0.855
F1 Score:  0.854136795642
Confusion Matrix: 
 [[108  26   0   0   5   1   6]
 [ 18  81   6   0   5   3   0]
 [  0   2 132   8   1  14   0]
 [  0   0   3 129   0   2   0]
 [  1   8   0   0 133   2   0]
 [  1   0  25   3   3 124   0]
 [  2   0   0   0   0   0 148]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.83      0.74      0.78       146
          2       0.69      0.72      0.70       113
          3       0.80      0.84      0.82       157
          4       0.92      0.96      0.94       134
          5       0.90      0.92      0.91       144
          6       0.85      0.79      0.82       156
          7       0.96      0.99      0.97       150

avg / total       0.85      0.85      0.85      1000



In [None]:
params = {'n_estimators': [250],'max_features':[5,10,15,20,25,30,40,50,55]}

In [37]:
#Grid Search
params = {'n_estimators': [100],'max_features':[7,8,9,10,11,12]}

model_search = GridSearchCV(RandomForestClassifier(),param_grid=params)
model_search.fit(train_data,train_labels)
print(model_search.best_params_)
print(model_search.best_score_)

{'max_features': 10, 'n_estimators': 100}
0.8638101983002833


Best Random Forest

In [66]:
model_final = RandomForestClassifier(n_estimators=250,max_features=25)
model_final_results = assess_model(model_final,train_data,train_labels,dev_data)

Accuracy:  0.881
F1 Score:  0.880256957883
Confusion Matrix: 
 [[111  26   0   0   3   0   6]
 [ 15  83   5   0   6   4   0]
 [  0   0 138   4   1  14   0]
 [  0   0   1 130   0   3   0]
 [  0   5   0   0 136   3   0]
 [  0   2  15   3   1 135   0]
 [  2   0   0   0   0   0 148]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.87      0.76      0.81       146
          2       0.72      0.73      0.72       113
          3       0.87      0.88      0.87       157
          4       0.95      0.97      0.96       134
          5       0.93      0.94      0.93       144
          6       0.85      0.87      0.86       156
          7       0.96      0.99      0.97       150

avg / total       0.88      0.88      0.88      1000



**Adaboost**

In [70]:
model_results = assess_model(AdaBoostClassifier(base_estimator=RandomForestClassifier(n_estimators=250,max_features=25)),train_data,train_labels,dev_data)

Accuracy:  0.882
F1 Score:  0.881127584137
Confusion Matrix: 
 [[110  27   0   0   3   0   6]
 [ 17  82   5   0   5   4   0]
 [  0   0 137   4   2  14   0]
 [  0   0   1 130   0   3   0]
 [  0   4   0   0 137   3   0]
 [  0   2  13   2   1 138   0]
 [  2   0   0   0   0   0 148]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.85      0.75      0.80       146
          2       0.71      0.73      0.72       113
          3       0.88      0.87      0.88       157
          4       0.96      0.97      0.96       134
          5       0.93      0.95      0.94       144
          6       0.85      0.88      0.87       156
          7       0.96      0.99      0.97       150

avg / total       0.88      0.88      0.88      1000



**Explore Commonly Erroneous Predictions**

In [77]:
knn_model = KNeighborsClassifier(n_neighbors=1,metric='braycurtis')
knn_model.fit(train_data,train_labels)
knn_dev_preds = knn_model.predict(dev_data)
knn_errors_index = np.where(knn_dev_preds!=dev_labels)
knn_errors_data = dev_data[knn_errors_index]
knn_errors_labels = dev_labels[knn_errors_index]

rf_model = RandomForestClassifier(n_estimators=250,max_features=25)
rf_model.fit(train_data,train_labels)
rf_dev_preds = rf_model.predict(dev_data)
rf_errors_index = np.where(rf_dev_preds!=dev_labels)
rf_errors_data = dev_data[rf_errors_index]
rf_errors_labels = dev_labels[rf_errors_index]

In [85]:
print(np.asarray(knn_errors_index).shape)
print(np.asarray(rf_errors_index).shape)
print(np.intersect1d(knn_errors_index,rf_errors_index).shape)

(1, 128)
(1, 116)
(52,)


In [88]:
dev_data[np.intersect1d(knn_errors_index,rf_errors_index)].shape

(52, 55)

In [89]:
dev_labels[np.intersect1d(knn_errors_index,rf_errors_index)].shape

(52,)

In [91]:
np.concatenate((dev_data[np.intersect1d(knn_errors_index,rf_errors_index)],dev_labels[np.intersect1d(knn_errors_index,rf_errors_index)]),axis=0)

ValueError: all the input arrays must have same number of dimensions

**PCA**

**Clustering**

**Misc Exploration and Analysis**

In [None]:
c = np.corrcoef(np.transpose(train_data[:,1:14]))

In [None]:
c_matrix = seaborn.heatmap(c)

In [None]:
for i in np.linspace(1,14,14):
    print(i, ' - ', feature_names[i])