# Forest Cover Type: Baseline Classification

For this project, we predict the forest cover type (one of 7 classes) for each 30 x 30 meter cell of forest, based on cartographic variables from the US Geological Survey (USGS) and the US Forest Service (USFS).

For more details, please see: https://www.kaggle.com/c/forest-cover-type-prediction

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 

# SK-Learn Libraries for feature tuning
from sklearn.feature_selection import SelectKBest

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report



Load the training data. Note that Kaggle provides a separate test file, for which we do not know the labels; we will not load it yet. Instead, we split the labeled training data into a training set and a dev set.

In [2]:
# Load the full Kaggle training table. The first CSV row is the header and the
# remaining rows are parsed as integers.
DATA_PATH = "train.csv"
TRAIN_SIZE = 14120  # rows kept for training; everything after goes to the dev set

raw_data = np.loadtxt(DATA_PATH, dtype="int", delimiter=",", skiprows=1)

# Read only the header line for the column names instead of parsing the whole
# file a second time as strings.
with open(DATA_PATH) as csv_file:
    feature_names = np.array(csv_file.readline().rstrip("\r\n").split(","))

# Split into data and labels: the last column is the label, and the first
# column is the row id, which is dropped so it cannot be used as a feature.
full_data_labels = raw_data[:, -1]
full_data = raw_data[:, 1:-1]
print ("full data shape: ", full_data.shape)
print ("full label shape:", full_data_labels.shape)

# Split into smaller training set and a dev set for us to use.
# Shuffle the input so that we get a random subset in training vs dev.
# The test set is provided separately by Kaggle and we do not know its labels.
np.random.seed(58230)  # fixed seed so the train/dev split is repeatable
shuffle = np.random.permutation(np.arange(full_data.shape[0]))
full_data, full_data_labels = full_data[shuffle], full_data_labels[shuffle]

train_data, train_labels = full_data[:TRAIN_SIZE], full_data_labels[:TRAIN_SIZE]
dev_data, dev_labels = full_data[TRAIN_SIZE:], full_data_labels[TRAIN_SIZE:]

print ("\ntrain data shape: ", train_data.shape)
print ("train label shape:", train_labels.shape)
print ("\ndev data shape: ", dev_data.shape)
print ("dev label shape:", dev_labels.shape)


# Print some basic info looking at a row of data.
# feature_names[1:-1] skips the id and label columns so the names line up
# with the columns of train_data.
print("\nFeature names are:")
print(feature_names[1:-1])

print("\nAn example row of training data:")
print(train_data[0])

full data shape:  (15120, 54)
full label shape: (15120,)

train data shape:  (14120, 54)
train label shape: (14120,)

dev data shape:  (1000, 54)
dev label shape: (1000,)

Feature names are:
['Elevation' 'Aspect' 'Slope' 'Horizontal_Distance_To_Hydrology'
 'Vertical_Distance_To_Hydrology' 'Horizontal_Distance_To_Roadways'
 'Hillshade_9am' 'Hillshade_Noon' 'Hillshade_3pm'
 'Horizontal_Distance_To_Fire_Points' 'Wilderness_Area1' 'Wilderness_Area2'
 'Wilderness_Area3' 'Wilderness_Area4' 'Soil_Type1' 'Soil_Type2'
 'Soil_Type3' 'Soil_Type4' 'Soil_Type5' 'Soil_Type6' 'Soil_Type7'
 'Soil_Type8' 'Soil_Type9' 'Soil_Type10' 'Soil_Type11' 'Soil_Type12'
 'Soil_Type13' 'Soil_Type14' 'Soil_Type15' 'Soil_Type16' 'Soil_Type17'
 'Soil_Type18' 'Soil_Type19' 'Soil_Type20' 'Soil_Type21' 'Soil_Type22'
 'Soil_Type23' 'Soil_Type24' 'Soil_Type25' 'Soil_Type26' 'Soil_Type27'
 'Soil_Type28' 'Soil_Type29' 'Soil_Type30' 'Soil_Type31' 'Soil_Type32'
 'Soil_Type33' 'Soil_Type34' 'Soil_Type35' 'Soil_Type36' 'Soil_Typ

In [68]:
# Standardize the first 10 columns and leave columns 10..53 untouched, then
# glue both halves back together. The scaler is fit on the training split only
# and re-used for the dev split.
scaler = preprocessing.StandardScaler()

continuous = scaler.fit_transform(train_data[:, :10])
binary = train_data[:, 10:54]
scaled_train_data = np.hstack((continuous, binary))

con_dev = scaler.transform(dev_data[:, :10])
bin_dev = dev_data[:, 10:54]
scaled_dev_data = np.hstack((con_dev, bin_dev))

# Univariate selection as a first pass: keep the 25 best-scoring columns,
# fitting the selector on the training split and applying it to dev.
selection = SelectKBest(k=25)
top_train = selection.fit_transform(scaled_train_data, train_labels)
top_dev = selection.transform(scaled_dev_data)

  f = msb / msw


Run a basic logistic regression model.

In [84]:
# Baseline: logistic regression with default hyper-parameters, scored on the
# dev split. Note that this is fit on train_data as loaded, not on
# scaled_train_data.
model_log = LogisticRegression().fit(train_data, train_labels)

log_accuracy = model_log.score(dev_data, dev_labels)
print("Default logistic regression accuracy is %1.3f" % log_accuracy)

Default logistic regression accuracy is 0.668


Run a basic Gaussian Naive Bayes model.

In [57]:
# Baseline: Gaussian Naive Bayes with default hyper-parameters, scored on the
# dev split.
model_NB = GaussianNB().fit(train_data, train_labels)

nb_accuracy = model_NB.score(dev_data, dev_labels)
print("Default NB accuracy is %1.3f" % nb_accuracy)


Default NB accuracy is 0.613


Run a basic k-Nearest Neighbors model.

In [85]:
# Baseline: k-nearest neighbors with default hyper-parameters, scored on the
# dev split.
model_knn = KNeighborsClassifier().fit(train_data, train_labels)

knn_accuracy = model_knn.score(dev_data, dev_labels)
print("Default k-NN accuracy is %1.3f" % knn_accuracy)



Default k-NN accuracy is 0.805


In [24]:
# Single decision tree (entropy criterion), seeded so the result is repeatable.
dt_clf = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
dt_clf.fit(train_data, train_labels)

print('Decision Tree accuracy:', dt_clf.score(dev_data, dev_labels))

# Random forest of 250 trees, each split considering 10 candidate features.
# random_state is fixed (matching the decision tree above) so the accuracy and
# the feature ranking below do not change between runs or depend on the order
# in which earlier cells were executed.
rf_clf = RandomForestClassifier(n_estimators=250, max_features=10, random_state=0)
rf_clf.fit(train_data, train_labels)

print ('Random Forest accuracy:', rf_clf.score(dev_data, dev_labels))

# Forest-level importances, their spread across the individual trees, and the
# column indices sorted from most to least important.
importances = rf_clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")

# "feature N" is the column index into train_data / dev_data.
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

Decision Tree accuracy: 0.799
Random Forest accuracy: 0.877
Feature ranking:
1. feature 0 (0.247243)
2. feature 5 (0.094471)
3. feature 9 (0.073010)
4. feature 3 (0.062131)
5. feature 4 (0.052283)
6. feature 6 (0.049476)
7. feature 1 (0.047296)
8. feature 13 (0.047226)
9. feature 8 (0.043487)
10. feature 7 (0.042520)
11. feature 2 (0.032757)
12. feature 23 (0.023180)
13. feature 51 (0.019865)
14. feature 10 (0.019805)
15. feature 16 (0.018405)
16. feature 52 (0.016737)
17. feature 12 (0.015766)
18. feature 17 (0.012668)
19. feature 53 (0.008711)
20. feature 43 (0.006684)
21. feature 15 (0.005958)
22. feature 26 (0.005482)
23. feature 30 (0.005301)
24. feature 35 (0.004814)
25. feature 45 (0.004690)
26. feature 36 (0.004523)
27. feature 42 (0.004354)
28. feature 25 (0.004120)
29. feature 11 (0.003458)
30. feature 46 (0.003166)
31. feature 24 (0.002922)
32. feature 19 (0.002612)
33. feature 48 (0.002242)
34. feature 37 (0.002195)
35. feature 44 (0.002129)
36. feature 14 (0.001524)
37. fe

In [88]:
# Train k-NN on the most important features from the random forest model.
# The column indices are listed once so the train and dev subsets cannot
# drift apart.
top_rf_features = [0, 5, 9, 3, 4, 6, 1, 13, 8]

imp_train = train_data[:, top_rf_features]
imp_dev = dev_data[:, top_rf_features]

knn2 = KNeighborsClassifier().fit(imp_train, train_labels)

knn2_accuracy = knn2.score(imp_dev, dev_labels)
print("k-NN accuracy is %1.3f" % knn2_accuracy)

k-NN accuracy is 0.805


In [98]:
# Features 20, 21 and 28 were noted as having zero variance and can be removed
# for further analysis (TODO: confirm this still holds for the current
# train/dev split). All three columns are dropped in one call per array; the
# indices refer to the original column positions.
zero_variance_cols = [20, 21, 28]

subset_t_data = np.delete(train_data, zero_variance_cols, axis=1)
subset_d_data = np.delete(dev_data, zero_variance_cols, axis=1)
