# Baseline Algorithms for Forest Cover Type Predictions


Data source: https://www.kaggle.com/c/forest-cover-type-prediction

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import xgboost as xgb
import feature_eng_function_rob as f_eng

from datetime import datetime
from matplotlib import pyplot as plt
from IPython.core.display import display, HTML
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))



## Load Data and create base data set

In [2]:
forest = pd.read_csv("data/train.csv", index_col=0)
original_fields = forest.columns.tolist()
forest.head()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
2,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
3,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
4,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
5,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [3]:
data = forest.values
X_kaggle = data[:,0:(-1)]
y_train = data[:,-1]
X_kaggle.shape, y_train.shape

((15120, 54), (15120,))

## Feature Engineering

In [4]:
forest_eng = f_eng.feature_eng_forest('data/train.csv', 'soil_types.csv')

Dropped the following columns: 

Wetmore
Pachic Argiborolis
Aquolis


In [5]:
forest_eng.head()

Unnamed: 0_level_0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
2,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
3,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
4,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
5,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [6]:
data = forest_eng.values
X_base = data[:,0:(-1)]
y_train = data[:,-1]
X_base.shape, y_train.shape

((15120, 95), (15120,))

## Import the top 100 feature set data for testing

In [7]:
f = open('top_100.csv', 'r')
top_100_features = [line.strip().split(',')[0] for line in f]
top_100_features.append('Cover_Type')
f.close()
forest_100 = forest_eng
forest_100 = f_eng.forest_interactions(forest_100)
print (forest_100.shape)
forest_100 = forest_100[top_100_features]
print(forest_100.shape)

(15120, 5172)
(15120, 101)


In [8]:
forest_100.head()

Unnamed: 0_level_0,Elevation,Horizontal_Distance_To_Hydrology_Hillshade_3pm,Horizontal_Distance_To_Roadways_Horizontal_Distance_To_Fire_Points,Horizontal_Distance_To_Roadways_Aspect2,Hillshade_9am_Hillshade_Noon,Elevation_Elevation_3100_8000,Elevation_3100_8000_Elevation,Horizontal_Distance_To_Fire_Points_Horizontal_Distance_To_Roadways,Slope_Horizontal_Distance_To_Roadways,Slope_Horizontal_Distance_To_Fire_Points,...,Soil_Type39_Hillshade_9am,Wilderness_Area4_Elevation,Very Stony_Elevation,Wilderness_Area4_Hillshade_Noon,Outcrop_Horizontal_Distance_To_Roadways,Aspect2_Vanet,Rock_Elevation,Soil_Type32_Horizontal_Distance_To_Roadways,Horizontal_Distance_To_Roadways_Wilderness_Area4,Cover_Type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2596,38184,3202290,26010,51272,0,0,3202290,1530,18837,...,0,0,0.0,0,510.0,0.0,2596.0,0,0,5
2,2590,32012,2427750,21840,51700,0,0,2427750,780,12450,...,0,0,0.0,0,390.0,0.0,2590.0,0,0,5
3,2804,36180,19464780,442020,55692,0,0,19464780,28620,55089,...,0,0,0.0,0,0.0,0.0,0.0,0,0,2
4,2785,29524,19191990,478950,56644,0,0,19191990,55620,111798,...,0,0,0.0,0,0.0,0.0,0.0,0,0,2
5,2595,22950,2413252,17595,51480,0,0,2413252,782,12344,...,0,0,0.0,0,391.0,0.0,2595.0,0,0,5


In [9]:
data = forest_100.values
X_100 = data[:,0:(-1)]
y_train = data[:,-1]
X_100.shape, y_train.shape

((15120, 100), (15120,))

## Run XGBoost in Grid Search across a broad range of parameters for each data set

In [10]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_kaggle_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5)#, n_jobs = -1)
optimized_kaggle_GBM.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 5360 sec


In [11]:
optimized_kaggle_GBM.grid_scores_



[mean: 0.69061, std: 0.03408, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69061, std: 0.03355, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.68909, std: 0.03482, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75099, std: 0.03659, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75218, std: 0.03708, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.75073, std: 0.03496, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.77864, std: 0.03479, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77447, std: 0.03964, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.77282, std: 0.03644, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.67870, std: 0.03251, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [12]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_base_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_base_GBM.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 3754 sec


In [13]:
optimized_base_GBM.grid_scores_



[mean: 0.69034, std: 0.03449, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.69180, std: 0.03370, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.69220, std: 0.03422, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.75516, std: 0.03795, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.75344, std: 0.03834, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.74934, std: 0.03857, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.78095, std: 0.03316, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.77573, std: 0.03491, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.77698, std: 0.03563, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.68439, std: 0.03199, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

In [14]:
t1 = datetime.now()
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_100_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_100_GBM.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 4903 sec


In [15]:
optimized_100_GBM.grid_scores_



[mean: 0.70060, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.70132, std: 0.03721, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.70033, std: 0.03655, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.77500, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.77295, std: 0.03565, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.76951, std: 0.03670, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.79788, std: 0.03663, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.79239, std: 0.03677, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.79187, std: 0.03674, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.69008, std: 0.03703, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

## Fine Tune the best hyperparameters for the best model in the broad search

In [16]:
############ NEEDS TO BE MODIFIED ONCE WE FIGURE OUT THE BEST ONE ####################
cv_params = {'max_depth': [1,3,5], 'min_child_weight': [1,3,5], 'learning_rate': [0.15, 0.1, 0.075, 0.05]}
ind_params = {'n_estimators': 300, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'multi:softmax'}
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), cv_params, scoring = 'accuracy', cv = 5, n_jobs = -1)
optimized_GBM.fit(X_100, y_train)
optimized_GBM.grid_scores_



[mean: 0.70060, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 1},
 mean: 0.70132, std: 0.03721, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 3},
 mean: 0.70033, std: 0.03655, params: {'learning_rate': 0.15, 'max_depth': 1, 'min_child_weight': 5},
 mean: 0.77500, std: 0.03620, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 1},
 mean: 0.77295, std: 0.03565, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 3},
 mean: 0.76951, std: 0.03670, params: {'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5},
 mean: 0.79788, std: 0.03663, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 1},
 mean: 0.79239, std: 0.03677, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 3},
 mean: 0.79187, std: 0.03674, params: {'learning_rate': 0.15, 'max_depth': 5, 'min_child_weight': 5},
 mean: 0.69008, std: 0.03703, params: {'learning_rate': 0.1, 'max_depth': 1, 'min_

## Random Forest Ensembles in Grid Search

In [32]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_kaggle_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5,n_jobs=-1)
optimized_kaggle_RF.fit(X_kaggle, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 435 sec


In [33]:
optimized_kaggle_RF.grid_scores_



[mean: 0.45231, std: 0.01502, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.45549, std: 0.01143, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.47910, std: 0.01698, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.46852, std: 0.03014, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.46786, std: 0.03025, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.48776, std: 0.02424, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.47454, std: 0.03479, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 50},
 mean: 0.47837, std: 0.02366, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 100},
 mean: 0.49940, std: 0.028

In [34]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_base_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5,n_jobs=-1)
optimized_base_RF.fit(X_base, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 397 sec


In [35]:
optimized_base_RF.grid_scores_



[mean: 0.49914, std: 0.02765, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.47791, std: 0.02382, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.47394, std: 0.02268, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.47235, std: 0.02107, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.47884, std: 0.02503, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.48247, std: 0.02761, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.51938, std: 0.04499, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 50},
 mean: 0.51792, std: 0.03158, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 100},
 mean: 0.53882, std: 0.049

In [36]:
t1 = datetime.now()
hyperparameters = {'max_depth': [1,3,5],
    'n_estimators': [50, 100, 200, 300, 500, 700],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2']
}
optimized_100_RF = GridSearchCV(RandomForestClassifier(random_state=0), hyperparameters, cv=5,n_jobs=-1)
optimized_100_RF.fit(X_100, y_train)
t2 = datetime.now()
print ("Total time: %i sec" %(t2-t1).seconds)

Total time: 797 sec


In [37]:
optimized_100_RF.grid_scores_



[mean: 0.44028, std: 0.01134, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 50},
 mean: 0.43836, std: 0.01526, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 100},
 mean: 0.44306, std: 0.01158, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 200},
 mean: 0.44292, std: 0.01184, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 300},
 mean: 0.44597, std: 0.01461, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 500},
 mean: 0.44980, std: 0.01564, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'sqrt', 'n_estimators': 700},
 mean: 0.48505, std: 0.02319, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 50},
 mean: 0.48009, std: 0.02796, params: {'criterion': 'gini', 'max_depth': 1, 'max_features': 'log2', 'n_estimators': 100},
 mean: 0.48267, std: 0.028