# Baseline Model Testing

Data source: https://www.kaggle.com/c/forest-cover-type-prediction

In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import gaussian_kde
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import cross_validation
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
from IPython.core.display import display, HTML
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from feature_eng_function import feature_eng_forest, forest_interactions
from confusion_matrix_score_function import confusion_matrix_scoring 
%matplotlib inline
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')
display(HTML("<style>.container { width:100% !important; }</style>"))



## Load Data

In [2]:
forest = pd.read_csv("data/train.csv") 
forest = forest.iloc[:,1:]

In [3]:
original_cols = list(forest.columns)
original_cols.remove('Cover_Type')

In [26]:
output = pd.read_csv('data/test.csv', header=0)[['Id']].set_index('Id')
output.head()

# y_test_kaggle_GBM = optimized_kaggle_GBM.predict(test_X)
# results_kaggle_GBM = pd.DataFrame(test.Id)
# results_kaggle_GBM['Cover_Type'] = y_test_kaggle_GBM
# results_kaggle_GBM['Cover_Type'] = results_kaggle_GBM['Cover_Type'].astype(int)
# results_kaggle_GBM.to_csv('submissions/results_kaggle_GBM.csv', index=False)
# #make sure you manually delete the last line
# # SCORE = 0.73838


15121
15122
15123
15124
15125
15126
15127
15128
15129
15130
15131


# Feature Engineering

In [28]:
forest = feature_eng_forest('data/train.csv', 'soil_types.csv')
forest_test = feature_eng_forest('data/test.csv', 'soil_types.csv')
forest = forest.iloc[:,1:]
original_cols_with_soil_eng = list(forest.columns)
original_cols_with_soil_eng

Dropped the following columns: 

Wetmore
Pachic Argiborolis
Aquolis


KeyError: "['Cover_Type'] not in index"

In [None]:
forest = forest_interactions(forest)
forest_test = forest_interactions(forest_test)

### Transform the continuous features
###### We will try Normalization, Standardized Scaling, and MinMax Scaling
###### Note: there is no need to impute any data points as this is a pretty clean data set

In [7]:
chunk_size = 0.1 #Validation chunk size
seed = 0 # Use the same random seed to ensure consistent validation chunk usage

X_all = [] # all features
X_all_add = [] # Additionally we will make a list of subsets
trans_list = [] # Transformations
comb = [] # combinations
comb.append("All+1.0")

features = [] # feature selection models
model_features = [] # names of feature selection models

#Reorder the data to have continuous variables come first
continuous = []
categorical = []
final_columns = []
for col in forest.columns.tolist():
    if col == 'Cover_Type':
        pass
    elif forest[col].nunique() > 4:
        continuous.append(col)
    else:
        categorical.append(col)
final_columns.extend(continuous)
final_columns.extend(categorical)
final_columns.append('Cover_Type')
forest = forest[final_columns]
num_row, num_cols = forest.shape
cols = forest.columns
size = len(continuous) # Number of continuous columns

#Create the data arrays for model building
val_array = forest.values
X = val_array[:,0:(num_cols-1)]
y = val_array[:,(num_cols-1)]
X_train, X_val, y_train, y_val = cross_validation.train_test_split(X, y, test_size=chunk_size, random_state=seed)
X_all.append(['Orig','All', X_train,X_val,cols[:num_cols-1]])

# MinMax Scale the data

X_temp = MinMaxScaler().fit_transform(X_train[:,0:size])
X_val_temp = MinMaxScaler().fit_transform(X_val[:,0:size])

# Recombine data
X_con = np.concatenate((X_temp,X_train[:,size:]),axis=1)
X_val_con = np.concatenate((X_val_temp,X_val[:,size:]),axis=1)

X_all.append(['MinMax', 'All', X_con,X_val_con,cols])

# Add transformation to the list
for trans,name,X,X_val,cols_list in X_all:
    trans_list.append(trans)

### Create classifiers and Grid Search
- Logistic Regression
- SVM

In [12]:
# Add Logistic Regression
n = 'Logistic Regression'
model_features.append(n)
comb.append(n)
features.append([n, LogisticRegression(random_state=seed),
    {
        'penalty':('l1', 'l2'),
        'dual':(True, False),
        'C':(1e-3, 1e-2,1e-1,1e0,1e1,1e2,1e3),
        'fit_intercept':(True, False),
        'intercept_scaling':(1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3),
        'max_iter':('newton-cg', 'lbfgs', 'liblinear', 'sag'),
        'tol':(1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1),
        'multi_class':('ovr', 'multinomial')
    }])
    
# Add SVM
n = 'SVM'
model_features.append(n)
comb.append(n)
features.append([n, LinearSVC(random_state=seed),
    {
        'C':(1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3),
        'kernel':('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'),
        'degree':(1,2,3,4),
        'gamma':('auto',1e-3, 1e-2,1e-1,1e0,1e1,1e2,1e3),
        'coef0':(1e-3, 1e-2,1e-1,1e0,1e1,1e2,1e3),
        'probability':(True,False),
        'shrinking':(True,False),
        'tol':(1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1),
        'decision_function_shape':('ovo', 'ovr', None)
    }])
    

# Top 100 features

In [13]:
top100 = list(pd.read_csv('top_100.csv', header=None, names=['Feature', 'Importance'])['Feature'].values)
column_lists = {
    'original':original_cols,
    'original with soil engineered':original_cols_with_soil_eng,
    'top 100':top100#,
    #'all':all_interacted_cols
}

# Grid Search

In [14]:
#Run grid search over the different data transformations
def gridSearch(model, params, X, y):
    g = GridSearchCV(model, params, error_score=-999, verbose=1)
    g.fit(X, y)
    return g.best_estimator_, g.best_score_, g.best_params_, g.cv_results_

# Run models on selected features

In [16]:


# Determine feature importance for each model and transformation combination
with open('model_testing.txt', 'w+') as file:
    for trans, s, X, X_val, cols in X_all:
        for name,model,params in features:
            for c in column_lists:
                print (name)
                file.write('name : ' + str(name) + '\n')
                print (model)
                file.write('model : ' + str(model) + '\n')
                print (c)
                file.write('c : ' + str(cols) + '\n')

                selected_features = column_lists[c]
                print (len(selected_features))
                file.write('selected features : ' + str(selected_features) + '\n')

                cols_list = [] # List of names of columns selected
                i_cols_list = [] # Indexes of columns selected
                rank_list =[] # Ranking of all the columns
                rem_list = [] # List of columns not selected
                i_rem_list = [] # Indexes of columns not selected

                for field in cols:
                    if field in selected_features:
                        cols_list.append(field)
                        i_cols_list.append(list(cols).index(field))
                    else:
                        rem_list.append(field)
                        i_rem_list.append(list(cols).index(field))

                #Limit training and validation dataset to just relevant columns
                X_new = np.delete(X, i_rem_list, axis=1)
                X_val_new = np.delete(X_val, i_rem_list, axis=1) 

                #Fit the model on selected dataset
                model.fit(X_new, y_train)

                #Calculate model score against true class for each sample
                print (model.score(X_val_new, y_val))
                file.write('model score : ' + str(model.score(X_val_new, y_val)) + '\n')
#                 #Grid search
#                 file.write('Grid Search Results -- \n')
#                 best_estimator, best_score, best_params, cv_results = gridSearch(model, params, X_new, y_train)
#                 print (best_estimator)
#                 file.write('best estimator : ' + str(best_estimator) + '\n')
#                 print (best_score)
#                 file.write('best score : ' + str(best_score) + '\n')
#                 print (best_params)
#                 file.write('best params : ' + str(best_params) + '\n')
#                 print (cv_results)
#                 file.write('best cv results : ' + str(cv_results) + '\n')
                
                #Output prediction on test data
                pd.DataFrame(model.predict(X_test)).to_csv('%s_%s_%s_test.csv'%(trans, name, c))
                
                print (confusion_matrix_scoring(model.predict(X_val_new), y_val))
                file.write('conf matrix score : ' + str(confusion_matrix_scoring(model.predict(X_val_new), y_val)) + '\n')
                file.write('\n')
                file.write('-----------------\n')
                file.write('\n')


                # Append model name, array, columns selected and columns to be removed to the additional list        
                X_all_add.append([trans,name,X_new,X_val_new,cols_list]) 


Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
original
54
0.630952380952


ValueError: X has 55 features per sample; expecting 51