In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import feature_eng_function as f_eng

from datetime import datetime
from matplotlib import pyplot as plt
from PCA_function import pca_data100
from IPython.core.display import display, HTML
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import GridSearchCV

%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))



## Import the training and testing data

In [2]:
# Read the three train/test CSV pairs from the data/ directory.
# NOTE(review): "_eng" and "_100" look like the engineered and the
# 100-component (PCA?) feature sets — confirm against the notebooks that
# produced these files.
forest_train, forest_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)
forest_base_train, forest_base_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train_eng.csv", "data/test_eng.csv")
)
forest_100_train, forest_100_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train_100.csv", "data/test_100.csv")
)

## Create arrays for each of the data sets

In [3]:
# Columns that are never model inputs: the row identifier and the target.
_NON_FEATURE_COLUMNS = ['Id', 'Cover_Type']


def _feature_matrix(frame):
    """Return every column of `frame` except 'Id' and 'Cover_Type' as a NumPy array."""
    kept = [name for name in frame.columns.tolist() if name not in _NON_FEATURE_COLUMNS]
    return frame[kept].values


# Target labels come from the raw training file; the Ids come from the raw
# test file and are reused as the key column of every submission.
y_train = forest_train['Cover_Type']
ID = forest_test['Id']

# Training matrices, one per feature set.
X_train = _feature_matrix(forest_train)
X_train_base = _feature_matrix(forest_base_train)
X_train_100 = _feature_matrix(forest_100_train)

# Matching test matrices.
X_test = _feature_matrix(forest_test)
X_test_base = _feature_matrix(forest_base_test)
X_test_100 = _feature_matrix(forest_100_test)


## Load the pickled models

In [6]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of being left open for the lifetime of
    the kernel.

    SECURITY: `pickle.load` can execute arbitrary code embedded in the file.
    Only load pickles that were produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Gradient Boosting Models
optimized_100_cv5_GBM = _load_pickle('pickles/optimized_100_cv5_GBM.p')
optimized_100_cv10_GBM = _load_pickle('pickles/optimized_100_cv10_GBM.p')
optimized_100_default_GBM = _load_pickle('pickles/optimized_100_default_GBM.p')
optimized_base_cv5_GBM = _load_pickle('pickles/optimized_base_cv5_GBM.p')
optimized_base_cv10_GBM = _load_pickle('pickles/optimized_base_cv10_GBM.p')
optimized_base_default_GBM = _load_pickle('pickles/optimized_base_default_GBM.p')
optimized_kaggle_default_GBM = _load_pickle('pickles/optimized_kaggle_default_GBM.p')
optimized_kaggle_cv5_GBM = _load_pickle('pickles/optimized_kaggle_cv5_GBM.p')
optimized_kaggle_cv10_GBM = _load_pickle('pickles/optimized_kaggle_cv10_GBM.p')

# Random Forest Models
optimized_100_cv5_RF = _load_pickle('pickles/optimized_100_cv5_RF.p')
# Disabled (not loaded in this run):
#optimized_100_cv10_RF = _load_pickle('pickles/optimized_100_cv10_RF.p')
optimized_base_cv5_RF = _load_pickle('pickles/optimized_base_cv5_RF.p')
optimized_base_cv10_RF = _load_pickle('pickles/optimized_base_cv10_RF.p')
optimized_kaggle_cv5_RF = _load_pickle('pickles/optimized_kaggle_cv5_RF.p')
optimized_kaggle_cv10_RF = _load_pickle('pickles/optimized_kaggle_cv10_RF.p')

# Extra Trees Models — all disabled (not loaded in this run):
#optimized_kaggle_cv10_ET = _load_pickle('pickles/optimized_kaggle_cv10_ET.p')
#optimized_base_cv10_ET = _load_pickle('pickles/optimized_base_cv10_ET.p')
#optimized_100_cv10_ET = _load_pickle('pickles/optimized_100_cv10_ET.p')


## Create lists of models to test on for each data set

In [8]:
# (label, model) pairs, grouped by the test matrix each group is scored on.
# The label is also used as the stem of the submission file name.

# Models for the "100" feature set (X_test_100).
# Not included: optimized_100_cv10_RF and the Extra Trees model — their
# pickles are not loaded.
models_100 = [
    ('optimized_100_cv5_GBM', optimized_100_cv5_GBM),
    ('optimized_100_cv10_GBM', optimized_100_cv10_GBM),
    ('optimized_100_default_GBM', optimized_100_default_GBM),
    ('optimized_100_cv5_RF', optimized_100_cv5_RF),
]

# Models for the "base" feature set (X_test_base).
# Not included: optimized_base_cv10_ET — its pickle is not loaded.
models_base = [
    ('optimized_base_cv5_GBM', optimized_base_cv5_GBM),
    ('optimized_base_cv10_GBM', optimized_base_cv10_GBM),
    ('optimized_base_default_GBM', optimized_base_default_GBM),
    ('optimized_base_cv5_RF', optimized_base_cv5_RF),
    ('optimized_base_cv10_RF', optimized_base_cv10_RF),
]

# Models for the raw Kaggle columns (X_test).
# Not included: optimized_kaggle_cv10_ET — its pickle is not loaded.
models_kaggle = [
    ('optimized_kaggle_cv5_GBM', optimized_kaggle_cv5_GBM),
    ('optimized_kaggle_cv10_GBM', optimized_kaggle_cv10_GBM),
    ('optimized_kaggle_default_GBM', optimized_kaggle_default_GBM),
    ('optimized_kaggle_cv5_RF', optimized_kaggle_cv5_RF),
    ('optimized_kaggle_cv10_RF', optimized_kaggle_cv10_RF),
]

## Predict on the test set with each model and write one submission CSV per model

In [None]:
def write_submissions(models, features, ids, out_dir='submissions'):
    """Predict with each model and write one submission CSV per model.

    Parameters
    ----------
    models : iterable of (label, model) pairs
        `label` becomes the file stem; `model` must expose `.predict`.
    features : array-like
        Test matrix passed unchanged to every `model.predict`.
    ids : pandas.Series
        Row identifiers; becomes the first column of each CSV under the
        Series' own name.
    out_dir : str, default 'submissions'
        Directory receiving `<label>.csv`. Created if it does not exist, so a
        fresh checkout does not fail on the first `to_csv`.
    """
    import os  # local import keeps this cell runnable on its own

    os.makedirs(out_dir, exist_ok=True)

    for label, model in models:
        submission = (
            pd.DataFrame(ids)
            .assign(Cover_Type=model.predict(features))
            # Submission labels are written as integers.
            .astype({'Cover_Type': int})
        )
        submission.to_csv(os.path.join(out_dir, label + '.csv'), index=False)


# NOTE(review): ID comes from data/test.csv; this assumes the rows of
# X_test_base and X_test_100 are in the same order as that file — confirm.
write_submissions(models_kaggle, X_test, ID)
write_submissions(models_base, X_test_base, ID)
write_submissions(models_100, X_test_100, ID)
    