# Forest Cover Type: Baseline Classification

For this project, we are working on predicting the forest cover type among 7 classifications based on cartographic variables from the US Geological Survey and USFS for each 30 x 30 meter cell of forest.

For more details, please see: https://www.kaggle.com/c/forest-cover-type-prediction

In [5]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# SK-learn libraries for learning.

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
# SK-Learn Libraries for feature tuning
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import KernelPCA

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

Load the training data. Note that kaggle has a separate file for test data which we will not load yet. 

In [484]:
# Load full training data set
full_data = np.loadtxt("train.csv", dtype = "int", delimiter = ",", skiprows=1)
feature_names = np.loadtxt("train.csv", dtype = "str", delimiter = ",")[0,:]

# Split into data and labels
full_data_labels = full_data[:,full_data.shape[1]-1]
full_data = full_data[:,:full_data.shape[1]-1]

# Delete id to prevent use as feature
full_data = np.delete(full_data, 0, 1)  

# features 20,21, and 28 have zero variance 
#and can be removed for further analysis
full_data = np.delete(full_data, 28, 1)
full_data = np.delete(full_data, 21, 1)
full_data = np.delete(full_data, 20, 1)
feature_names = np.delete(feature_names,[28,21,20])

# Shuffle the input so that we get a random subset in training vs dev
#np.random.seed(32)
np.random.seed(58230)
shuffle = np.random.permutation(np.arange(full_data.shape[0]))
full_data, full_data_labels = full_data[shuffle], full_data_labels[shuffle]

print ("full data shape: ", full_data.shape)
print ("full label shape:", full_data_labels.shape)

# Print some basic info looking at a row of data
print("\nFeature names are:")
print(feature_names[1:-1])

print("\nAn example row of training data:")
print(full_data[0])

full data shape:  (15120, 51)
full label shape: (15120,)

Feature names are:
['Elevation' 'Aspect' 'Slope' 'Horizontal_Distance_To_Hydrology'
 'Vertical_Distance_To_Hydrology' 'Horizontal_Distance_To_Roadways'
 'Hillshade_9am' 'Hillshade_Noon' 'Hillshade_3pm'
 'Horizontal_Distance_To_Fire_Points' 'Wilderness_Area1' 'Wilderness_Area2'
 'Wilderness_Area3' 'Wilderness_Area4' 'Soil_Type1' 'Soil_Type2'
 'Soil_Type3' 'Soil_Type4' 'Soil_Type5' 'Soil_Type8' 'Soil_Type9'
 'Soil_Type10' 'Soil_Type11' 'Soil_Type12' 'Soil_Type13' 'Soil_Type15'
 'Soil_Type16' 'Soil_Type17' 'Soil_Type18' 'Soil_Type19' 'Soil_Type20'
 'Soil_Type21' 'Soil_Type22' 'Soil_Type23' 'Soil_Type24' 'Soil_Type25'
 'Soil_Type26' 'Soil_Type27' 'Soil_Type28' 'Soil_Type29' 'Soil_Type30'
 'Soil_Type31' 'Soil_Type32' 'Soil_Type33' 'Soil_Type34' 'Soil_Type35'
 'Soil_Type36' 'Soil_Type37' 'Soil_Type38' 'Soil_Type39' 'Soil_Type40']

An example row of training data:
[2075   27   30   30   15  404  193  162   89  330    0    0    0    1  

### Feature Selection

It appears some of the features may be iterrelated such that creating additional features may be of interest. For instance, there are three hillshade features that measure the area sunlight, each at a different time of day. It may be that a more important feature in determining cover type is whether it can be described to be in sun or shade the entire day. 

In [485]:
basic_data = pd.DataFrame(data=full_data,columns=feature_names[1:-1])

# Add Hillshade variables to datav1
datav1 = basic_data.copy()
temp = basic_data.loc[:, ["Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"]]
datav1['Strong_shade'] = (temp < 215).sum(axis=1) 
datav1['Strong_sun'] = (temp > 170).sum(axis=1)
datav1['Part_shade'] = (temp < 190).sum(axis=1) 
datav1['Part_sun'] = (temp > 225).sum(axis=1)
datav1.Strong_shade.replace([0,1,2,3], [0,0,0,1], inplace=True)
datav1.Strong_sun.replace([0,1,2,3], [0,0,0,1], inplace=True)
datav1.Part_shade.replace([0,1,2,3], [0,0,1,1], inplace=True)
datav1.Part_sun.replace([0,1,2,3], [0,0,1,1], inplace=True)

datav2 = datav1.copy()
datav2.insert(1,'Min_Hydro',pd.np.minimum(np.absolute(datav2.Horizontal_Distance_To_Hydrology), np.absolute(datav2.Vertical_Distance_To_Hydrology)))
datav2.insert(1,'Combined_Hydro',(datav2.Horizontal_Distance_To_Hydrology.abs()+ datav2.Vertical_Distance_To_Hydrology.abs()))
datav2.drop(['Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology'], axis=1,inplace=True)
print(datav1.Part_shade.value_counts())
print(datav1.Part_sun.value_counts())
print(datav1.Strong_shade.value_counts())
print(datav1.Strong_sun.value_counts())
#print(datav2.shape)

0    12396
1     2724
Name: Part_shade, dtype: int64
0    12737
1     2383
Name: Part_sun, dtype: int64
0    12928
1     2192
Name: Strong_shade, dtype: int64
0    13020
1     2100
Name: Strong_sun, dtype: int64


### Create different datasets

In [486]:
# Split into smaller training set and a dev set for us to use

train_data, train_labels = full_data[:14120], full_data_labels[:14120]
dev_data, dev_labels = full_data[14120:], full_data_labels[14120:]

train_v1_data = datav1.values[:14120]
dev_v1_data = datav1.values[14120:]

train_v2_data = datav2.values[:14120]
dev_v2_data = datav2.values[14120:]

In [487]:
scaler = preprocessing.StandardScaler()
continuous = scaler.fit_transform(train_data[:, range(0, 10)])
con_dev = scaler.transform(dev_data[:, range(0, 10)])
binary = train_data[:, range(10,51)]
bin_dev = dev_data[:, range(10,51)]
scaled_train_data = np.concatenate((continuous, binary), axis=1)
scaled_dev_data = np.concatenate((con_dev, bin_dev), axis=1)

binary_v1 = train_v1_data[:, range(10,55)]
bin_v1_dev = dev_v1_data[:, range(10,55)]
scaled_train_v1_data = np.concatenate((continuous, binary_v1), axis=1)
scaled_dev_v1_data = np.concatenate((con_dev, bin_v1_dev), axis=1)

scaler2 = preprocessing.StandardScaler()
continuous_v2 = scaler2.fit_transform(train_v2_data[:, range(0, 10)])
con_v2_dev = scaler2.transform(dev_v2_data[:, range(0, 10)])
scaled_train_v2_data = np.concatenate((continuous_v2, binary_v1), axis=1)
scaled_dev_v2_data = np.concatenate((con_v2_dev, bin_v1_dev), axis=1)




In [488]:
interactions = preprocessing.PolynomialFeatures(interaction_only=True, include_bias=False)
i_train_data=interactions.fit_transform(scaled_train_data)
i_dev_data=interactions.transform(scaled_dev_data)

interactions1 = preprocessing.PolynomialFeatures(interaction_only=True, include_bias=False)
i_train_data_v1=interactions1.fit_transform(scaled_train_v1_data)
i_dev_data_v1=interactions1.transform(scaled_dev_v1_data)

interactions2 = preprocessing.PolynomialFeatures(interaction_only=True, include_bias=False)
i_train_data_v2=interactions2.fit_transform(scaled_train_v2_data)
i_dev_data_v2=interactions2.transform(scaled_dev_v2_data)

poly = preprocessing.PolynomialFeatures(2)
poly_train_data=poly.fit_transform(scaled_train_data)
poly_dev_data=poly.transform(scaled_dev_data)

poly1 = preprocessing.PolynomialFeatures(2)
poly_train_data_v1=poly1.fit_transform(scaled_train_v1_data)
poly_dev_data_v1=poly1.transform(scaled_dev_v1_data)

poly2 = preprocessing.PolynomialFeatures(2)
poly_train_data_v2=poly2.fit_transform(scaled_train_v2_data)
poly_dev_data_v2=poly2.transform(scaled_dev_v2_data)

In [476]:
def custom_scoring(true,pred):
    tp = 0
    fp = 0
    fn = 0
    for i in range(len(pred)):
        if (true[i] == target_of_interest and pred[i]==target_of_interest):
            tp +=1
        elif (pred[i] == target_of_interest and true[i]!=target_of_interest):
            fp += 1
        elif (true[i] == target_of_interest and pred[i]!=target_of_interest):
            fn +=1
    if tp > 0:
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        F1 = 2 * (precision * recall) / (precision + recall) 
    else: 
        F1 = 0
    return F1

In [214]:
def onefeaturebestforest(X,Y,featureset):
    global target_of_interest
    target_of_interest = featureset[0]
    label_scorer = metrics.make_scorer(custom_scoring, greater_is_better=True)
    df = pd.concat([pd.DataFrame(X),pd.DataFrame(Y,columns=['target'])],axis=1)
    dfsub = df.copy()
    dfsub = dfsub.loc[dfsub['target'].isin(featureset)]
    lab = dfsub['target']
    dfsub.drop(['target'], axis=1,inplace=True)
    params = {'n_estimators': [300],'max_features':[10,12,15,25,35,'auto']}
    rf_clf = GridSearchCV(RandomForestClassifier()
                          ,param_grid=params,scoring=label_scorer)
    rf_clf.fit(dfsub,lab.values)
    print(rf_clf.best_params_)
    print(rf_clf.best_score_)
    importances = rf_clf.best_estimator_.feature_importances_   
    indices = np.argsort(importances)[::-1]
    print(indices)
    return [importances,indices]

The following uses onefeaturebestforest to find the most important features that improve category 1 (first value input in set) f1 score when evaluating only data from categories 1,2, and 7

In [477]:
c1_c2_c7_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(1,2,7)) 

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.7503194914520902
[  1  56  68 ..., 990 991   0]


In [490]:
def num_check_v1(importances,tlabel):
    # This function takes output from onebestforest as importances and a target, tlabel
    # Return the number of features that maximize knn f1 score for tlabel
    scores = []
    #num_features = [5,10,15,20,25,30,35,40,45,50,55,60,70,80,100,160]
    num_features = list(range(8,50))
    for i in num_features:
        train_data = poly_train_data_v1[:, (importances[1][:i])] 
        dev_data = poly_dev_data_v1[:, (importances[1][:i])]
        knn_model = KNeighborsClassifier(n_neighbors=1,metric='braycurtis')
        knn_model.fit(train_data,train_labels)
        y_pred = knn_model.predict(dev_data)
        knn_score = metrics.f1_score(dev_labels, y_pred, average=None)
        scores.append(knn_score[tlabel-1])
    max_value = max(scores)
    nf = num_features[scores.index(max_value)]
    return [max_value,nf]

In [492]:
print(num_check_v1(c1_c2_c7_ofbf_pv1_bt,1))

[0.80000000000000004, 32]


In [493]:
def k_ens_fit(parameters):
    for i in range(len(parameters)):
        impsR,nfeaturesR,impsP,nfeaturesP = parameters[i]
        train_dataR = poly_train_data_v1[:, (impsR[1][:nfeaturesR])] 
        dev_dataR = poly_dev_data_v1[:, (impsR[1][:nfeaturesR])]
        train_dataP = poly_train_data_v1[:, (impsP[1][:nfeaturesP])] 
        dev_dataP = poly_dev_data_v1[:, (impsP[1][:nfeaturesP])]
        
        knn_modelR = KNeighborsClassifier(n_neighbors=1,metric='braycurtis')
        knn_modelR.fit(train_dataR,train_labels)
        y_predR = knn_modelR.predict(dev_dataR)
        
        knn_modelP = KNeighborsClassifier(n_neighbors=1,metric='braycurtis')
        knn_modelP.fit(train_dataP,train_labels)
        y_predP = knn_modelP.predict(dev_dataP)
        if i==0:
            finalsR=y_predR.copy()
            finalsP=y_predP.copy()
        else:
            finalsR = np.vstack((finalsR,y_predR))
            finalsP = np.vstack((finalsP,y_predR))
    dfR=pd.DataFrame(finalsR.T,columns=['one', 'two', 'three','four', 'five', 'six','seven'])
    # Only interested if model predicts target it was designed for, zero other predictions out
    dfR[dfR[['one']]!=1] = 0
    dfR[dfR[['two']]!=2] = 0
    dfR[dfR[['three']]!=3] = 0
    dfR[dfR[['four']]!=4] = 0
    dfR[dfR[['five']]!=5] = 0
    dfR[dfR[['six']]!=6] = 0
    dfR[dfR[['seven']]!=7] = 0
    dfP=pd.DataFrame(finalsP.T,columns=['one', 'two', 'three','four', 'five', 'six','seven'])
    dfP[dfP[['one']]!=1] = 0
    dfP[dfP[['two']]!=2] = 0
    dfP[dfP[['three']]!=3] = 0
    dfP[dfP[['four']]!=4] = 0
    dfP[dfP[['five']]!=5] = 0
    dfP[dfP[['six']]!=6] = 0
    dfP[dfP[['seven']]!=7] = 0
    df = pd.concat([dfR,dfP,dfR,dfP],1) 
    df.replace(0, np.nan,inplace=True)

    # Random forest to fill predictions not made
    rf_model = RandomForestClassifier(n_estimators=400,max_features=25,random_state=13)
    rf_model.fit(train_data,train_labels)

    df['rclf'] = rf_model.predict(dev_data)  
    dev_pred = df.mode(axis=1)
    dev_pred = dev_pred[dev_pred.columns[0]]
    f = dev_pred.astype('int64')
    return f

In [494]:
en_imps = [(c1_c2_c7_ofbf_pv1_bt,35,c1_c2_c7_ofbf_pv1_bt,28), 
           (c2_c1_c3_c5_ofbf_pv1_bt,24,c2_c1_c5_c7_ofbf_pv1_bt,19), #
           (c3_c2_c6_ofbf_pv1_bt,18,c3_c6_ofbf_pv1_bt,15),  #
           (c4_c3_c6_ofbf_pv1_bt,36,c4_c3_ofbf_pv1_bt,40),
           (c5_c2_c6_ofbf_pv1_bt,38,c5_c3_c6_ofbf_pv1_bt,32),
           (c6_c3_ofbf_pv1_bt,18,c6_c2_c3_ofbf_pv1_bt,45),
          (c7_c1_ofbf_pv1_bt,41,c7_c1_ofbf_pv1_bt,13)]

k_ens = k_ens_fit(en_imps)

In [495]:
accuracy = metrics.accuracy_score(dev_labels,k_ens)
f1score = metrics.f1_score(dev_labels,k_ens,average='weighted')
confusion = confusion_matrix(dev_labels,k_ens) 
report = classification_report(dev_labels,k_ens)

print('Accuracy: ', accuracy)
print('F1 Score: ', f1score)
print('Confusion Matrix: \n', confusion)
print('Classification Report: \n',report)

Accuracy:  0.896
F1 Score:  0.893988837834
Confusion Matrix: 
 [[ 98  16   0   0   1   0   6]
 [ 29 109   1   0  12   2   0]
 [  0   1 129   5   1  13   0]
 [  0   0   1 142   0   1   0]
 [  0   1   1   0 127   0   0]
 [  0   0   8   5   0 133   0]
 [  0   0   0   0   0   0 158]]
Classification Report: 
              precision    recall  f1-score   support

          1       0.77      0.81      0.79       121
          2       0.86      0.71      0.78       153
          3       0.92      0.87      0.89       149
          4       0.93      0.99      0.96       144
          5       0.90      0.98      0.94       129
          6       0.89      0.91      0.90       146
          7       0.96      1.00      0.98       158

avg / total       0.90      0.90      0.89      1000



In [496]:
def assess_model(model,train_data,train_labels,dev_data):
    model = model
    model.fit(train_data,train_labels)
    dev_preds = model.predict(dev_data)
    
    accuracy = metrics.accuracy_score(dev_labels,dev_preds)
    f1score = metrics.f1_score(dev_labels,dev_preds,average='weighted')
    confusion = confusion_matrix(dev_labels,dev_preds) 
    report = classification_report(dev_labels,dev_preds)
    
    print('Accuracy: ', accuracy)
    print('F1 Score: ', f1score)
    print('Confusion Matrix: \n', confusion)
    print('Classification Report: \n',report)
    
    return [accuracy,f1score,confusion,report]

In [417]:
c1_c2_c5_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(1,2,5))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.8082112391336336
[  1  56  57 ..., 993 994   0]


In [441]:
c2_c1_c3_c5_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(2,1,5,3))

{'class_weight': 'balanced', 'max_features': 25, 'n_estimators': 300}
0.7238689312774123
[  1  56  68 ..., 996 997   0]


In [287]:
c2_c1_c5_c7_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(2,1,5,7))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.74446713938201
[  1  56  68 ..., 990 991   0]


In [298]:
c4_c3_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(4,3,6))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.9423122285844986
[   1   59   60 ..., 1046 1047    0]


In [307]:
c7_c1_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(7,1))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.9437161236887138
[  56    1   68 ..., 1012 1013    0]


In [308]:
c5_c2_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(5,2))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.9287688884150057
[   1   56   61 ..., 1014 1015    0]


In [425]:
c5_c3_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(5,2,3))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.921005331597835
[   1   56   68 ..., 1014 1015    0]


In [356]:
c5_c2_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(5,2,6))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.9215868272848537
[   1   56   68 ..., 1008 1009    0]


In [357]:
c5_c2_c1_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(5,2,1))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.9210841570550924
[ 56   1  57 ..., 993 994   0]


In [280]:
c3_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(3,6))

{'class_weight': 'balanced', 'max_features': 25, 'n_estimators': 300}
0.8544468559960993
[ 111  325    6 ..., 1046 1047    0]


In [304]:
c6_c3_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(6,3))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.8624742979624106
[ 325  111   65 ..., 1048 1049    0]


In [402]:
c6_c3_c4_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(6,3,4))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.8535587541053087
[   1   59   69 ..., 1048 1049    0]


In [290]:
c3_c2_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(3,2,6))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.8450093910183755
[   1   68   11 ..., 1020 1021    0]


In [430]:
c6_c2_c3_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(6,2,3))

{'class_weight': 'balanced', 'max_features': 'auto', 'n_estimators': 300}
0.8493432993067396
[   1   68   11 ..., 1023 1024    0]


In [291]:
c3_c4_c6_ofbf_pv1_bt=onefeaturebestforest_bt(poly_train_data_v1,train_labels,(3,4,6))

{'class_weight': 'balanced', 'max_features': 35, 'n_estimators': 300}
0.8248112190900774
[ 228   59   56 ..., 1046 1047    0]


In [17]:
#c1_c2_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(1,2))
c1_c2_c7_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(1,2,7))
#c1_c2_c5_c7_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(1,2,7,5))
c1_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(1,2,3,4,5,6,7))

#c2_c1_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(2,1))
c2_c1_c5_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(2,1,5))
c2_c5_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(2,5))
#c2_c1_c3_c5_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(2,1,3,5,6))
c2_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(2,1,3,4,5,6,7))

#c3_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(3,6))
c3_c4_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(3,4,6))
c3_c2_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(3,2,6))
#c3_c2_c4_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(3,2,4,6))
c3_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(3,1,2,4,5,6,7))

c4_c3_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(4,3))
#c4_c3_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(4,3,6))
c4_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(4,1,2,3,5,6,7))

c5_c2_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,2))
c5_c2_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,2,6))
c5_c1_c2_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,2,1))
c5_c1_c2_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,2,1,6))
c5_c1_c2_c3_c6_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,2,1,3,6))
c5_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(5,1,2,3,4,6,7))

c6_c3_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(6,3))
c6_c2_c3_c4_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(6,2,3,4))
c6_c2_c3_c4_c5_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(6,2,3,4,5))
c6_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(6,1,2,3,4,5,7))

c7_c1_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(7,1))
c7_all_ofbf_p=onefeaturebestforest(poly_train_data,train_labels,(7,1,2,3,4,5,6))

{'max_features': 'auto', 'n_estimators': 150}
0.7535796103446443
[ 52   1  64 ..., 905 906   0]
{'max_features': 25, 'n_estimators': 150}
0.7570523383341604
[  1  52  64 ..., 905 906   0]
{'max_features': None, 'n_estimators': 150}
0.7425975324529844
[  1 302  61 ..., 906 907   0]
{'max_features': None, 'n_estimators': 150}
0.9301632424853032
[  1 302  57 ..., 912 914   0]
{'max_features': 'auto', 'n_estimators': 150}
0.7214601071375559
[  1  52  64 ..., 904 905   0]
{'max_features': None, 'n_estimators': 150}
0.842034507848609
[350  55 103 ..., 919 920   0]
{'max_features': None, 'n_estimators': 150}
0.8590683169637988
[  1 103 301 ..., 915 916   0]
{'max_features': None, 'n_estimators': 150}
0.8176296268665495
[  1  52 350 ..., 906 907   0]
{'max_features': None, 'n_estimators': 150}
0.9630095243102461
[ 65 358  57 ..., 921 922   0]
{'max_features': None, 'n_estimators': 150}
0.9454402393027751
[  1  52 350 ..., 906 907   0]
{'max_features': None, 'n_estimators': 150}
0.9334007560840

In [74]:
#c1_c2_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2))
c1_c2_c7_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,7))
#c1_c2_c5_c7_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,7,5))
c1_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,3,4,5,6,7))

#c2_c1_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1))
c2_c1_c5_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,5))
c2_c5_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,5))
#c2_c1_c3_c5_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,3,5,6))
c2_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,3,4,5,6,7))

#c3_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,6))
c3_c4_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,4,6))
c3_c2_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,2,6))
#c3_c2_c4_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,2,4,6))
c3_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,1,2,4,5,6,7))

c4_c3_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,3))
#c4_c3_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,3,6))
c4_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,1,2,3,5,6,7))

c5_c2_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2))
c5_c2_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,6))
c5_c1_c2_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1))
c5_c1_c2_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1,6))
c5_c1_c2_c3_c6_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1,3,6))
c5_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,1,2,3,4,6,7))

c6_c3_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,3))
c6_c2_c3_c4_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,2,3,4))
c6_c2_c3_c4_c5_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,2,3,4,5))
c6_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,1,2,3,4,5,7))

c7_c1_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(7,1))
c7_all_ofbf_pv1=onefeaturebestforest(poly_train_data_v1,train_labels,(7,1,2,3,4,5,6))

{'max_features': 25, 'n_estimators': 150}
0.7545406547216285
[  56    1  110 ..., 1005 1006    0]
{'max_features': 'auto', 'n_estimators': 150}
0.7612856884220641
[   1  110   56 ..., 1005 1006    0]
{'max_features': None, 'n_estimators': 150}
0.743859492022732
[   1   65  326 ..., 1020 1021    0]
{'max_features': 'auto', 'n_estimators': 150}
0.9282285891734952
[   1  110   56 ..., 1031 1032    0]
{'max_features': 25, 'n_estimators': 150}
0.712343696458878
[   1  110   56 ..., 1004 1005    0]
{'max_features': None, 'n_estimators': 150}
0.8455750408115772
[ 378  111  110 ..., 1059 1060    0]
{'max_features': None, 'n_estimators': 150}
0.855876536915733
[   1  111  325 ..., 1052 1053    0]
{'max_features': None, 'n_estimators': 150}
0.821838275190033
[   1   56  378 ..., 1013 1014    0]
{'max_features': None, 'n_estimators': 150}
0.9598275670555071
[ 110  386   72 ..., 1066 1067    0]
{'max_features': None, 'n_estimators': 150}
0.9472027561068626
[   1   56  378 ..., 1015 1016    0]
{'ma

In [15]:
#c1_c2_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(1,2))
c1_c2_c7_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(1,2,7))
#c1_c2_c5_c7_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(1,2,7,5))
c1_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(1,2,3,4,5,6,7))

#c2_c1_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(2,1))
c2_c1_c5_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(2,1,5))
c2_c5_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(2,5))
#c2_c1_c3_c5_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(2,1,3,5,6))
c2_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(2,1,3,4,5,6,7))

#c3_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(3,6))
c3_c4_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(3,4,6))
c3_c2_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(3,2,6))
#c3_c2_c4_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(3,2,4,6))
c3_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(3,1,2,4,5,6,7))

c4_c3_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(4,3))
#c4_c3_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(4,3,6))
c4_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(4,1,2,3,5,6,7))

c5_c2_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,2))
c5_c2_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,2,6))
c5_c1_c2_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,2,1))
c5_c1_c2_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,2,1,6))
c5_c1_c2_c3_c6_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,2,1,3,6))
c5_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(5,1,2,3,4,6,7))

c6_c3_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(6,3))
c6_c2_c3_c4_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(6,2,3,4))
c6_c2_c3_c4_c5_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(6,2,3,4,5))
c6_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(6,1,2,3,4,5,7))

c7_c1_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(7,1))
c7_all_ofbf_pv2=onefeaturebestforest(poly_train_data_v2,train_labels,(7,1,2,3,4,5,6))

{'max_features': 25, 'n_estimators': 150}
0.7510887087933135
[   1  110   56 ..., 1008 1009    0]
{'max_features': 'auto', 'n_estimators': 150}
0.7580437315663723
[ 110    1   56 ..., 1008 1009    0]
{'max_features': None, 'n_estimators': 150}
0.7446540561571048
[   1  326   65 ..., 1025 1026    0]
{'max_features': None, 'n_estimators': 150}
0.9320660122657575
[   1  326   61 ..., 1042 1043    0]
{'max_features': 25, 'n_estimators': 150}
0.7220513030712924
[   1  110   56 ..., 1004 1005    0]
{'max_features': None, 'n_estimators': 150}
0.8381663020527379
[ 378  110  218 ..., 1060 1061    0]
{'max_features': None, 'n_estimators': 150}
0.8542139456956062
[   1  218  325 ..., 1054 1055    0]
{'max_features': None, 'n_estimators': 150}
0.8123500197173819
[   1   56  378 ..., 1013 1014    0]
{'max_features': None, 'n_estimators': 150}
0.9591727007456076
[ 110  386   69 ..., 1067 1068    0]
{'max_features': None, 'n_estimators': 150}
0.9407156671451061
[   1   56  378 ..., 1017 1018    0]
{'

In [182]:
c1_c2_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,7))
c1_c2_c7_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,7))
c1_c2_c5_c7_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,7,5))
#c1_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(1,2,3,4,5,6,7))

c2_c1_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1))
c2_c1_c5_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,5))
c2_c5_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,5))
c2_c1_c3_c5_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,3,5,6))
c2_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(2,1,3,4,5,6,7))

c3_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,6))
c3_c4_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,4,6))
c3_c2_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,2,6))
c3_c2_c4_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,2,4,6))
c3_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(3,1,2,4,5,6,7))

c4_c3_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,3))
c4_c3_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,3,6))
c4_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(4,1,2,3,5,6,7))

c5_c2_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2))
c5_c2_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,6))
c5_c1_c2_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1))
c5_c1_c2_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1,6))
c5_c1_c2_c3_c6_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,2,1,3,6))
c5_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(5,1,2,3,4,6,7))

c6_c3_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,3))
c6_c3_c4_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,3,4))
c6_c2_c3_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,3,2))
c6_c2_c3_c4_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,2,3,4))
c6_c2_c3_c4_c5_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,2,3,4,5))
c6_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(6,1,2,3,4,5,7))

c7_c1_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(7,1))
c7_c1_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(7,1,2))
c7_all_ofbf_pver1=onefeaturebestforest(poly_train_data_v1,train_labels,(7,1,2,3,4,5,6))

{'max_features': 'auto', 'n_estimators': 300}
0.9421646494359116
[  56    1  110 ..., 1019 1020    0]
{'max_features': 'auto', 'n_estimators': 300}
0.7581103223797014
[  56    1  110 ..., 1005 1006    0]
{'max_features': 'auto', 'n_estimators': 300}
0.7606565314848993
[   1   56  110 ..., 1002 1003    0]
{'max_features': 12, 'n_estimators': 300}
0.8036128632259317
[  56    1  110 ..., 1007 1008    0]
{'max_features': 35, 'n_estimators': 300}
0.7431957535107429
[  56    1  110 ..., 1005 1006    0]
{'max_features': 'auto', 'n_estimators': 300}
0.929076028839538
[  56    1  110 ..., 1021 1022    0]
{'max_features': 'auto', 'n_estimators': 300}
0.7201040588169856
[   1  110   56 ..., 1007 1008    0]
{'max_features': 25, 'n_estimators': 300}
0.7215487054026615
[110   1  56 ..., 998 999   0]
{'max_features': 35, 'n_estimators': 300}
0.862285835074686
[ 325  111   61 ..., 1053 1054    0]
{'max_features': 35, 'n_estimators': 300}
0.8327240069628468
[  69  228   56 ..., 1050 1051    0]
{'max_fe