In [20]:
#modules needed 
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
import category_encoders
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

Loading Data

In [40]:
cat_pred=pd.read_csv(os.path.join(os.getcwd(),'cat_train_prob.csv'))
for_pred=pd.read_csv(os.path.join(os.getcwd(),'forest_train_prob.csv'))
for_smal_pred=pd.read_csv(os.path.join(os.getcwd(),'forest_smal_train_prob.csv'))
NN_pred=pd.read_csv(os.path.join(os.getcwd(),'NN_train_prob.csv'))
H2O_1_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_1_train_prob.csv'))
H2O_2_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_2_train_prob.csv'))
H2O_3_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_3_train_prob.csv'))

train=pd.read_csv(os.path.join(os.getcwd(),'train.csv.zip'))

### Metrics

In [41]:
#class_weights={1: 0.37062, 2: 0.49657,3:0.05947,4:0.018,5:0.018,6:0.018,7:0.018}
class_weights={1: 0.37053, 2: 0.49657,3:0.05947,4:0.00106,5:0.01287,6:0.02698,7:0.03238}
list_weight=compute_class_weight(class_weights,np.unique(train['Cover_Type']),train['Cover_Type'])

In [42]:
#columns
target_col='Cover_Type'
soil_type=[x for x in train.columns if 'Soil' in x]
wild_type=[x for x in train.columns if 'Wilderness' in x]
vert_dist=['Vertical_Distance_To_Hydrology']
hor_dist=['Horizontal_Distance_To_Hydrology','Horizontal_Distance_To_Roadways',  'Horizontal_Distance_To_Fire_Points']
hill_feat=['Hillshade_9am', 'Hillshade_Noon','Hillshade_3pm']
other_feat=['Elevation', 'Aspect', 'Slope']

In [43]:
def bal_acc(y_true,y_pred,weights=None):
    #weights should be provided as list of weights for each class to sum up to 1
    temp1=np.unique(y_true,return_counts=True)[1]
    temp2=confusion_matrix(y_true,y_pred)
    acc_bal=np.sum(np.diag(temp2)*(temp1/np.sum(temp1))/temp1)
    if len(weights)==0:
        acc_weig=acc_bal
    elif len(weights)!=len(temp1):
        print('provide weight for each class')
    else:
        acc_weig=np.sum(np.diag(temp2)*weights/temp1)
    return (acc_weig,acc_bal)

### Check individual 

In [44]:
print('cat_acc ',bal_acc(train[target_col].values,(np.argmax(cat_pred.iloc[:,1:].values,1)+1),weights=list_weight))
print('for_acc ',bal_acc(train[target_col].values,(np.argmax(for_pred.iloc[:,1:].values,1)+1),weights=list_weight))
print('NN_acc ',bal_acc(train[target_col].values,(np.argmax(NN_pred.iloc[:,1:].values,1)+1),weights=list_weight))

cat_acc  (0.772510162037037, 0.8607142857142857)
for_acc  (0.786329462962963, 0.8853174603174603)
NN_acc  (0.7926306018518519, 0.8728174603174602)


In [45]:
print('H2O_1_acc ',bal_acc(train[target_col].values,(np.argmax(H2O_1_pred.iloc[:,2:].values,1)+1),weights=list_weight))
print('H2O_2_acc ',bal_acc(train[target_col].values,(np.argmax(H2O_2_pred.iloc[:,2:].values,1)+1),weights=list_weight))
print('H2O_3_acc ',bal_acc(train[target_col].values,(np.argmax(H2O_3_pred.iloc[:,2:].values,1)+1),weights=list_weight))

H2O_1_acc  (0.7762913379629628, 0.8805555555555555)
H2O_2_acc  (0.7755452962962962, 0.879100529100529)
H2O_3_acc  (0.7745515370370372, 0.8794973544973544)


In [46]:
temp=cat_pred.iloc[:,1:].values+for_pred.iloc[:,1:].values+NN_pred.iloc[:,1:].values+H2O_1_pred.iloc[:,2:].values+H2O_2_pred.iloc[:,2:].values

In [47]:
print('mean_acc ',bal_acc(train[target_col].values,(np.argmax(temp,1)+1),weights=list_weight))

mean_acc  (0.801738449074074, 0.89265873015873)


In [48]:
X_data=np.hstack([cat_pred.iloc[:,1:].values,for_pred.iloc[:,1:].values,for_smal_pred.iloc[:,1:].values,NN_pred.iloc[:,1:].values,H2O_1_pred.iloc[:,2:].values,H2O_2_pred.iloc[:,2:].values,H2O_3_pred.iloc[:,2:].values])

### Tuning RF

In [49]:
from sklearn.ensemble import RandomForestClassifier

In [50]:
#first I split the data
X_train, X_val, y_train, y_val = train_test_split(X_data, train[target_col], stratify=train[target_col],  test_size=0.2,random_state=42)

In [51]:
#one more time but this version will be needed for cross validation
from sklearn.metrics import make_scorer
def bal_acc2(y_true,y_pred):
    #weights should be provided as list of weights for each class to sum up to 1
    temp1=np.unique(y_true,return_counts=True)[1]
    temp2=confusion_matrix(y_true,y_pred)
    acc_bal=np.sum(np.diag(temp2)*(temp1/np.sum(temp1))/temp1)
    weights=[0.37053, 0.49657, 0.05947, 0.00106, 0.01287, 0.02698, 0.03238]
    acc_weig=np.sum(np.diag(temp2)*weights/temp1)
    return (acc_weig)

my_score = make_scorer(bal_acc2)

In [52]:
def Bayes_RandomTrees2(n_estimators, max_depth,max_features):
    
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    assert type(n_estimators) == int
    assert type(max_depth) == int

    #etc =RandomForestClassifier(n_estimators = n_estimators,max_features = max_features,bootstrap = False,max_depth=max_depth,verbose=0,n_jobs=-1,random_state=42)
    etc =RandomForestClassifier(n_estimators = n_estimators,max_features = max_features,bootstrap = False,max_depth=max_depth,verbose=0,n_jobs=-1,class_weight=class_weights,random_state=42)
    etc.fit(X_train, y_train)
    
    score = bal_acc2(y_val, etc.predict(X_val))
    
    return score

In [59]:
params = {'n_estimators' : (80, 400),
          'max_depth' : (5, 50),
          'max_features' : (.05,0.9)} 

RandomTreeBO = BayesianOptimization(Bayes_RandomTrees2, params, random_state = 42)

print(RandomTreeBO.space.keys)


['max_depth', 'max_features', 'n_estimators']


In [60]:
init_points = 50
n_iter = 100

In [61]:
RandomTreeBO.maximize(init_points = init_points,  n_iter = n_iter,   acq = 'ucb',    xi = 0.0,        alpha = 1e-6)

|   iter    |  target   | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.7876  [0m | [0m 21.85   [0m | [0m 0.8581  [0m | [0m 314.2   [0m |
| [95m 2       [0m | [95m 0.796   [0m | [95m 31.94   [0m | [95m 0.1826  [0m | [95m 129.9   [0m |
| [95m 3       [0m | [95m 0.8237  [0m | [95m 7.614   [0m | [95m 0.7862  [0m | [95m 272.4   [0m |
| [0m 4       [0m | [0m 0.7914  [0m | [0m 36.86   [0m | [0m 0.0675  [0m | [0m 390.4   [0m |
| [0m 5       [0m | [0m 0.8044  [0m | [0m 42.46   [0m | [0m 0.2305  [0m | [0m 138.2   [0m |
| [0m 6       [0m | [0m 0.8171  [0m | [0m 13.25   [0m | [0m 0.3086  [0m | [0m 247.9   [0m |
| [0m 7       [0m | [0m 0.8016  [0m | [0m 24.44   [0m | [0m 0.2975  [0m | [0m 275.8   [0m |
| [0m 8       [0m | [0m 0.8237  [0m | [0m 11.28   [0m | [0m 0.2983  [0m | [0m 197.2   [0m |
| [0m 9       [0m | [0m 0.7989  [0m | [0m 25.

| [0m 80      [0m | [0m 0.8162  [0m | [0m 13.31   [0m | [0m 0.06652 [0m | [0m 119.7   [0m |
| [0m 81      [0m | [0m 0.8267  [0m | [0m 5.024   [0m | [0m 0.2975  [0m | [0m 170.7   [0m |
| [0m 82      [0m | [0m 0.816   [0m | [0m 9.638   [0m | [0m 0.8925  [0m | [0m 299.2   [0m |
| [0m 83      [0m | [0m 0.7972  [0m | [0m 49.79   [0m | [0m 0.1389  [0m | [0m 379.0   [0m |
| [0m 84      [0m | [0m 0.803   [0m | [0m 5.091   [0m | [0m 0.06611 [0m | [0m 279.6   [0m |
| [0m 85      [0m | [0m 0.7995  [0m | [0m 49.79   [0m | [0m 0.1086  [0m | [0m 299.7   [0m |
| [0m 86      [0m | [0m 0.7926  [0m | [0m 37.01   [0m | [0m 0.05788 [0m | [0m 290.6   [0m |
| [0m 87      [0m | [0m 0.7987  [0m | [0m 49.9    [0m | [0m 0.245   [0m | [0m 194.0   [0m |
| [0m 88      [0m | [0m 0.7971  [0m | [0m 49.81   [0m | [0m 0.1575  [0m | [0m 80.05   [0m |
| [0m 89      [0m | [0m 0.8093  [0m | [0m 7.637   [0m | [0m 0.05066 [0m | 

In [66]:
print(RandomTreeBO.max)

{'target': 0.8312958564814816, 'params': {'max_depth': 6.547483450184828, 'max_features': 0.8229223417669648, 'n_estimators': 162.8095941120054}}


### Sample fitting

In [62]:
forest=RandomForestClassifier(n_estimators = 300,max_features = 0.9,bootstrap = False,max_depth=8,class_weight=class_weights)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False,
                       class_weight={1: 0.37053, 2: 0.49657, 3: 0.05947,
                                     4: 0.00106, 5: 0.01287, 6: 0.02698,
                                     7: 0.03238},
                       criterion='gini', max_depth=8, max_features=0.9,
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=300, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [63]:
#the accuracy of validation
print("special acc is: ",bal_acc(y_train,forest.predict(X_train),weights=list_weight))

special acc is:  (0.9157917476851852, 0.7321428571428571)


In [64]:
predict=forest.predict(X_val)

In [65]:
#the accuracy of validation
print("special acc is: ",bal_acc(y_val,predict,weights=list_weight))

special acc is:  (0.8186674305555555, 0.689484126984127)


Results are: special acc is:  (0.8186674305555555, 0.689484126984127)

In [67]:
forest_2=RandomForestClassifier(n_estimators = 300,max_features = 0.9,bootstrap = False,max_depth=8,class_weight=class_weights)

In [69]:
CrossValidationscores = cross_val_score(forest_2, X_data, train[target_col], cv=5, scoring = my_score)

In [70]:
np.mean(CrossValidationscores)

0.8202221712962963

CV of 5+1 model 0.8202221712962963


Now adding the Gaussian Mixture

In [220]:
test=pd.read_csv(os.path.join(os.getcwd(),'test.csv.zip'))

In [93]:
from sklearn.mixture import GaussianMixture
features_cluster=soil_type+wild_type+hor_dist+['Elevation']
gmix = GaussianMixture(n_components=11)
gmix.fit(test[features_cluster])

temp_train_cluster = gmix.predict(train[features_cluster])
temp_test_cluster = gmix.predict(test[features_cluster])

In [221]:
#X_data_3=np.hstack([cat_pred.iloc[:,1:].values,for_pred.iloc[:,1:].values,NN_pred.iloc[:,1:].values])
#X_data_3=np.hstack([X_data,np.expand_dims(temp_train_cluster,-1)])
X_data_3=X_data

In [222]:
forest_3=RandomForestClassifier(n_estimators = 300,max_features = 'auto',bootstrap = False,max_depth=7,class_weight=class_weights)

In [223]:
CrossValidationscores_2 = cross_val_score(forest_3, X_data_3, train_new[target_col], cv=5, scoring = my_score)

In [224]:
np.mean(CrossValidationscores_2)

0.828467412037037

### Making final prediction with bagging

In [78]:
cat_pred_test=pd.read_csv(os.path.join(os.getcwd(),'cat_test_prob.csv'))
for_pred_test=pd.read_csv(os.path.join(os.getcwd(),'forest_test_prob.csv'))
for_smal_pred_test=pd.read_csv(os.path.join(os.getcwd(),'forest_smal_test_prob.csv'))
NN_pred_test=pd.read_csv(os.path.join(os.getcwd(),'NN_test_prob.csv'))
H2O_1_test_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_1_test_prob.csv'))
H2O_2_test_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_2_test_prob.csv'))
H2O_3_test_pred=pd.read_csv(os.path.join(os.getcwd(),'h20_3_test_prob.csv'))
X_test=np.hstack([cat_pred_test.iloc[:,1:].values,for_pred_test.iloc[:,1:].values,for_smal_pred_test.iloc[:,1:].values,NN_pred_test.iloc[:,1:].values,H2O_1_test_pred.iloc[:,2:].values,H2O_2_test_pred.iloc[:,2:].values,H2O_3_test_pred.iloc[:,2:].values])
#X_test=np.hstack([X_test,np.expand_dims(temp_test_cluster,-1)])
test=pd.read_csv(os.path.join(os.getcwd(),'test.csv.zip'))

In [72]:
def train_forest(data_train,y_train,data_val,bagg=10):
    temp_result=[]
    for i in range(bagg):
        forest_model=RandomForestClassifier(n_estimators = 300,max_features = 0.9,bootstrap = False,max_depth=8,class_weight=class_weights,verbose=0,random_state=i*10)
        #forest_model=RandomForestClassifier(n_estimators = 100,max_features = 'auto',bootstrap = False,max_depth=87,verbose=0,class_weight=class_weights,random_state=i*10)
        forest_model.fit(data_train, y_train)
        temp_result.append(forest_model.predict_proba(data_val))
        print(i,' fit finished')
    return np.mean(temp_result,axis=0)

In [74]:
forest_full=train_forest(X_data,train[target_col],X_test,bagg=10)

0  fit finished
1  fit finished
2  fit finished
3  fit finished
4  fit finished
5  fit finished
6  fit finished
7  fit finished
8  fit finished
9  fit finished


In [75]:
pred_test=np.argmax(forest_full,axis=1)+1

In [79]:
sub=pd.DataFrame(test.Id.values,columns=['ID'])
sub['Cover_Type']=pred_test
sub.to_csv('merged_forest_tuned.csv',index=False)

Result 0.83316