## 3.1 Imports

In [131]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import BorderlineSMOTE


## 3.2 Load The  Forest Cover Type Data

In [64]:
covtype = pd.read_csv('../data/covtype_step3_features.csv')
covtype.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,interact_Hillshade_9am3pm,interact_Hillshade_9amNoon,interact_Hillshade_3pmNoon,Euclidean_Distance_To_Hydrology
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,5,32708,51272,34336,258.0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,5,33220,51700,35485,212.084889
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,2,31590,55692,32130,275.769832
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,2,29036,56644,29036,269.235956
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,5,33000,51480,35100,153.003268


In [65]:
X = covtype.loc[:,covtype.columns != 'Cover_Type']
y = covtype.loc[:,covtype.columns == 'Cover_Type']

## 3.3 Train/Test Split

In [79]:
#split data into trainning set, validation set, and test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state = 42,stratify = y)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size = 0.2,random_state  = 42,stratify = y_train)


In [80]:
X_train.shape

(371847, 58)

## Target Encoding On Categorical variable

In [195]:
def onehot_to_label(X,feature_name):   
    '''transform the one hot encoding columns to a label column'''

    X = X.iloc[:,X.columns.str.contains(pat = feature_name + '.*')] 
    Label = X.apply(lambda row : row.argmax(), axis = 1)
    return Label

In [187]:
def target_encoding(X,feature):
    label = onehot_to_label(df,feature)
    target = df.Cover_type 
    encoded_features = pd.DataFrame({feature:label,'target':target})
    
    #total 7 cover types
    n = 7
    for i in range(n):
        encoded_feature_name = 'Encoded_' + feature + str(i + 1)
        encoded_features[encoded_feature_name] = np.array(target == (i+1)).astype('int')
        posterior_prob = encoded_features[[encoded_feature_name,feature]] \
                                .groupby(feature,as_index = False) \
                                .mean() 
        encoded_features.drop(columns  = encoded_feature_name,inplace = True)
        encoded_features  = encoded_features.merge(posterior_prob,on = feature, how = 'left')
        
    return encoded_features


    
    

In [None]:
target_encoding(X_train)

In [196]:
onehot_to_label(X_train,'Wilderness')

195730    0
489741    2
505525    1
258619    0
257402    0
         ..
93082     0
31542     0
44641     0
548589    2
316058    2
Length: 371847, dtype: int64

## 3.4 Scale The Data

In [81]:
#extract categorical indice because we dont want to scale categorical features
cat_index = X.columns.str.contains('Wilderness_Area|Soil_Type',regex=True)  

In [82]:
#standardize trianning set

cat_features = X_train.loc[:,cat_index]
noncat_features = X_train.loc[:,~cat_index]
scaler = StandardScaler()
scaler.fit(noncat_features)
scaled_noncat_features = scaler.transform(noncat_features)

#concatenate the scaled numeric features and categorical features
X_train = pd.concat([pd.DataFrame(scaled_noncat_features,columns = X.columns[~cat_index],index = cat_features.index),
                     cat_features],axis = 1)


In [83]:
#standardize validation set

cat_features = X_val.loc[:,cat_index]
noncat_features = X_val.loc[:,~cat_index]
scaled_noncat_features = scaler.transform(noncat_features)
X_val = pd.concat([pd.DataFrame(scaled_noncat_features,columns = X.columns[~cat_index],index = cat_features.index),
                     cat_features],axis = 1)

In [84]:
#standardize test set

cat_features = X_test.loc[:,cat_index]
noncat_features = X_test.loc[:,~cat_index]
scaled_noncat_features = scaler.transform(noncat_features)
X_test = pd.concat([pd.DataFrame(scaled_noncat_features,columns = X.columns[~cat_index],index = cat_features.index),
                     cat_features],axis = 1)

## 3.5 Perform Oversampling On the Data

In [87]:
#only oversample the trainning set and keep the validation, test set unchanged
sm = BorderlineSMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [110]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_res,y_train_res.values.ravel())

RandomForestClassifier(random_state=42)

In [112]:
y_val_hat = clf.predict(X_val)

In [119]:
from sklearn.metrics import classification_report, confusion_matrix, plot_roc_curve


print(classification_report(y_val_hat,y_val.values.ravel()))

              precision    recall  f1-score   support

           1       0.95      0.94      0.95     34225
           2       0.94      0.96      0.95     44506
           3       0.96      0.91      0.93      6012
           4       0.87      0.87      0.87       439
           5       0.87      0.83      0.85      1595
           6       0.89      0.86      0.88      2875
           7       0.96      0.96      0.96      3310

    accuracy                           0.94     92962
   macro avg       0.92      0.90      0.91     92962
weighted avg       0.94      0.94      0.94     92962



In [117]:
y_val_prob_hat = clf.predict_proba(X_val)