## 3.1 Imports

In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


## 3.2 Load The  Forest Cover Type Data

In [9]:
covtype = pd.read_csv('../data/covtype_step2_features.csv')
covtype.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,interact_Hillshade_9am3pm,interact_Hillshade_9amNoon,interact_Hillshade_3pmNoon,Euclidean_Distance_To_Hydrology
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,5,32708,51272,34336,258.0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,5,33220,51700,35485,212.084889
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,2,31590,55692,32130,275.769832
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,2,29036,56644,29036,269.235956
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,5,33000,51480,35100,153.003268


In [10]:
X = covtype.loc[:, covtype.columns != 'Cover_Type']
y = covtype.loc[:, covtype.columns == 'Cover_Type']

## 3.3 Train/Test Split

In [11]:
#split data into trainning set, validation set, and test set
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 42, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state  = 42, stratify = y_train)

In [12]:
X_train.shape

(371847, 58)

## 3.4 Target Encoding On Categorical variable

Our data set has categorical features that are one-hot encoded,where soil type consists of 40 types and wilderness area consists of 4 types. The higher dimension of the feature matrix sometimes may cause the algorithm having a hard time to learn form the data, so we also want to explore if target encoding the categorical features will results in a better outcome.  

In [13]:
def onehot_to_label(X, cat_name):   
    '''transform the one hot encoding columns to a label column'''
    
    X = X.iloc[:, X.columns.str.contains(pat = cat_name + '.*')] 
    Label = X.apply(lambda row : row.argmax(), axis = 1)
    return Label

In [14]:
def target_encoding(X, target, cat_name):
    '''return the posterior probility of a cover type given a sample belonging to a certain category'''
    
    label = onehot_to_label(X, cat_name)
    label_target = pd.DataFrame({cat_name: label, 'target': target.Cover_Type})
    posterior_prob = pd.DataFrame(index = range(label.nunique()))
    
    #total 7 cover types
    n = 7
    for i in range(1, n+1):
        label_target['ith_Covtype'] = np.array(target == i).astype('int')
        encoded_feature = 'PProb_CovType' + str(i) + '|' + cat_name
        posterior_prob[encoded_feature] = label_target[['ith_Covtype', cat_name]
                                                      ].groupby(cat_name).mean() 

        label_target.drop(columns  = 'ith_Covtype', inplace = True)  
        
    return posterior_prob
 

In [15]:
def merge_encoded_features(X, encoded_features, cat_name):
    '''replace the one-hot encoded features by target encoded features'''
    X_copy = X.copy()
    X_copy[cat_name] = onehot_to_label(X_copy, cat_name)
    new_X = X_copy.merge(encoded_features, how = 'left', left_on = cat_name, right_index = True
                        ).drop(columns = X_copy.columns[X_copy.columns.str.contains('^' + cat_name)])
    return new_X
    

### 3.4.1 Target Encoding On Soil Type

In [16]:
#each cell represents the posterior probability of a cover type given the soil type,eg:encoded_SoilType.iloc[0,1] is 
#the posterior probability of a sample belonging to Cover type 1 given it belonging to soil type 
encoded_SoilType = target_encoding(X_train, y_train, 'Soil_Type')
encoded_SoilType.head()

Unnamed: 0,PProb_CovType1|Soil_Type,PProb_CovType2|Soil_Type,PProb_CovType3|Soil_Type,PProb_CovType4|Soil_Type,PProb_CovType5|Soil_Type,PProb_CovType6|Soil_Type,PProb_CovType7|Soil_Type
0,0.0,0.0,0.690814,0.059318,0.0,0.249869,0.0
1,0.0,0.112611,0.66027,0.014973,0.034795,0.177351,0.0
2,0.0,0.248967,0.493482,0.213355,0.0,0.044197,0.0
3,0.01448,0.264039,0.605137,0.012843,0.047973,0.049232,0.006296
4,0.0,0.0,0.611969,0.033784,0.0,0.354247,0.0


In [18]:
#add the target encoded soil type feature to the feature matrix
#for validation set and test set, we transfer the categorical features using the benchmarks from the tranning set,
#in order to avoid data leakage
X_train_target = merge_encoded_features(X_train, encoded_SoilType, 'Soil_Type')
X_val_target = merge_encoded_features(X_val, encoded_SoilType, 'Soil_Type')
X_test_target = merge_encoded_features(X_test, encoded_SoilType, 'Soil_Type')

### 3.4.2 Target Encoding On Wilderness Area

In [19]:
#each cell represents the posterior probability of a cover type given the wilderness area,
#eg:encoded_WildernessArea.iloc[0,1] is the posterior probability of a sample belonging to Cover type 1 given it is
#in wilderness area 0.
encoded_WildernessArea = target_encoding(X_train, y_train, 'Wilderness_Area')
encoded_WildernessArea

Unnamed: 0,PProb_CovType1|Wilderness_Area,PProb_CovType2|Wilderness_Area,PProb_CovType3|Wilderness_Area,PProb_CovType4|Wilderness_Area,PProb_CovType5|Wilderness_Area,PProb_CovType6|Wilderness_Area,PProb_CovType7|Wilderness_Area
0,0.405578,0.560433,0.0,0.0,0.014547,0.0,0.019442
1,0.620827,0.301871,0.0,0.0,0.0,0.0,0.077302
2,0.345674,0.494063,0.056129,0.0,0.022481,0.029854,0.051799
3,0.0,0.080725,0.580725,0.074188,0.0,0.264361,0.0


In [20]:
#add the target encoded wilderness area feature to the feature matrix
X_train_target = merge_encoded_features(X_train_target, encoded_WildernessArea, 'Wilderness_Area')
X_val_target = merge_encoded_features(X_val_target, encoded_WildernessArea, 'Wilderness_Area')
X_test_target = merge_encoded_features(X_test_target, encoded_WildernessArea, 'Wilderness_Area')

In [21]:
X_train_target.columns

Index(['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'interact_Hillshade_9am3pm',
       'interact_Hillshade_9amNoon', 'interact_Hillshade_3pmNoon',
       'Euclidean_Distance_To_Hydrology', 'PProb_CovType1|Soil_Type',
       'PProb_CovType2|Soil_Type', 'PProb_CovType3|Soil_Type',
       'PProb_CovType4|Soil_Type', 'PProb_CovType5|Soil_Type',
       'PProb_CovType6|Soil_Type', 'PProb_CovType7|Soil_Type',
       'PProb_CovType1|Wilderness_Area', 'PProb_CovType2|Wilderness_Area',
       'PProb_CovType3|Wilderness_Area', 'PProb_CovType4|Wilderness_Area',
       'PProb_CovType5|Wilderness_Area', 'PProb_CovType6|Wilderness_Area',
       'PProb_CovType7|Wilderness_Area'],
      dtype='object')

## 3.5 Scale The Data

### 3.5.1 Scale Data With One Hot Encoded Features

In [22]:
#extract categorical indice because we dont want to scale one hot encoded features
cat_index = X.columns.str.contains('Wilderness_Area|Soil_Type', regex=True)  

In [23]:
#standardize the training set
cat_features = X_train.loc[:, cat_index]
noncat_features = X_train.loc[:, ~cat_index]

scaler = StandardScaler()
scaler.fit(noncat_features)
scaled_noncat_features = scaler.transform(noncat_features)
#concatenate the scaled numeric features and categorical features
X_train_onehot = pd.concat([pd.DataFrame(scaled_noncat_features,
                                         columns = X.columns[~cat_index],
                                         index = cat_features.index
                                        ),cat_features], axis = 1)


#standardize validation set
cat_features = X_val.loc[:, cat_index]
noncat_features = X_val.loc[:, ~cat_index]
scaled_noncat_features = scaler.transform(noncat_features)
#concatenate the scaled numeric features and categorical features
X_val_onehot = pd.concat([pd.DataFrame(scaled_noncat_features,
                                       columns = X.columns[~cat_index],
                                       index = cat_features.index
                                      ), cat_features], axis = 1)


#standardize test set
cat_features = X_test.loc[:, cat_index]
noncat_features = X_test.loc[:, ~cat_index]
scaled_noncat_features = scaler.transform(noncat_features)
#concatenate the scaled numeric features and categorical features
X_test_onehot = pd.concat([pd.DataFrame(scaled_noncat_features,
                                        columns = X.columns[~cat_index],
                                        index = cat_features.index
                                       ), cat_features], axis = 1)

### 3.5.2 Scale Data With Target Encoded Features

In [24]:
#store column names for data storage in the future
col_names_target = X_train_target.columns

#standardize trianning set
scaler = StandardScaler()
scaler.fit(X_train_target)
X_train_target = scaler.transform(X_train_target)

#standardize validation set
X_val_target = scaler.transform(X_val_target)

#standardize validation set
X_test_target = scaler.transform(X_test_target)

## 3.6 Perform Oversampling On the Data

In order to address the issue of class imblance, we perform oversampling on the trainning set, and
keep validation set and test set intact.

In [25]:
#oversampling data with one hot encoded features
sm1 = BorderlineSMOTE(random_state=42)
X_train_onehot_res, y_train_onehot_res = sm1.fit_resample(X_train_onehot, y_train)

#oversample data with target encoded features 
sm2 = BorderlineSMOTE(random_state=42)
X_train_target_res, y_train_target_res = sm2.fit_resample(X_train_target, y_train)

## 3.7 Compare One Hot Encoding and Target Encoding 

In [26]:
#using one hot encoded features to fit a random forest model
clf1 = RandomForestClassifier(random_state=42, n_jobs = -1)
clf1.fit(X_train_onehot_res, y_train_onehot_res.values.ravel())
pred_y_val1 = clf1.predict(X_val_onehot)

print(classification_report(pred_y_val1, y_val.values.ravel()))

              precision    recall  f1-score   support

           1       0.95      0.94      0.95     34225
           2       0.94      0.96      0.95     44506
           3       0.96      0.91      0.93      6012
           4       0.87      0.87      0.87       439
           5       0.87      0.83      0.85      1595
           6       0.89      0.86      0.88      2875
           7       0.96      0.96      0.96      3310

    accuracy                           0.94     92962
   macro avg       0.92      0.90      0.91     92962
weighted avg       0.94      0.94      0.94     92962



In [27]:
#using one target encoded features to fit a random forest model
clf2 = RandomForestClassifier(random_state=42, n_jobs = -1)
clf2.fit(X_train_target_res,  y_train_target_res.values.ravel())
pred_y_val2 = clf2.predict(X_val_target)

print(classification_report(pred_y_val2, y_val.values.ravel()))

              precision    recall  f1-score   support

           1       0.96      0.95      0.95     34178
           2       0.95      0.97      0.96     44579
           3       0.96      0.92      0.94      5988
           4       0.87      0.88      0.88       437
           5       0.88      0.83      0.85      1619
           6       0.90      0.88      0.89      2840
           7       0.97      0.96      0.96      3321

    accuracy                           0.95     92962
   macro avg       0.93      0.91      0.92     92962
weighted avg       0.95      0.95      0.95     92962



The model fitting on target encoded features comprehensively outperforms the one fitting on encoded features. Most of the socres in precision, recall, f1-score have a 1% improvement, so we will keep the the feature matrix containing target encode features for our future modeling.

## 3.8 Save Data

In [28]:
#save trainning set
train_new = pd.DataFrame(X_train_target_res, columns = col_names_target).join(y_train_target_res)
datapath = '../data'
train_new.to_csv(datapath + '/training_step3.csv', index=False)

#save validation set
val_new = pd.DataFrame(X_val_target, columns = col_names_target, index = y_val.index).join(y_val)
val_new.to_csv(datapath + '/validation_step3.csv', index=False)

#save validation set
test_new = pd.DataFrame(X_test_target, columns = col_names_target, index = y_test.index).join(y_test)
test_new.to_csv(datapath + '/test_step3.csv', index=False)
