# Lab | Random Forests


### Instructions

- Apply the Random Forest algorithm, but this time deal with the class imbalance only by upsampling the data.
- Use the feature selection techniques you have learned in class (Variance Threshold, RFE, PCA, etc.) to decide whether you want to use all of the features.
- Re-run the Random Forest algorithm to determine if the Feature Selection has improved the results.
- Discuss the output and its impact in the business scenario. Is the cost of a false positive equal to the cost of a false negative? How would you change your algorithm or data in order to maximize the return for the business?


In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
# Read the three CSV files that are combined into one dataset in the next cell.
numerical, categorical, target = map(
    pd.read_csv,
    ('numerical.csv', 'categorical.csv', 'target.csv'),
)

In [3]:
# Stitch the three frames together side by side into a single table, then preview it.
data = pd.concat(objs=[numerical, categorical, target], axis='columns')
data

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,TARGET_B,TARGET_D
0,0,60.000000,5,9,0,0,39,34,18,10,...,92,8,94,2,95,12,89,11,0,0.0
1,1,46.000000,6,9,16,0,15,55,11,6,...,93,10,95,12,95,12,93,10,0,0.0
2,1,61.611649,3,1,2,0,20,29,33,6,...,91,11,92,7,95,12,90,1,0,0.0
3,0,70.000000,1,4,2,0,23,14,31,3,...,87,11,94,11,95,12,87,2,0,0.0
4,0,78.000000,3,2,60,1,28,9,53,26,...,93,10,96,1,96,1,79,3,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,96,2,96,2,96,2,96,2,0,0.0
95408,1,48.000000,7,9,1,0,31,43,19,4,...,96,3,96,3,96,3,96,3,0,0.0
95409,1,60.000000,5,9,0,0,18,46,20,7,...,96,3,95,1,96,10,94,10,0,0.0
95410,0,58.000000,7,9,0,0,28,35,20,9,...,90,11,96,8,97,1,86,12,1,18.0


In [4]:
# TARGET_B is the label to predict; both target columns are removed from the feature matrix.
X = data.drop(columns=['TARGET_B', 'TARGET_D'])
y = data['TARGET_B']

In [5]:
# Note: we need to do the train/test split before resampling, and then upsample only the training set

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [7]:
# Separate the numeric columns from the object-typed columns, for both splits.
numericalX_train, numericalX_test = (
    split.select_dtypes(include=np.number) for split in (X_train, X_test)
)
categoricalX_train, categoricalX_test = (
    split.select_dtypes(include=object) for split in (X_train, X_test)
)

In [8]:
# One-hot encode the categorical columns so the same dataset can also be used
# for a regression later in the lab.
from sklearn.preprocessing import OneHotEncoder

# The encoder is fitted on the training split only and then applied to both splits.
encoder = OneHotEncoder(drop='first')
encoder.fit(categoricalX_train)

# The encoded matrix is converted to a dense array and wrapped in a DataFrame;
# passing the generated column names is needed to avoid an error (per the original note).
encoded_categorical_train = pd.DataFrame(
    encoder.transform(categoricalX_train).toarray(),
    columns=encoder.get_feature_names_out(),
)
encoded_categorical_test = pd.DataFrame(
    encoder.transform(categoricalX_test).toarray(),
    columns=encoder.get_feature_names_out(),
)


In [9]:
# Min-max scale the numeric columns so the same dataset can also be used for a regression later.
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only and then applied to both splits.
transformer = MinMaxScaler()
transformer.fit(numericalX_train)

# Both frames reuse the training column names.
scaled_numerical_train = pd.DataFrame(
    transformer.transform(numericalX_train),
    columns=numericalX_train.columns,
)
scaled_numerical_test = pd.DataFrame(
    transformer.transform(numericalX_test),
    columns=numericalX_train.columns,
)

In [10]:
# Rebuild the feature matrices: scaled numeric columns first, then the one-hot columns.
X_train = pd.concat(objs=[scaled_numerical_train, encoded_categorical_train], axis='columns')
X_test = pd.concat(objs=[scaled_numerical_test, encoded_categorical_test], axis='columns')

In [11]:
# Temporarily attach the labels to the features so that rows can be resampled together.
# y_train's index is reset first so that it lines up with X_train's 0..n-1 row index.
trainset = X_train.join(y_train.reset_index(drop=True))
trainset

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,TARGET_B
0,0.000017,0.762887,0.500000,0.666667,0.008299,0.000000,0.313131,0.101010,0.686869,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.000000,0.536082,0.666667,1.000000,0.000000,0.000000,0.292929,0.242424,0.383838,0.070707,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
2,0.000017,0.608247,0.666667,0.111111,0.020747,0.000000,0.424242,0.161616,0.626263,0.101010,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.000017,0.783505,0.833333,0.666667,0.037344,0.010101,0.404040,0.232323,0.414141,0.080808,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,0.000052,0.556701,0.666667,0.222222,0.087137,0.333333,0.272727,0.292929,0.181818,0.121212,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76324,0.000000,0.711340,0.333333,1.000000,0.020747,0.020202,0.373737,0.101010,0.323232,0.262626,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
76325,0.000034,0.670103,0.333333,0.666667,0.000000,0.000000,0.363636,0.111111,0.626263,0.040404,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
76326,0.000017,0.624862,0.666667,1.000000,0.000000,0.000000,0.353535,0.292929,0.424242,0.171717,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
76327,0.000017,0.773196,0.666667,0.333333,0.004149,0.000000,0.595960,0.222222,0.313131,0.030303,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0


In [12]:
# Oversample the TARGET_B == 1 rows (with replacement) until there are as many
# of them as TARGET_B == 0 rows, then stack both groups into one training frame.
from sklearn.utils import resample

category_0 = trainset[trainset['TARGET_B'] == 0]
category_1 = trainset[trainset['TARGET_B'] == 1]

# Sampling with replacement is random: pin the seed so the balanced training set
# (and every score computed from it) is the same after Restart & Run All.
category_1_oversampled = resample(category_1,
                                  replace=True,
                                  n_samples=len(category_0),
                                  random_state=42)
train_upsampled = pd.concat([category_1_oversampled, category_0], axis=0)

In [13]:
# Sanity check: both classes should now have the same number of rows.
train_upsampled.loc[:, 'TARGET_B'].value_counts()

1    72486
0    72486
Name: TARGET_B, dtype: int64

In [14]:
# (rows, columns) of the upsampled training frame; the TARGET_B column is still included here.
train_upsampled.shape

(144972, 355)

In [15]:
# Split the balanced frame back into labels and features.
y_train = train_upsampled['TARGET_B']
X_train = train_upsampled.drop(columns=['TARGET_B'])

In [16]:
# Apply the Random Forest algorithm, this time dealing with the imbalance only by upsampling the training data

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Shallow, regularised forest trained on the upsampled training set (all features).
clf = RandomForestClassifier(
    random_state=42,
    max_depth=5,           # maximum depth of each tree
    min_samples_split=20,  # minimum rows a node must hold before it may be split
    min_samples_leaf=20,   # minimum rows that must end up in every leaf
    max_samples=0.8,       # fraction of the training rows drawn for each tree
)
clf.fit(X_train, y_train)

# Mean accuracy on the (upsampled) training set, then on the untouched test set.
print(clf.score(X_train, y_train), clf.score(X_test, y_test), sep='\n')

# Class balance of the test labels next to the confusion matrix
# (rows = actual class, columns = predicted class).
y_pred = clf.predict(X_test)
display(y_test.value_counts(), confusion_matrix(y_test, y_pred))

0.6193816737025081
0.6064036053031494


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[11021,  7062],
       [  449,   551]])

In [18]:
# USE PCA HERE

In [19]:
from sklearn.decomposition import PCA

In [22]:
# Project the features onto 80 principal components.
# The projection is fitted on the training set only and then applied unchanged
# to the test set, so no information from the test set leaks into the fit.
# random_state is pinned because PCA may select a randomized SVD solver, in which
# case the components would otherwise differ from run to run.
pca = PCA(n_components=80, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Same forest configuration as before, now trained on the PCA-projected features.
clf = RandomForestClassifier(
    random_state=42,
    max_depth=5,           # maximum depth of each tree
    min_samples_split=20,  # minimum rows a node must hold before it may be split
    min_samples_leaf=20,   # minimum rows that must end up in every leaf
    max_samples=0.8,       # fraction of the training rows drawn for each tree
)
clf.fit(X_train_pca, y_train)

# Mean accuracy on the projected training set, then on the projected test set.
print(clf.score(X_train_pca, y_train), clf.score(X_test_pca, y_test), sep='\n')

# Class balance of the test labels next to the confusion matrix
# (rows = actual class, columns = predicted class).
y_pred = clf.predict(X_test_pca)
display(y_test.value_counts(), confusion_matrix(y_test, y_pred))

0.6346604861628452
0.6260546035738616


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[11427,  6656],
       [  480,   520]])

In [None]:
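# Not much difference in model performance: with PCA the test accuracy moved from 0.606 to 0.626,
# while recall on the positive class dropped from 0.551 (551/1000) to 0.520 (520/1000).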