# Lab Random Forests

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_score

In [2]:
numerical = pd.read_csv('files_for_lab/numerical.csv')
categorical = pd.read_csv('files_for_lab/categorical.csv')
target = pd.read_csv('files_for_lab/target.csv')

# Preprocessing

#### X-y Split

In [3]:
X = pd.concat([numerical,categorical], axis = 1)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, target.drop('TARGET_D', axis = 1), test_size=0.2)

#### Num-Cat Split

In [5]:
train_num  = X_train.select_dtypes(include = np.number)
train_cat = X_train.select_dtypes(include = object)

test_num  = X_test.select_dtypes(include = np.number)
test_cat = X_test.select_dtypes(include = object)

#### Scaling

In [6]:
from sklearn.preprocessing import MinMaxScaler
# Fitting scaler
transformer = MinMaxScaler().fit(train_num)
# Scaling train and test data
train_num_scaled = pd.DataFrame(transformer.transform(train_num), columns = train_num.columns, index = train_num.index)
test_num_scaled = pd.DataFrame(transformer.transform(test_num), columns = test_num.columns, index = test_num.index)

#### Encoding

In [7]:
from sklearn.preprocessing import OneHotEncoder
# Fit encoder
encoder = OneHotEncoder(handle_unknown='ignore').fit(train_cat)
# Getting the column names for the later selection
column_name = encoder.get_feature_names_out(train_cat.columns)
# Encode train and test
train_encoded = pd.DataFrame(encoder.transform(train_cat).toarray(),columns = column_name, index=train_cat.index)
test_encoded = pd.DataFrame(encoder.transform(test_cat).toarray(),columns = column_name, index=test_cat.index)

# Apply the Random Forests algorithm but this time only by upscaling the data to deal with the imbalance.

## Oversampling

#### Concatenating

In [8]:
train_all = pd.concat([train_encoded, train_num_scaled, y_train['TARGET_B']], axis = 1)
X_test = pd.concat([test_encoded,test_num_scaled], axis = 1)

#### Upscaling

In [9]:
from sklearn.utils import resample
# Splitting into majority and minority class, the 'yes' donors are the minority class.
no = train_all[train_all['TARGET_B']==0]
yes = train_all[train_all['TARGET_B']==1]

In [10]:
# oversample minority
yes_oversampled = resample(yes, #<- sample from here
                                    replace=True, #<- we need replacement, since we don't have enough data otherwise
                                    n_samples = len(no),#<- make both sets the same size
                                    )

In [11]:
# Concatenate with majority class
train_oversampled = pd.concat([no,yes_oversampled],axis=0)

In [12]:
# X-y Split again
X_train_over = train_oversampled.drop('TARGET_B', axis = 1)
y_train_over = train_oversampled['TARGET_B']

## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [14]:
# Defining the classifier
clf = RandomForestClassifier(max_depth=5)

In [15]:
def model_eval(model,X_train,y_train,X_test, y_test):
        # Fitting
        model.fit(X_train, y_train)
        # Predicting
        predictions = model.predict(X_test)
        # Calculating confusion matrix
        cm = confusion_matrix(y_test, predictions)
        # Printing different evaluation metrics
        print( str(model.base_estimator_)[:-2] + " score: ", model.score(X_test, y_test))
        print( str(model.base_estimator_)[:-2] + " precision: ",precision_score(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " recall: ",recall_score(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " f1: ",f1_score(y_test,predictions))
        print('\n')
        print(cm)
        return model

In [16]:
model_eval(clf,X_train_over,y_train_over,X_test, y_test)

DecisionTreeClassifier score:  0.5955562542577163
DecisionTreeClassifier precision:  0.06565334014561247
DecisionTreeClassifier recall:  0.5605234460196292
DecisionTreeClassifier f1:  0.11753944660416192


[[10851  7315]
 [  403   514]]


RandomForestClassifier(max_depth=5)

# Use Feature Selections that you have learned in class to decide if you want to use all of the features
(Variance Threshold, RFE, PCA, etc.)

## Variance Threshhold

In [17]:
from sklearn.feature_selection import VarianceThreshold 
# set threshhold
sel = VarianceThreshold(threshold= 0.02)

In [18]:
# Fitting with our numericals
sel = sel.fit(train_num_scaled)

In [19]:
# Check wich columns have low variance
sel.get_support()
# Making it a list
var_list = list(sel.get_support())
# Creating a droplist
droplist_var = [col[0] for col in zip(train_num_scaled.columns, var_list) if col[1] == False]
print(droplist_var)
len(droplist_var)

['TCODE', 'HIT', 'MALEMILI', 'MALEVET', 'LOCALGOV', 'STATEGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH5', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH13', 'ETH14', 'ETH15', 'ETH16', 'AGE901', 'AGE902', 'AGE903', 'AGE904', 'AGE905', 'AGE906', 'AGE907', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'AGEC4', 'AGEC5', 'AGEC6', 'AGEC7', 'CHILC1', 'CHILC2', 'CHILC3', 'CHILC4', 'CHILC5', 'HHAGE1', 'HHAGE2', 'HHAGE3', 'HHN1', 'HHN2', 'HHN4', 'HHN5', 'HHN6', 'MARR1', 'MARR2', 'MARR3', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HU3', 'HU4', 'HHD1', 'HHD4', 'HHD6', 'HHD7', 'HHD8', 'HHD9', 'HHD10', 'HHD11', 'HHD12', 'ETHC1', 'ETHC3', 'ETHC4', 'ETHC5', 'ETHC6', 'HUR1', 'RHP1', 'RHP2', 'RHP3', 'RHP4', 'HUPA1', 'HUPA4', 'HUPA5', 'HUPA7', 'DMA', 'IC1', 'IC2', 'IC3', 'IC4', 'IC5', 'IC7', 'IC8', 'IC9', 'IC10', 'IC11', 'IC12', 'IC13', 'IC14', 'IC15', 'IC16', 'IC17', 'IC18', 'IC19', 'IC20', 'IC21', 'IC22', 'IC23', 'HHAS1', '

240

In [20]:
# Trying our model without the low variance columns:
model_eval(clf,X_train_over.drop(droplist_var, axis = 1),y_train_over,X_test.drop(droplist_var, axis = 1), y_test)

DecisionTreeClassifier score:  0.6052507467379343
DecisionTreeClassifier precision:  0.06724228152799581
DecisionTreeClassifier recall:  0.5605234460196292
DecisionTreeClassifier f1:  0.12007942997313398


[[11036  7130]
 [  403   514]]


RandomForestClassifier(max_depth=5)

#### Conclusion

The removal of the low variance columns did not make much of a difference. The results got marginally better.

## Recursive feature elimination

We will continue to work without the columns we eliminated with VT, because this will considerably speed up our RFE process.

In [21]:
# %%time

# from sklearn.feature_selection import RFE
# from sklearn import linear_model
# # Creating RFE model with Randomforest as Estimator
# rfe = RFE(clf, n_features_to_select=20, verbose=False)

# # Fitting
# rfe.fit(train_all.drop(droplist_var, axis = 1), y_train.values.ravel())

In [22]:
# Saving to not rerun:
import pickle
# pickle.dump(rfe, open('rfe.p', 'wb'))

In [23]:
rfe_l = pickle.load(open('rfe.p','rb'))

In [24]:
rfe_l

RFE(estimator=RandomForestClassifier(max_depth=5), n_features_to_select=20,
    verbose=False)

In [25]:
# Checking Ranking of columns
df = pd.DataFrame(data = rfe_l.ranking_, columns=['Rank'])
df['Column_name'] = pd.DataFrame(train_all.drop(droplist_var, axis = 1)).columns
df = df[df['Column_name'] != 'TARGET_B']
df.sort_values(by = ['Rank'], ascending = False).head(20)

Unnamed: 0,Rank,Column_name
0,103,STATE_CA
1,102,STATE_FL
2,101,STATE_GA
23,100,GEOCODE2_B
4,99,STATE_IN
3,98,STATE_IL
14,97,GENDER_F
6,96,STATE_MO
25,95,GEOCODE2_D
17,94,RFA_2R_L


In [26]:
for r in range(1,20,4):
    droplist_rfe = list(df[df['Rank'] < r]['Column_name'])
    droplist_full = droplist_var
    droplist_full.extend(droplist_rfe)
    # Creating dataframes without the eliminated columns
    temp_train = X_train_over.drop(droplist_full, axis = 1).copy()
    temp_test = X_test.drop(droplist_full, axis = 1).copy()
    
    # Checking results:
    print('\n\n\n Testing with columns ranking higher than ' +str(r))
    model_eval(clf,temp_train,y_train_over,temp_test, y_test)




 Testing with columns ranking higher than 1
DecisionTreeClassifier score:  0.601582560394068
DecisionTreeClassifier precision:  0.0668566986265872
DecisionTreeClassifier recall:  0.5627044711014176
DecisionTreeClassifier f1:  0.1195136074116966


[[10964  7202]
 [  401   516]]



 Testing with columns ranking higher than 5
DecisionTreeClassifier score:  0.5969187234711524
DecisionTreeClassifier precision:  0.056319580877537655
DecisionTreeClassifier recall:  0.46892039258451473
DecisionTreeClassifier f1:  0.10056127221702525


[[10961  7205]
 [  487   430]]



 Testing with columns ranking higher than 9
DecisionTreeClassifier score:  0.5948750196509983
DecisionTreeClassifier precision:  0.05833549390718175
DecisionTreeClassifier recall:  0.49073064340239914
DecisionTreeClassifier f1:  0.10427528675703858


[[10902  7264]
 [  467   450]]



 Testing with columns ranking higher than 13
DecisionTreeClassifier score:  0.5793114290205943
DecisionTreeClassifier precision:  0.0571677668451

# Final model

In [27]:
final_model = clf.fit(temp_train, y_train_over)                     #model_eval(clf,temp_train,y_train_over,temp_test, y_test)

# Predicting for our whole dataframe:

#### Num-cat Split:

In [None]:
X =  

In [28]:
numerical  = X.select_dtypes(include = np.number)
categorical = X.select_dtypes(include = object)

#### Scaling:

In [29]:
num_scaled = pd.DataFrame(transformer.transform(numerical), columns = train_num.columns, index = numerical.index)

#### Encoding:

In [30]:
encoded = pd.DataFrame(encoder.transform(categorical).toarray(),columns = column_name, index=categorical.index)

#### Concatenating:

In [31]:
X_all = pd.concat([encoded,num_scaled], axis = 1)

#### Dropping columns:

In [32]:
X_all_selected = X_all.drop(droplist_full,axis = 1)

#### Predicting:

In [33]:
X['predicted_donate'] = clf.predict(X_all_selected)

In [34]:
clf.predict(X_all_selected)

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

#### Evaluating:

In [35]:
predictions = clf.predict(X_all_selected)
# Calculating confusion matrix
cm = confusion_matrix(target['TARGET_B'], predictions)
# Printing different evaluation metrics
print( str(clf.base_estimator_)[:-2] + " score: ", clf.score(X_all_selected, target['TARGET_B']))
print( str(clf.base_estimator_)[:-2] + " precision: ",precision_score(target['TARGET_B'],predictions))
print( str(clf.base_estimator_)[:-2] + " recall: ",recall_score(target['TARGET_B'],predictions))
print( str(clf.base_estimator_)[:-2] + " f1: ",f1_score(target['TARGET_B'],predictions))
cm

DecisionTreeClassifier score:  0.6051230453192471
DecisionTreeClassifier precision:  0.07196307981122728
DecisionTreeClassifier recall:  0.5698946933718769
DecisionTreeClassifier f1:  0.12778961014908788


array([[54976, 35593],
       [ 2083,  2760]], dtype=int64)

In [36]:
df_all = pd.concat([X,target], axis = 1)

# Lab | Case Regression

## Only look at people who have donated (Target B = 1)

In [37]:
# Dataframe for building the model
df_regr_build = df_all[df_all['TARGET_B']==1]
# Dataframe for making predictions
df_regr_pred = df_all[df_all['predicted_donate']==1]

## Preprocessing

#### X-y Split

In [38]:
X = df_regr_build.drop(['TARGET_D', 'TARGET_B', 'predicted_donate'], axis = 1)
y = df_regr_build['TARGET_D']

#### Train-Test Split

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Num-Cat Split

In [40]:
train_num  = X_train.select_dtypes(include = np.number)
train_cat = X_train.select_dtypes(include = object)

test_num  = X_test.select_dtypes(include = np.number)
test_cat = X_test.select_dtypes(include = object)

#### Scaling

In [41]:
from sklearn.preprocessing import MinMaxScaler
# Fitting scaler
transformer = MinMaxScaler().fit(train_num)
# Scaling train and test data
train_num_scaled = pd.DataFrame(transformer.transform(train_num), columns = train_num.columns, index = train_num.index)
test_num_scaled = pd.DataFrame(transformer.transform(test_num), columns = test_num.columns, index = test_num.index)

#### Encoding

In [42]:
from sklearn.preprocessing import OneHotEncoder
# Fit encoder
encoder = OneHotEncoder(handle_unknown='ignore').fit(train_cat)
# Getting the column names for the later selection
column_name = encoder.get_feature_names_out(train_cat.columns)
# Encode train and test
train_encoded = pd.DataFrame(encoder.transform(train_cat).toarray(),columns = column_name, index=train_cat.index)
test_encoded = pd.DataFrame(encoder.transform(test_cat).toarray(),columns = column_name, index=test_cat.index)

#### Concatenating

In [43]:
train_encoded = pd.concat([train_encoded,train_num_scaled], axis = 1)
test_encoded = pd.concat([test_encoded,test_num_scaled], axis = 1)

## First model

In [44]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [45]:
def model_eval_regression(model,X_train,y_train,X_test, y_test):
        # Fitting
        model.fit(X_train, y_train)
        # Predicting
        predictions = model.predict(X_test)
        # Printing different evaluation metrics
        print( str(model.base_estimator_)[:-2] + " R2-score: ", r2_score(y_test, predictions))
        print( str(model.base_estimator_)[:-2] + " MSE: ",mean_squared_error(y_test,predictions))
        print( str(model.base_estimator_)[:-2] + " RMSE: ",np.sqrt(mean_squared_error(y_test,predictions)))
        print( str(model.base_estimator_)[:-2] + " MAE: ",mean_absolute_error(y_test, predictions))
        print('\n')
        return model

In [46]:
# Defining a model
RfR = RandomForestRegressor(max_depth=5)

In [47]:
model_eval_regression(RfR,train_encoded,y_train,test_encoded, y_test)

DecisionTreeRegressor R2-score:  0.4124597680953397
DecisionTreeRegressor MSE:  113.88333468936163
DecisionTreeRegressor RMSE:  10.67161349981162
DecisionTreeRegressor MAE:  4.236842873212477




RandomForestRegressor(max_depth=5)

## Feature Selection

#### Variance threshhold

In [48]:
# Fitting
sel_regr = VarianceThreshold(threshold= 0.01)
sel_regr = sel_regr.fit(train_encoded)

In [49]:
# Check wich columns have low variance
sel_regr.get_support()
# Making it a list
var_list = list(sel_regr.get_support())
# Creating a droplist
droplist_var_regr = [col[0] for col in zip(train_encoded.columns, var_list) if col[1] == False]
print(droplist_var_regr)
len(droplist_var_regr)

['RFA_2R_L', 'TCODE', 'HIT', 'MALEMILI', 'LOCALGOV', 'FEDGOV', 'POP901', 'POP902', 'POP903', 'POP90C4', 'POP90C5', 'ETH3', 'ETH4', 'ETH6', 'ETH7', 'ETH8', 'ETH9', 'ETH10', 'ETH11', 'ETH12', 'ETH14', 'ETH15', 'ETH16', 'AGE904', 'AGE905', 'AGE906', 'CHIL1', 'CHIL2', 'CHIL3', 'AGEC1', 'AGEC2', 'AGEC3', 'CHILC1', 'CHILC3', 'CHILC4', 'CHILC5', 'HHN2', 'HHN5', 'HHN6', 'MARR2', 'MARR4', 'HHP1', 'HHP2', 'DW3', 'DW7', 'DW8', 'DW9', 'HHD7', 'HHD9', 'HHD10', 'HHD12', 'ETHC1', 'ETHC4', 'ETHC6', 'HUR1', 'RHP3', 'RHP4', 'HUPA4', 'HUPA7', 'IC5', 'IC10', 'IC13', 'IC14', 'IC16', 'IC17', 'IC18', 'IC23', 'TPE2', 'TPE3', 'TPE4', 'TPE5', 'TPE6', 'TPE7', 'TPE8', 'TPE9', 'PEC1', 'TPE10', 'LFC10', 'OCC3', 'OCC4', 'OCC5', 'OCC6', 'OCC7', 'OCC9', 'OCC12', 'EIC1', 'EIC2', 'EIC3', 'EIC5', 'EIC7', 'EIC8', 'EIC9', 'EIC10', 'EIC11', 'EIC12', 'EIC13', 'EIC14', 'EIC15', 'OEDC3', 'OEDC4', 'OEDC6', 'OEDC7', 'EC2', 'SEC1', 'SEC5', 'AFC1', 'AFC2', 'AFC3', 'VC2', 'ANC1', 'ANC2', 'ANC3', 'ANC5', 'ANC6', 'ANC8', 'ANC9', 'ANC

139

In [50]:
# Checking result:
model_eval_regression(RfR, train_encoded.drop(droplist_var_regr, axis = 1), y_train, test_encoded.drop(droplist_var_regr, axis = 1), y_test)

DecisionTreeRegressor R2-score:  0.20786414113165885
DecisionTreeRegressor MSE:  153.5402483715987
DecisionTreeRegressor RMSE:  12.391135878990218
DecisionTreeRegressor MAE:  5.14428907450447




RandomForestRegressor(max_depth=5)

In [51]:
# Continuing with this
train_var = train_encoded.drop(droplist_var_regr, axis = 1).copy()
test_var = test_encoded.drop(droplist_var_regr, axis = 1).copy()

## Hyperparameter search:

In [52]:
RfR = RandomForestRegressor()

In [53]:


max_depth_choices= [2,3,4,5,6,7,9,10,None]
criterion_choices = ['squared_error']
min_samples_split_choices = [1,2,5,20,50,100,200]
min_samples_leaf_choices = [2,5,20,50,100,200]
max_features_choices = ["auto", "sqrt", "log2", None, 10]



grid = {'max_depth': max_depth_choices,
               'criterion': criterion_choices,
               'min_samples_split': min_samples_split_choices,
               'min_samples_leaf': min_samples_leaf_choices,
               'max_features': max_features_choices,
               'bootstrap' : [True,False]}

In [54]:
# Grid search
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(estimator = RfR, param_grid = grid, cv = 6)

In [55]:
# Fit the grid search to the data
# grid_search.fit(train_var, y_train)

In [56]:
# pickle.dump(grid_search, open('grid_search.p', 'wb'))

In [67]:
grid_search = pickle.load(open('grid_search.p','rb'))

In [68]:
grid_search.best_params_

{'min_samples_split': 50,
 'min_samples_leaf': 50,
 'max_features': None,
 'max_depth': 6,
 'criterion': 'squared_error',
 'bootstrap': True}

In [69]:
grid_search.best_score_

0.3710766105247606

# Building final Model

In [70]:
# Setting model with selected Hyperparameters
RfR = RandomForestRegressor(min_samples_split= 50,
 min_samples_leaf = 50,
 max_features = None,
 max_depth = 6,
 criterion = 'squared_error',
 bootstrap = True)

In [71]:
%%time
# Training model
RfR.fit(train_var, y_train)

Wall time: 10.1 s


RandomForestRegressor(max_depth=6, max_features=None, min_samples_leaf=50,
                      min_samples_split=50)

# Making predictions

#### Preprocessing whole dataframe:

In [99]:
X = df_all[df_all['predicted_donate']==1].copy()
X = X.drop(['TARGET_D', 'TARGET_B', 'predicted_donate'], axis = 1)

In [100]:
X_num   X.select_dtypes(include = np.number)
X_cat = X.select_dtypes(include = object)

X_num_scaled = pd.DataFrame(transformer.transform(X_num), columns = X_num.columns, index = X_num.index)

X_encoded = pd.DataFrame(encoder.transform(X_cat).toarray(),columns = column_name, index=X_cat.index)

X_transformed = pd.concat([X_encoded, X_num_scaled], axis = 1)

X_transformed.drop(droplist_var_regr, axis = 1, inplace = True)

# Predicting:

In [101]:
predictions_r = RfR.predict(X_transformed)
predictions_r

array([ 9.96206477, 11.67516219,  9.87741309, ..., 17.94358869,
       15.89336773, 27.07189034])

# Adding predictions to dataframe:

In [108]:
df_all['Predicted_D']  = 0

In [109]:
df_all.loc[df_all['predicted_donate']==1,'Predicted_D']  = predictions_r

In [112]:
df_all.isna().sum().sum()

0

# Conclusion:

#### Cost of mailing action:

In [113]:
# Old
len(df_all)*0.68

64880.16

In [114]:
# New
len(df_all[df_all['predicted_donate']==1])*0.68

26080.04

#### Revenue:

In [116]:
# Old
df_all['TARGET_D'].sum()

75668.7

In [117]:
# Predicted
df_all['Predicted_D'].sum()

701082.725473102

#### Only considering true positives:

In [118]:
df_all[ (df_all['predicted_donate']==1) & (df_all['TARGET_B']==1) ]['Predicted_D'].sum()

43664.00090113023

#### Result:

DecisionTreeClassifier score:  0.596604307498821
DecisionTreeClassifier precision:  0.0565075212557227
DecisionTreeClassifier recall:  0.47110141766630315
DecisionTreeClassifier f1:  0.10091100210231255

DecisionTreeRegressor R2:  0.3710766105247606

In [120]:
# Old:
75668 - 64880

10788

In [122]:
# Modeled naive:
701082 - 26080

675002

In [121]:
# Modeled bad scenario:
43664 - 26080

17584