# Random Forest

Random Forest is a great classifier to use in this particular instance. It is powerful and widely implemented and with default settings performs well due to the ability to have many trees in the forest. Having many trees in the forest prevents overfitting since some trees will have the same data.

In [1]:
# Import Data
# MUTATE DATAFRAMES ACCORDING TO THE EXPLORATORY DATA ANALYSIS CODE

#For data Manipulation
import numpy as np
import pandas as pd
#In order to show all columns available
pd.set_option('display.max_columns', 200)

#Sklearn imports
from sklearn.preprocessing import LabelEncoder, Imputer

#Graphing libs
import matplotlib.pyplot as plt
import seaborn as sns

apptrain = pd.read_csv('../Dataset/application_train.csv')
apptest = pd.read_csv('../Dataset/application_test.csv')

# Code that modifies dataframes
apptrain['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptest['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
apptrain['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])
apptest['DAYS_BIRTH'] = abs(apptrain['DAYS_BIRTH'])

# Preparing the base data

In [2]:
# One-hot encoding and dataframe alignment
le = LabelEncoder()
le_count = 0

# Iterate through columns
for col in apptrain:
    if apptrain[col].dtype == "object":
        if len(list(apptrain[col].unique())) <= 2:
            #train on the training data
            le.fit(apptrain[col])
            #transform both training and testing data
            apptrain[col] = le.transform(apptrain[col])
            apptest[col] = le.transform(apptest[col])
            
            le_count += 1
            

            
#One-Hot encoding
apptrain = pd.get_dummies(apptrain)
apptest = pd.get_dummies(apptest)



print('Training features shape: {}'.format(apptrain.shape))
print('Training features shape: {}'.format(apptest.shape))
print('{} columns were label encoded'.format(le_count))

Training features shape: (307511, 243)
Training features shape: (48744, 239)
3 columns were label encoded


In [3]:
# Take the labels out of the training dataset as an inner merge will erase them since the test dataset does not have the targets
train_labels = apptrain['TARGET']


#aligning the training and testing data, keep only columns present in both df's
apptrain, apptest = apptrain.align(apptest, join = 'inner', axis = 1)
apptrain['TARGET'] = train_labels

print('Training Features shape: ', apptrain.shape)
print('Testing features shape: ', apptest.shape)
print("We're back on track, remember the training dataset will have one column more since it DOES have the targets")

Training Features shape:  (307511, 240)
Testing features shape:  (48744, 239)
We're back on track, remember the training dataset will have one column more since it DOES have the targets


In [4]:
# Scaling not very much required for Random Forest models
# Creating base df's for machine learning model
training_data = apptrain.drop(columns = ['TARGET'])
testing_data = apptest.copy()

# In the dataframes we still have missing values, WE USE IMPUTATION HERE
imputer = Imputer(strategy = 'median')
imputer.fit(training_data)
imputer.fit(testing_data)
training_data = imputer.transform(training_data)
testing_data = imputer.transform(testing_data)

print('training data shape', training_data.shape)
print('testing data shape', testing_data.shape)

training data shape (307511, 239)
testing data shape (48744, 239)


# Random Forest Classifier (BASE) model

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Declare the model, tune parameters, fit data
Random_Forest = RandomForestClassifier(n_estimators = 1000, verbose = 1, n_jobs = -1, max_features = 'auto')
# Random_Forest.fit(training_data, train_labels)

In [6]:
#Let's make a function that'll format and save our predictions for submissions to the Kaggle competition
def format_and_submit(predictions, desired_file_name):
    submit = apptest[['SK_ID_CURR']]
    submit['TARGET'] = predictions
    submit.to_csv('../Model_Predictions/{}.csv'.format(desired_file_name), index = False)

In [7]:
# predictions = Random_Forest.predict_proba(testing_data)[:, 1]
# format_and_submit(predictions, 'Random_Forest_1000Trees_BASE')

#### Score
When submitted to kaggle the base Random Forest scored .711, which matched our Random Forest with 500 trees and domain features

# Random Forest Classifier Domain Features Model Prep

In [8]:
# Take note that DK will be understood as Domain Knowledge
# Creating train data copies
apptrain_domain = apptrain.copy()
apptest_domain = apptest.copy()

#Creating variables for train data
apptrain_domain['CREDIT_INCOME_PERCENT'] = apptrain_domain['AMT_CREDIT'] * 100 / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['ANNUITY_INCOME_PERCENT'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_INCOME_TOTAL']
apptrain_domain['CREDIT_TERM'] = apptrain_domain['AMT_ANNUITY'] / apptrain_domain['AMT_CREDIT']
apptrain_domain['DAYS_EMPLOYED_PERCENT'] = apptrain_domain['DAYS_EMPLOYED'] / apptrain_domain['DAYS_BIRTH']

#Creating variables for test data
apptest_domain['CREDIT_INCOME_PERCENT'] = apptest_domain['AMT_CREDIT'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['ANNUITY_INCOME_PERCENT'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_INCOME_TOTAL']
apptest_domain['CREDIT_TERM'] = apptest_domain['AMT_ANNUITY'] / apptest_domain['AMT_CREDIT']
apptest_domain['DAYS_EMPLOYED_PERCENT'] = apptest_domain['DAYS_EMPLOYED'] / apptest_domain['DAYS_BIRTH']

### To add features we must do the following

1. Obtain additional features in dataframe with original features (done in the code above)
2. Use Imputation to rid the dataframe of NaN values
3. Scale if Necessary (not necessary for model)

In [9]:
# Resolving imputations done on new dataset

# Important to take out target column from the testing set, it is stored in the train_labels variable to be given as a separate parameter
dk_training_data = apptrain_domain.drop(columns=['TARGET'])
dk_testing_data = apptest_domain

# fitting the Imputer
imputer.fit(dk_training_data)
imputer.fit(dk_testing_data)

#transforming the data
dk_training_data = imputer.transform(dk_training_data)
dk_testing_data = imputer.transform(dk_testing_data)

print(dk_training_data.shape)
print(dk_testing_data.shape)

(307511, 243)
(48744, 243)


# Random Forest Classifier with Domain Knowledge features

In [10]:
# dk_Random_Forest = RandomForestClassifier(n_estimators = 1000, verbose = 1, n_jobs = -1, max_features = 'auto')
# dk_Random_Forest.fit(dk_training_data, train_labels)

In [11]:
# dk_predictions = dk_Random_Forest.predict_proba(dk_testing_data)[:, 1]
# format_and_submit(dk_predictions, 'Random_Forest_DomainKnowledgeFeatures')

# Adding Polynomial features

In [12]:
# Creating polynomial features starting from the dataframe we already created domain knowledge features for
poly_features = apptrain_domain[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'TARGET']]
poly_features_test = apptest_domain[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']]

# Removing target column so fits are the same
poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns = ['TARGET'])

# Imputing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

print('poly_features shape: {}'.format(poly_features.shape))
print('poly_features_test shape: {}'.format(poly_features_test.shape))

poly_features shape: (307511, 4)
poly_features_test shape: (48744, 4)


In [13]:
from sklearn.preprocessing import PolynomialFeatures
#Create polynomial object with specific degree
poly_transformer = PolynomialFeatures(degree = 3)
poly_transformer.fit(poly_features)

#transforming features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features Shape: ', poly_features.shape)

Polynomial Features Shape:  (307511, 35)


In [14]:
# Turning polynomial features into dataframe for both training and testing
poly_features = pd.DataFrame(poly_features, columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))
# Put target back into poly_features
poly_features['TARGET'] = poly_target

# Testing dataset
poly_features_test = pd.DataFrame(poly_features_test, columns = poly_transformer.get_feature_names(['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))

In [15]:
# Creating common column to merge training dataset and polyfeatures dataset
poly_features['SK_ID_CURR'] = apptrain_domain['SK_ID_CURR']
pdk_training_df = apptrain_domain.merge(poly_features, on = 'SK_ID_CURR', how = 'left')

# Doing same for test dataset
poly_features_test['SK_ID_CURR'] = apptest_domain['SK_ID_CURR']
pdk_testing_df = apptest_domain.merge(poly_features_test, on = 'SK_ID_CURR', how = 'left')

# Align that dataframes
pdk_training_df, pdk_testing_df = pdk_training_df.align(pdk_testing_df, join = 'inner', axis = 1)

print('polynomial domain knowledge training dataset: ', pdk_training_df.shape)
print('polynomial domain knowledge testing dataset: ', pdk_testing_df.shape)
pdk_train_labels = poly_features['TARGET']

polynomial domain knowledge training dataset:  (307511, 278)
polynomial domain knowledge testing dataset:  (48744, 278)


## Random Forest Classifier with Polynomial Features AND Domain Knowledge Features

Next steps are to imputate, and run the model with the new features

In [16]:
# It's beneficial to use copies of the original dataframe, just in case you mess up you can always toss the copy out
poly_training_data = pdk_training_df.copy()
poly_testing_data = pdk_testing_df.copy()

poly_training_data = imputer.fit_transform(poly_training_data)
poly_testing_data = imputer.transform(poly_testing_data)

In [17]:
# PDK_Random_Forest = RandomForestClassifier(n_estimators = 1000, verbose = 1, n_jobs = -1, max_features = 'auto')
# PDK_Random_Forest.fit(poly_training_data, pdk_train_labels)

In [18]:
# PDK_predictions = PDK_Random_Forest.predict_proba(poly_testing_data)[:, 1]
# format_and_submit(PDK_predictions, 'Random_Forest_PDK')

# Automated Feature Selection
Let's take only the features that were useful, We're doing this now because we see a drop in score.

In [19]:
# Polynomial/ Domain Knowledge Automated Feature Selection
PDK_AFS_Random_Forest = RandomForestClassifier(n_estimators = 2500, verbose = 1, n_jobs = -1, max_features = 'auto')
PDK_AFS_Random_Forest.fit(poly_training_data, pdk_train_labels)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 24.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [20]:
feature_list = list(pdk_training_df.columns)
print(len(feature_list))
for feature in zip(feature_list, PDK_AFS_Random_Forest.feature_importances_):
    print(feature)

278
('SK_ID_CURR', 0.015382922829246547)
('NAME_CONTRACT_TYPE', 0.0009013644875789921)
('FLAG_OWN_CAR', 0.00194718169110946)
('FLAG_OWN_REALTY', 0.0020195834024265905)
('CNT_CHILDREN', 0.0033756056668383188)
('AMT_INCOME_TOTAL', 0.011582752258668701)
('AMT_CREDIT', 0.012910217585595237)
('AMT_ANNUITY', 0.013992205791054556)
('AMT_GOODS_PRICE', 0.011221032347318985)
('REGION_POPULATION_RELATIVE', 0.012189775227951024)
('DAYS_BIRTH_x', 0.01225386266432864)
('DAYS_EMPLOYED', 0.014138861642864802)
('DAYS_REGISTRATION', 0.015825714414699123)
('DAYS_ID_PUBLISH', 0.01582777732660419)
('OWN_CAR_AGE', 0.0069241045580709)
('FLAG_MOBIL', 1.1762610868102907e-08)
('FLAG_EMP_PHONE', 0.0006117624903234478)
('FLAG_WORK_PHONE', 0.0019170734622506636)
('FLAG_CONT_MOBILE', 0.0001783199469691369)
('FLAG_PHONE', 0.0018225771499347989)
('FLAG_EMAIL', 0.0009993748579395468)
('CNT_FAM_MEMBERS', 0.00475185055194397)
('REGION_RATING_CLIENT', 0.002574281498140615)
('REGION_RATING_CLIENT_W_CITY', 0.00259753950415

In [21]:
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(PDK_AFS_Random_Forest, threshold='median')
sfm.fit(poly_training_data, pdk_train_labels)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 23.4min
[Parallel(n_jobs=-1)]: Done 2500 out of 2500 | elapsed: 24.0min finished


SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False),
        norm_order=1, prefit=False, threshold='median')

In [22]:
# This will print out all the features that were selected as important
for feature_list_index in sfm.get_support(indices=True):
    print(feature_list[feature_list_index])

SK_ID_CURR
FLAG_OWN_CAR
FLAG_OWN_REALTY
CNT_CHILDREN
AMT_INCOME_TOTAL
AMT_CREDIT
AMT_ANNUITY
AMT_GOODS_PRICE
REGION_POPULATION_RELATIVE
DAYS_BIRTH_x
DAYS_EMPLOYED
DAYS_REGISTRATION
DAYS_ID_PUBLISH
OWN_CAR_AGE
FLAG_WORK_PHONE
FLAG_PHONE
CNT_FAM_MEMBERS
REGION_RATING_CLIENT
REGION_RATING_CLIENT_W_CITY
HOUR_APPR_PROCESS_START
REG_CITY_NOT_LIVE_CITY
REG_CITY_NOT_WORK_CITY
LIVE_CITY_NOT_WORK_CITY
EXT_SOURCE_1_x
EXT_SOURCE_2_x
EXT_SOURCE_3_x
APARTMENTS_AVG
BASEMENTAREA_AVG
YEARS_BEGINEXPLUATATION_AVG
YEARS_BUILD_AVG
COMMONAREA_AVG
ELEVATORS_AVG
ENTRANCES_AVG
FLOORSMAX_AVG
FLOORSMIN_AVG
LANDAREA_AVG
LIVINGAPARTMENTS_AVG
LIVINGAREA_AVG
NONLIVINGAPARTMENTS_AVG
NONLIVINGAREA_AVG
APARTMENTS_MODE
BASEMENTAREA_MODE
YEARS_BEGINEXPLUATATION_MODE
YEARS_BUILD_MODE
COMMONAREA_MODE
ENTRANCES_MODE
FLOORSMAX_MODE
FLOORSMIN_MODE
LANDAREA_MODE
LIVINGAPARTMENTS_MODE
LIVINGAREA_MODE
NONLIVINGAPARTMENTS_MODE
NONLIVINGAREA_MODE
APARTMENTS_MEDI
BASEMENTAREA_MEDI
YEARS_BEGINEXPLUATATION_MEDI
YEARS_BUILD_MEDI
COMMO

In [23]:
important_training_data = sfm.transform(poly_training_data)
important_test_data = sfm.transform(poly_testing_data)

In [24]:
important_features_Random_Forest = RandomForestClassifier(n_estimators=2800, n_jobs=-1, max_features='auto', verbose=1)
important_features_Random_Forest.fit(important_training_data, pdk_train_labels)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 2800 out of 2800 | elapsed: 28.0min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2800, n_jobs=-1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [26]:
important_features_RF_Preds = important_features_Random_Forest.predict_proba(important_test_data)[:, 1]
format_and_submit(important_features_RF_Preds, 'RF_Important_features')

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    3.4s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    5.4s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    7.8s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:   10.7s
[Parallel(n_jobs=8)]: Done 2800 out of 2800 | elapsed:   12.3s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
