# Part 4 : Ensemble Random Forest
- More trees = Random Forest

# Import Libraries

In [64]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

# import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

## Import Data

In [65]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv
/kaggle/input/credit-card-approval-clean-data/crx.csv


In [66]:
data = pd.read_csv('/kaggle/input/credit-card-approval-clean-data/clean_dataset.csv')

### Describe Data

In [67]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


## Partition Data into X and y

In [68]:
y = data['Approved']
X = data.drop('Approved', axis = 1)

## Create Dummy Variables

In [69]:
X = pd.get_dummies(X)
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industry_Transport,Industry_Utilities,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White,Citizen_ByBirth,Citizen_ByOtherMeans,Citizen_Temporary
0,1,30.83,0.0,1,1,1.25,1,1,1,0,...,0,0,0,0,0,0,1,1,0,0
1,0,58.67,4.46,1,1,3.04,1,1,6,0,...,0,0,0,1,0,0,0,1,0,0
2,0,24.5,0.5,1,1,1.5,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,1,27.83,1.54,1,1,3.75,1,1,5,1,...,0,0,0,0,0,0,1,1,0,0
4,1,20.17,5.625,1,1,1.71,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0


## Train Test Split

In [70]:
# write a code to stratify 'y'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify = y)

## Decision Tree with Hyperparameter

### 1/ Initiate an Instance

In [71]:
# Create and instance of Random Forest Classifier - make class_weight balanced
rf_model = RandomForestClassifier(random_state=23, class_weight = 'balanced')

### 2/ Create Parameter Grid
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [72]:
# How can you improve the parameter choices?
dt_param_grid = {
    'n_estimators' : range(2,40,2),
    'max_depth' : range(2,10),
    'max_features' :  ['auto', 'sqrt', 'log2'] 
}

### 3/ Initiate a GridSearchCV instance
- model
- parameter grid
- scoring

In [73]:
gs_dt = GridSearchCV(rf_model, param_grid = dt_param_grid, scoring = 'accuracy')

### 4/ Fit Grid Search to get Best Estimators

In [74]:
# write a code to fit train data
gs_dt.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=23),
             param_grid={'max_depth': range(2, 10),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(2, 40, 2)},
             scoring='accuracy')

### 5/ Store Best Estimator

In [75]:
gs_dt.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=3, n_estimators=30,
                       random_state=23)

In [76]:
best_dt_estimates = gs_dt.best_estimator_

### 6/ Fit Model with Best Estimator

In [77]:
best_dt_estimates.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=3, n_estimators=30,
                       random_state=23)

### 7/ Make predictions

In [78]:
y_train_predicted = best_dt_estimates.predict(X_train)
y_test_predicted = best_dt_estimates.predict(X_test)

### 8/ Check Performance

#### Check train Peformance

In [79]:
print(classification_report(y_train, y_train_predicted))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       306
           1       0.89      0.83      0.86       246

    accuracy                           0.88       552
   macro avg       0.88      0.87      0.88       552
weighted avg       0.88      0.88      0.88       552



#### Check test peformance

In [80]:
print(classification_report(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91        77
           1       0.91      0.84      0.87        61

    accuracy                           0.89       138
   macro avg       0.89      0.89      0.89       138
weighted avg       0.89      0.89      0.89       138



# Did we improve the performance?