# Part 4 : Ensemble Random Forest
- More trees = Random Forest

# Import Libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split

# import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

## Import Data

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ipba-12-unext-jigsaw-classification-case-study/sample_submission.csv
/kaggle/input/ipba-12-unext-jigsaw-classification-case-study/train.csv
/kaggle/input/ipba-12-unext-jigsaw-classification-case-study/test.csv


In [3]:
train = pd.read_csv('/kaggle/input/ipba-12-unext-jigsaw-classification-case-study/train.csv')
test = pd.read_csv('/kaggle/input/ipba-12-unext-jigsaw-classification-case-study/test.csv')

### Describe Data

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        5282 non-null   object 
 1   gender            5282 non-null   object 
 2   SeniorCitizen     5282 non-null   int64  
 3   Partner           5282 non-null   object 
 4   Dependents        5282 non-null   object 
 5   tenure            5282 non-null   int64  
 6   PhoneService      5282 non-null   object 
 7   MultipleLines     5282 non-null   object 
 8   InternetService   5282 non-null   object 
 9   OnlineSecurity    5282 non-null   object 
 10  OnlineBackup      5282 non-null   object 
 11  DeviceProtection  5282 non-null   object 
 12  TechSupport       5282 non-null   object 
 13  StreamingTV       5282 non-null   object 
 14  StreamingMovies   5282 non-null   object 
 15  Contract          5282 non-null   object 
 16  PaperlessBilling  5282 non-null   object 


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1761 entries, 0 to 1760
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        1761 non-null   object 
 1   gender            1761 non-null   object 
 2   SeniorCitizen     1761 non-null   int64  
 3   Partner           1761 non-null   object 
 4   Dependents        1761 non-null   object 
 5   tenure            1761 non-null   int64  
 6   PhoneService      1761 non-null   object 
 7   MultipleLines     1761 non-null   object 
 8   InternetService   1761 non-null   object 
 9   OnlineSecurity    1761 non-null   object 
 10  OnlineBackup      1761 non-null   object 
 11  DeviceProtection  1761 non-null   object 
 12  TechSupport       1761 non-null   object 
 13  StreamingTV       1761 non-null   object 
 14  StreamingMovies   1761 non-null   object 
 15  Contract          1761 non-null   object 
 16  PaperlessBilling  1761 non-null   object 


# Data Cleaning

In [6]:
train['TotalCharges'] = train['TotalCharges'].replace(' ',np.nan)
test['TotalCharges'] = test['TotalCharges'].replace(' ',np.nan)

In [7]:
train['TotalCharges'] = train['TotalCharges'].astype('float64')
test['TotalCharges'] = test['TotalCharges'].astype('float64')

## Partition Data into X and y

In [8]:
y = train['Churn']
X = train.drop(['Churn','customerID','TotalCharges'], axis = 1)

In [9]:
X_predict = test.drop(['customerID','TotalCharges'], axis = 1)

## Create Dummy Variables

In [10]:
# Improvement Area 1 
X = pd.get_dummies(X)
X.head(2)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,72,53.65,1,0,0,1,0,1,1,...,1,0,0,1,1,0,0,1,0,0
1,0,4,46.0,1,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1


In [11]:
X_predict = pd.get_dummies(X_predict)
X_predict.head(2)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,52,80.2,1,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,0,1
1,0,33,24.25,1,0,0,1,0,1,0,...,0,1,0,0,1,0,1,0,0,0


## Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=23, stratify = y)

## Decision Tree with Hyperparameter

### 1/ Initiate an Instance

In [13]:
#rf_model = RandomForestClassifier(random_state = 42)
rf_model = RandomForestClassifier(random_state = 42, class_weight = 'balanced')

### 2/ Create Parameter Grid
- https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [14]:
# Improvement Area 
dt_param_grid = {
    'n_estimators' : range(2,40,2),
    'max_depth' : range(2,10),
    'max_features' :  ['auto', 'sqrt', 'log2'] 
}

### 3/ Initiate a GridSearchCV instance
- model
- parameter grid
- scoring

In [15]:
gs_dt = GridSearchCV(rf_model, param_grid = dt_param_grid, scoring = 'accuracy')

### 4/ Fit Grid Search to get Best Estimators

In [16]:
gs_dt.fit(X_train,y_train)

GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=42),
             param_grid={'max_depth': range(2, 10),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': range(2, 40, 2)},
             scoring='accuracy')

### 5/ Store Best Estimator

In [17]:
gs_dt.best_estimator_

RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=34,
                       random_state=42)

In [18]:
best_dt_estimates = gs_dt.best_estimator_

### 6/ Fit Model with Best Estimator

In [19]:
best_dt_estimates.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', max_depth=9, n_estimators=34,
                       random_state=42)

### 7/ Make predictions

In [20]:
y_train_predicted = best_dt_estimates.predict(X_train)
y_test_predicted = best_dt_estimates.predict(X_test)

In [21]:
y_sub_predicted = best_dt_estimates.predict(X_predict)

In [22]:
submission = pd.DataFrame({
    'customerID' : test['customerID'],
    'Churn' : y_sub_predicted
})
submission.head()

Unnamed: 0,customerID,Churn
0,5343-SGUBI,0
1,5442-BXVND,0
2,6434-TTGJP,0
3,1628-BIZYP,1
4,0298-XACET,0


In [23]:
submission.to_csv('submission.csv', index = False)

### 8/ Check Performance

#### Check train Peformance

In [24]:
print(classification_report(y_train, y_train_predicted))

              precision    recall  f1-score   support

           0       0.95      0.83      0.88      3104
           1       0.65      0.88      0.74      1121

    accuracy                           0.84      4225
   macro avg       0.80      0.85      0.81      4225
weighted avg       0.87      0.84      0.85      4225



#### Check test peformance

In [25]:
print(classification_report(y_test, y_test_predicted))

              precision    recall  f1-score   support

           0       0.90      0.78      0.84       776
           1       0.56      0.75      0.64       281

    accuracy                           0.77      1057
   macro avg       0.73      0.77      0.74      1057
weighted avg       0.81      0.77      0.78      1057



# Did we improve the performance?