## Training Random Forest with Grid Search for Hyperparameter Optimization

In this step, we will train the Random Forest model again, but this time we will use **Grid Search** to find the optimal hyperparameters for our final model. This approach ensures that we systematically explore a range of parameter combinations to maximize the model's performance. 

Key parameters we will tune include:

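- **`n_estimators`**: Number of trees in the forest.
- **`max_depth`**: Maximum depth of each tree (`None` lets trees grow until their leaves are pure or too small to split).
- **`min_samples_split`**: Minimum number of samples required to split an internal node.
- **`min_samples_leaf`**: Minimum number of samples required at a leaf node.
- **`bootstrap`**: Whether each tree is built on a bootstrap sample of the training data or on the full training set.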

Once the optimal parameters are determined, we will train the final model using these settings and evaluate its performance on the test dataset.


In [47]:
import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [48]:
import numpy as np
import pandas as pd

In [49]:
from sklearn.preprocessing import ( StandardScaler,
                                    OneHotEncoder, OrdinalEncoder
                                    )

from sklearn.compose import ColumnTransformer

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import(   
                                Pipeline,
                                make_pipeline
                            )

from sklearn.ensemble import    RandomForestClassifier 

from sklearn.model_selection import (   train_test_split, 
                                        StratifiedKFold
                                    )



In [50]:
import pickle

In [51]:
# Stack the two published CSV splits into one frame with a fresh 0..n-1 index,
# then discard every row that contains a missing value. The index is renumbered
# *before* the drop, so it keeps a gap wherever a row was removed.
df = pd.concat(
    [
        pd.read_csv('customer_churn_dataset-training-master.csv'),
        pd.read_csv('customer_churn_dataset-testing-master.csv'),
    ],
    axis=0,
    ignore_index=True,
).dropna()

In [52]:
# Three-letter abbreviations used to build the combined plan/contract label.
subscription_map = dict(Basic='Bsc', Premium='Prm', Standard='Std')
contract_map = dict(Annual='Ann', Monthly='Mon', Quarterly='Qua')

# Combined feature of the form '<plan>-<contract>', e.g. 'Bsc-Mon'.
# Values missing from either mapping become NaN and propagate to the result.
_plan_abbrev = df['Subscription Type'].map(subscription_map)
_contract_abbrev = df['Contract Length'].map(contract_map)
df['Subscription_Contract'] = _plan_abbrev + '-' + _contract_abbrev

In [53]:
def _bin_to_codes(values, edges):
    """Replace each value with the index of the right-closed bin that contains it.

    Bin ``i`` is the interval ``(edges[i], edges[i + 1]]``. Values that fall
    outside every bin (and NaN) are returned unchanged, which matches the
    behaviour of the previous mask-by-mask assignments.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to discretise.
    edges : list of float
        Strictly increasing bin edges; ``len(edges) - 1`` bins are produced.

    Returns
    -------
    pandas.Series
        Same index and dtype as ``values``, holding bin indices where a bin
        matched and the original value elsewhere.
    """
    codes = pd.cut(values, bins=edges, labels=False, right=True)
    return codes.where(codes.notna(), values).astype(values.dtype)


# Bin edges per column. The lowest edge sits just below the smallest expected
# value so that value lands in bin 0; 'Total Spend' is open-ended at the top.
_BIN_EDGES = {
    'Age': [17.999, 27, 35, 43, 50, 65],
    'Tenure': [0.999, 7.0, 13.0, 20.0, 26.0, 32.0, 38.0, 43.0, 49.0, 55.0, 60.0],
    'Total Spend': [99.999, 405.0, 592.774, 728.72, 863.64, np.inf],
}

# Overwrite each raw column with its bin index.
# NOTE: run this cell once per freshly loaded `df`; re-running it would bin the
# already-encoded codes again (e.g. Tenure codes 1-7 fall inside the first bin).
# The former bare `df.loc[df['Age'] > 65, 'Age']` / `df.loc[df['Tenure'] > 60, 'Tenure']`
# statements assigned nothing and were dropped; values above the top edge of
# 'Age' and 'Tenure' are still left as they are.
for _column, _edges in _BIN_EDGES.items():
    df[_column] = _bin_to_codes(df[_column], _edges)



In [54]:
# Columns not used for modelling: the row identifier, plus the two raw columns
# already folded into 'Subscription_Contract'.
Irr_cols = [
    'CustomerID',
    'Subscription Type',
    'Contract Length',
]
df = df.drop(columns=Irr_cols)

In [55]:
# Sanity check after feature engineering: column list, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 505206 entries, 0 to 505206
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Age                    505206 non-null  float64
 1   Gender                 505206 non-null  object 
 2   Tenure                 505206 non-null  float64
 3   Usage Frequency        505206 non-null  float64
 4   Support Calls          505206 non-null  float64
 5   Payment Delay          505206 non-null  float64
 6   Total Spend            505206 non-null  float64
 7   Last Interaction       505206 non-null  float64
 8   Churn                  505206 non-null  float64
 9   Subscription_Contract  505206 non-null  object 
dtypes: float64(8), object(2)
memory usage: 42.4+ MB


In [56]:
# Column groups, keyed by the preprocessing each group receives.
numerical_features = [
    'Age',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Total Spend',
    'Last Interaction',
]
categorical_features_ohe = ['Gender']
categorical_features_ord = ['Subscription_Contract']

# Numeric columns: standardise to zero mean / unit variance.
num_pipeline = Pipeline(steps=[('scale', StandardScaler())])

# Plan/contract label: integer-encode in the order Bsc < Std < Prm, and within
# each plan Mon < Qua < Ann. Labels unseen at fit time are encoded as -1.
ord_pipeline = Pipeline(steps=[
    (
        'ordinal',
        OrdinalEncoder(
            categories=[[
                f'{plan}-{term}'
                for plan in ('Bsc', 'Std', 'Prm')
                for term in ('Mon', 'Qua', 'Ann')
            ]],
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])

# Gender: dense one-hot columns; categories unseen at fit time encode as all zeros.
ohe_pipeline = Pipeline(steps=[
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its pipeline; columns not listed are dropped.
column_transformer = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', num_pipeline, numerical_features),
        ('one_hot_pipeline', ohe_pipeline, categorical_features_ohe),
        ('ordinal_pipeline', ord_pipeline, categorical_features_ord),
    ],
    remainder='drop',
    n_jobs=-1,
)

In [57]:
# Target vector and feature matrix (every column except the target).
y = df['Churn']
X = df.drop(columns=['Churn'])

In [58]:
# 80/20 hold-out split, stratified on the target so both parts keep the same
# churn ratio; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [59]:
# Search space for the random forest: 4 * 4 * 3 * 3 * 2 = 288 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],   # number of trees
    max_depth=[None, 10, 20, 30],        # None = no depth limit
    min_samples_split=[2, 5, 10],        # samples needed to split a node
    min_samples_leaf=[1, 2, 4],          # samples required at a leaf
    bootstrap=[True, False],             # bootstrap sample vs. full training set per tree
)


In [60]:
# Base estimator handed to the grid search; the fixed seed keeps every
# candidate forest reproducible across runs.
rf= RandomForestClassifier(random_state=42)

In [61]:
# Exhaustive search over `param_grid`, scored with 3-fold stratified
# cross-validation (folds are taken in order, without shuffling).
_cv_splitter = StratifiedKFold(n_splits=3)
CV_RF = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=_cv_splitter,
)

In [62]:
# Two-step pipeline: preprocess the raw columns, then run the grid search on
# the transformed features.
# NOTE(review): because the search is the *last* step, the column transformer is
# fitted once on all the data passed to fit(), not separately inside each CV fold.
RANDOM_FOREST_PIPELINE= make_pipeline(column_transformer, CV_RF)

In [63]:
# Fits the preprocessing and runs the full grid search on the training split.
# This is the expensive cell: every parameter combination is trained once per CV fold.
RANDOM_FOREST_PIPELINE.fit(X_train, y_train)

In [64]:
# Mean cross-validated score of the best parameter combination
# (no `scoring` was passed, so this is the estimator's default score).
print(CV_RF.best_score_)

0.9271731271455118


In [65]:
# Keep handles on the winning configuration and the forest refitted with it.
best_params = CV_RF.best_params_
best_rf_model = CV_RF.best_estimator_

# Score the held-out split through the whole pipeline (preprocessing + best forest).
y_pred = RANDOM_FOREST_PIPELINE.predict(X_test)
# Column 1 is the probability of the second class in the classifier's class
# order — presumably Churn == 1; verify against the fitted `classes_`.
y_pred_proba = RANDOM_FOREST_PIPELINE.predict_proba(X_test)[:, 1]

# Persist per-row truth, predicted label and predicted probability.
rf_results_df = pd.DataFrame({'y_true': y_test}).assign(
    y_pred=y_pred,
    y_pred_proba=y_pred_proba,
)
rf_results_df.to_csv('random_forest_predictions.csv', index=False)

# Persist the fitted pipeline (transformer + grid search) for later reuse.
with open('best_random_forest_pipeline.pkl', mode='wb') as rf_model_file:
    pickle.dump(RANDOM_FOREST_PIPELINE, rf_model_file)

In [66]:
# Summary of the search outcome and of the artefacts written to disk.
print(
    f"Best cross-validation score: {CV_RF.best_score_}",
    f"Best parameters: {best_params}",
    "Results saved to 'random_forest_predictions.csv'.",
    "Pipeline saved to 'best_random_forest_pipeline.pkl'.",
    sep="\n",
)

Best cross-validation score: 0.9271731271455118
Best parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
Results saved to 'random_forest_predictions.csv'.
Pipeline saved to 'best_random_forest_pipeline.pkl'.


In [67]:
from sklearn.metrics import accuracy_score, recall_score