In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_curve, classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay

In [3]:
df = pd.read_csv(r"C:\Users\sahil\OneDrive\Naresh IT Class\Data Files\Preprocessed_data_telecom_data.csv")

In [4]:
df.head()

Unnamed: 0,Gender,Age,Married,Number of Dependents,Latitude,Longitude,Number of Referrals,Tenure in Months,Offer,Avg Monthly Long Distance Charges,...,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status
0,0,37,1,0,34.827662,-118.999073,2,9,0,42.39,...,1,1,1,65.6,593.3,0.0,0,381.51,974.81,1
1,1,46,0,0,34.162515,-118.203869,0,9,0,10.69,...,0,0,1,-4.0,542.4,38.33,10,96.21,610.28,1
2,1,50,0,0,33.645672,-117.922613,0,4,5,33.65,...,0,1,0,73.9,280.85,0.0,0,134.6,415.45,0
3,1,78,1,0,38.014457,-122.115432,1,13,4,27.82,...,0,1,0,98.0,1237.85,0.0,0,361.66,1599.51,0
4,0,75,1,0,34.227846,-119.079903,3,3,0,7.38,...,0,1,1,83.9,267.4,0.0,0,22.14,289.54,0


**Divide the data into input and output**

In [6]:
x = df.drop('Customer Status',axis=1)
y = df['Customer Status']

**Read the base model**

In [7]:
RF = RandomForestClassifier()

**Get the parameters**

In [15]:
RF.get_params() # to get all hyperparameters

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

**Create the dictionary with Hyperparameters**

In [17]:
param_grid = {
    'n_estimators' : [100, 200],
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [3, 5, 8, 10 ],
    'max_features': ['sqrt', 'log2'],
    'random_state': [0, 42]
}

# RF story: will select not only sample but also features
# will select the feature sqrt or log2

In [24]:
grid_search = GridSearchCV(RF,  # Base modl
                           param_grid, # params 
                           scoring='accuracy', # metric
                           cv=5,
                           verbose=True,
                           n_jobs=-1 # It use all available CPU cores for parallel processing otherwise it takes 15 min to fit 
                           )

In [25]:
grid_search.fit(x, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


**Best hyperparameters**

In [26]:
print('Best Parameters : ', grid_search.best_params_)

Best Parameters :  {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 100, 'random_state': 0}


**Best model containing all best parameters**

In [28]:
print('best estimaters : ',grid_search.best_estimator_)

best estimaters :  RandomForestClassifier(criterion='entropy', max_depth=10, random_state=0)


In [32]:
# Get the best model
best_RF = grid_search.best_estimator_

**Evaluate the best model with cross-validation**

In [31]:
score = cross_val_score(best_RF, x, y, cv=5)
print(score)
print("Cross-validation score:", score.mean())

[0.81695967 0.78800414 0.82833506 0.82730093 0.81282316]
Cross-validation score: 0.8146845915201656
