Importing data and required packages 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN

Reading CSV

In [2]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [5]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

Train Test Split

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.shape , x_test.shape

((5625, 50), (1407, 50))

Create an Evaluate Function to give all metrics after model Training

In [7]:
def evaluate_model(y_test, y_pred):
 y_pred_binary = np.round(y_pred).astype(int)
    
 precision = precision_score(y_test, y_pred_binary)
 recall = recall_score(y_test, y_pred_binary)
 f1 = f1_score(y_test, y_pred_binary)
 return precision , recall , f1

In [8]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
f1_list =[]

In [9]:
def models_assemble(x_train , x_test, y_train, y_test):
 for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) 

    y_pred = model.predict(x_test)
    
    precision, recall, f1 = evaluate_model(y_test, y_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Precision : {:.4f}".format(precision))
    print("- Recall : {:.4f}".format(recall))
    print("- F1 : {:.4f}".format(f1))
    f1_list.append(f1)
 
    print('='*35)
    print('\n')

In [10]:
models_assemble(x_train, x_test,y_train, y_test )

Linear Regression
Model performance for Training set
- Precision : 0.6800
- Recall : 0.4960
- F1 : 0.5736


Lasso
Model performance for Training set
- Precision : 0.7157
- Recall : 0.3740
- F1 : 0.4913


Ridge
Model performance for Training set
- Precision : 0.6800
- Recall : 0.4960
- F1 : 0.5736




K-Neighbors Classifier
Model performance for Training set
- Precision : 0.5652
- Recall : 0.4828
- F1 : 0.5207


Decision Tree
Model performance for Training set
- Precision : 0.4963
- Recall : 0.5332
- F1 : 0.5141


Random Forest Classifier
Model performance for Training set
- Precision : 0.5967
- Recall : 0.4828
- F1 : 0.5337


AdaBoost Regressor
Model performance for Training set
- Precision : 0.6693
- Recall : 0.4509
- F1 : 0.5388




###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [11]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [12]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [13]:
models_assemble(xr_train,xr_test,yr_train,yr_test)

Linear Regression
Model performance for Training set
- Precision : 0.9635
- Recall : 0.9421
- F1 : 0.9527


Lasso
Model performance for Training set
- Precision : 0.9072
- Recall : 0.8425
- F1 : 0.8737


Ridge
Model performance for Training set
- Precision : 0.9635
- Recall : 0.9421
- F1 : 0.9527


K-Neighbors Classifier
Model performance for Training set
- Precision : 0.9519
- Recall : 0.9703
- F1 : 0.9610


Decision Tree
Model performance for Training set
- Precision : 0.9447
- Recall : 0.9391
- F1 : 0.9419


Random Forest Classifier
Model performance for Training set
- Precision : 0.9732
- Recall : 0.9703
- F1 : 0.9717


AdaBoost Regressor
Model performance for Training set
- Precision : 0.9630
- Recall : 0.9287
- F1 : 0.9455




In [14]:
pd.DataFrame(list(zip(model_list, f1_list)), columns=['Model Name', 'F1_Score']).sort_values(by=["F1_Score"], ascending=True)

Unnamed: 0,Model Name,F1_Score
1,Lasso,0.491289
4,Decision Tree,0.514066
3,K-Neighbors Classifier,0.520744
5,Random Forest Classifier,0.533724
6,AdaBoost Regressor,0.538827
0,Linear Regression,0.57362
2,Ridge,0.57362
8,Lasso,0.873652
11,Decision Tree,0.941878
13,AdaBoost Regressor,0.945537


From the above ingishts we can see tha the random forest classifier has highest f1 score , so we will go forward with it and pefor some hyper parameter tuning on it.

In [15]:
from sklearn.model_selection import GridSearchCV
rf_classifier = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# grid search object
grid_search = GridSearchCV(estimator=rf_classifier,param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

grid_search.fit(xr_train, yr_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_rf_model = grid_search.best_estimator_
yrf_pred = best_rf_model.predict(xr_test)

# print("Classification Report for the Best Model:")
# print(classification_report(y_test, y_pred))
precision, recall , f1 = evaluate_model(yr_test, yrf_pred)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)




Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Precision: 0.9716840536512668
Recall: 0.9687964338781575
F1-Score: 0.9702380952380951


Saving the model dumping it in a pickle file

In [16]:
import pickle 

filename = 'final_model'

pickle.dump(best_rf_model, open(filename, 'wb'))



In [17]:
load_model = pickle.load(open(filename, 'rb'))

In [18]:
model_score = load_model.score(xr_test, yr_test)
model_score


0.9662731871838112