In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_score, recall_score, classification_report, confusion_matrix, f1_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample


In [2]:
df_raw = pd.read_csv("../data/raw/Telco-Customer-Churn.csv")

In [3]:
df_raw.shape

(7043, 21)

In [4]:
df_raw.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
df_raw.rename(columns = {"gender":"Gender", "tenure": "Tenure","customerID":"CustomerID",'InternetService':'InternetServiceType','Contract':'ContractType'}, inplace =True)

In [6]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CustomerID           7043 non-null   object 
 1   Gender               7043 non-null   object 
 2   SeniorCitizen        7043 non-null   int64  
 3   Partner              7043 non-null   object 
 4   Dependents           7043 non-null   object 
 5   Tenure               7043 non-null   int64  
 6   PhoneService         7043 non-null   object 
 7   MultipleLines        7043 non-null   object 
 8   InternetServiceType  7043 non-null   object 
 9   OnlineSecurity       7043 non-null   object 
 10  OnlineBackup         7043 non-null   object 
 11  DeviceProtection     7043 non-null   object 
 12  TechSupport          7043 non-null   object 
 13  StreamingTV          7043 non-null   object 
 14  StreamingMovies      7043 non-null   object 
 15  ContractType         7043 non-null   o

In [7]:
missing_values = df_raw.isnull().sum()

print("Columns with missing values:")
print(missing_values[missing_values > 0])

Columns with missing values:
Series([], dtype: int64)


In [8]:
print(df_raw[df_raw['TotalCharges']==' '][['MonthlyCharges','TotalCharges']]) 
df_raw.loc[df_raw['TotalCharges']==' ','TotalCharges'] = np.nan

      MonthlyCharges TotalCharges
488            52.55             
753            20.25             
936            80.85             
1082           25.75             
1340           56.05             
3331           19.85             
3826           25.35             
4380           20.00             
5218           19.70             
6670           73.35             
6754           61.90             


In [9]:
df_raw['TotalCharges'] = df_raw['TotalCharges'].astype('float64')
df_raw.loc[df_raw['TotalCharges'].isnull(), 'TotalCharges'] = df_raw['MonthlyCharges'] * df_raw['Tenure']

In [10]:
# Verify the changes
print("Rows with missing TotalCharges:")
print(df_raw[df_raw['TotalCharges'].isnull()])

Rows with missing TotalCharges:
Empty DataFrame
Columns: [CustomerID, Gender, SeniorCitizen, Partner, Dependents, Tenure, PhoneService, MultipleLines, InternetServiceType, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, ContractType, PaperlessBilling, PaymentMethod, MonthlyCharges, TotalCharges, Churn]
Index: []

[0 rows x 21 columns]


In [11]:
df_raw_1 = df_raw.copy()

In [12]:
categorical_columns = [
    "Gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
    "MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies", "PaperlessBilling",
    "InternetServiceType", "ContractType", "PaymentMethod"
]

#Convert Senior citizen to ye/ no 
df_raw_1['SeniorCitizen'] = df_raw_1['SeniorCitizen'].apply(lambda x: 'Yes' if x == 1 else 'No')


In [13]:
categorical_df = df_raw_1[categorical_columns]

In [14]:
for col in categorical_df.columns:
    print(col, ':', categorical_df[col].nunique(), 'labels')

Gender : 2 labels
SeniorCitizen : 2 labels
Partner : 2 labels
Dependents : 2 labels
PhoneService : 2 labels
MultipleLines : 3 labels
OnlineSecurity : 3 labels
OnlineBackup : 3 labels
DeviceProtection : 3 labels
TechSupport : 3 labels
StreamingTV : 3 labels
StreamingMovies : 3 labels
PaperlessBilling : 2 labels
InternetServiceType : 3 labels
ContractType : 3 labels
PaymentMethod : 4 labels


In [15]:
y = df_raw_1['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)

In [16]:
# One-Hot Encode categorical data
categorical_encoded_df = pd.get_dummies(categorical_df, dtype='int', drop_first=False)

In [17]:
categorical_encoded_df.head()

Unnamed: 0,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,...,InternetServiceType_DSL,InternetServiceType_Fiber optic,InternetServiceType_No,ContractType_Month-to-month,ContractType_One year,ContractType_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,0,1,1,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,0,1,1,0,1,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,1
2,0,1,1,0,1,0,1,0,0,1,...,1,0,0,1,0,0,0,0,0,1
3,0,1,1,0,1,0,1,0,1,0,...,1,0,0,0,1,0,1,0,0,0
4,1,0,1,0,1,0,1,0,0,1,...,0,1,0,1,0,0,0,0,1,0


In [18]:
numerical_columns= ["Tenure","MonthlyCharges","TotalCharges"]
numeric_df = df_raw_1[numerical_columns]

In [19]:
y = df_raw_1["Churn"]

In [20]:
# Combine numeric and one-hot encoded categorical data
X = pd.concat([numeric_df, categorical_encoded_df], axis=1)

In [21]:
#Train-Test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [22]:
#Standardize the numeric features
scaler = StandardScaler()

In [23]:
# fit 
scaler.fit(X_train)
X_train_standardized = scaler.transform(X_train)

# Transform 
X_test_standardized = scaler.transform(X_test)

# Get the DataFrames 
X_train_standardized_df = pd.DataFrame(X_train_standardized, columns=X_train.columns)
X_test_standardized_df = pd.DataFrame(X_test_standardized, columns=X_test.columns)

# display the shapes to verify
print("X_train shape:", X_train_standardized_df.shape)
print("X_test shape:", X_test_standardized_df.shape)

X_train shape: (5634, 46)
X_test shape: (1409, 46)


In [24]:
X_train_standardized_df

Unnamed: 0,Tenure,MonthlyCharges,TotalCharges,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,InternetServiceType_DSL,InternetServiceType_Fiber optic,InternetServiceType_No,ContractType_Month-to-month,ContractType_One year,ContractType_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-0.705310,1.206336,-0.363074,1.010707,-1.010707,0.443207,-0.443207,0.966139,-0.966139,0.644653,...,-0.722413,1.120448,-0.521328,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
1,0.108051,-0.095544,-0.082258,1.010707,-1.010707,0.443207,-0.443207,0.966139,-0.966139,0.644653,...,1.384250,-0.892500,-0.521328,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
2,1.612768,-0.133785,0.995974,-0.989406,0.989406,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,1.384250,-0.892500,-0.521328,-1.110032,-0.511954,1.777921,1.875645,-0.524081,-0.715029,-0.538638
3,0.148719,-0.142099,-0.021400,-0.989406,0.989406,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,1.384250,-0.892500,-0.521328,-1.110032,1.953301,-0.562455,-0.533150,1.908103,-0.715029,-0.538638
4,1.328092,1.432461,2.125590,1.010707,-1.010707,0.443207,-0.443207,-1.035048,1.035048,0.644653,...,-0.722413,1.120448,-0.521328,-1.110032,1.953301,-0.562455,-0.533150,1.908103,-0.715029,-0.538638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,-0.949318,-1.512149,-0.937542,1.010707,-1.010707,0.443207,-0.443207,0.966139,-0.966139,0.644653,...,-0.722413,-0.892500,1.918177,0.900875,-0.511954,-0.562455,1.875645,-0.524081,-0.715029,-0.538638
5630,1.531432,1.222963,2.103788,1.010707,-1.010707,0.443207,-0.443207,-1.035048,1.035048,0.644653,...,-0.722413,1.120448,-0.521328,-1.110032,1.953301,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
5631,-0.542638,0.727484,-0.315975,1.010707,-1.010707,0.443207,-0.443207,-1.035048,1.035048,0.644653,...,-0.722413,1.120448,-0.521328,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
5632,1.490764,-0.376537,0.668651,-0.989406,0.989406,0.443207,-0.443207,-1.035048,1.035048,0.644653,...,1.384250,-0.892500,-0.521328,-1.110032,1.953301,-0.562455,-0.533150,1.908103,-0.715029,-0.538638


In [25]:
X_test_standardized_df

Unnamed: 0,Tenure,MonthlyCharges,TotalCharges,Gender_Female,Gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,...,InternetServiceType_DSL,InternetServiceType_Fiber optic,InternetServiceType_No,ContractType_Month-to-month,ContractType_One year,ContractType_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.233994,-1.508824,-0.987872,1.010707,-1.010707,0.443207,-0.443207,0.966139,-0.966139,-1.551221,...,-0.722413,-0.892500,1.918177,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
1,-0.664642,0.662639,-0.389008,-0.989406,0.989406,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,1.384250,-0.892500,-0.521328,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
2,0.880744,-0.053977,0.494891,-0.989406,0.989406,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,1.384250,-0.892500,-0.521328,-1.110032,-0.511954,1.777921,-0.533150,1.908103,-0.715029,-0.538638
3,-0.623974,1.118214,-0.254194,1.010707,-1.010707,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,-0.722413,1.120448,-0.521328,-1.110032,1.953301,-0.562455,1.875645,-0.524081,-0.715029,-0.538638
4,-0.298629,-1.475570,-0.765606,-0.989406,0.989406,0.443207,-0.443207,0.966139,-0.966139,0.644653,...,-0.722413,-0.892500,1.918177,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,-0.715029,1.856536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,-0.095289,-0.015735,-0.131028,1.010707,-1.010707,0.443207,-0.443207,-1.035048,1.035048,-1.551221,...,1.384250,-0.892500,-0.521328,-1.110032,1.953301,-0.562455,-0.533150,-0.524081,1.398545,-0.538638
1405,0.514731,1.445763,1.178679,-0.989406,0.989406,0.443207,-0.443207,0.966139,-0.966139,0.644653,...,-0.722413,1.120448,-0.521328,0.900875,-0.511954,-0.562455,-0.533150,1.908103,-0.715029,-0.538638
1406,-1.274662,-1.510486,-0.995037,-0.989406,0.989406,0.443207,-0.443207,0.966139,-0.966139,-1.551221,...,-0.722413,-0.892500,1.918177,0.900875,-0.511954,-0.562455,-0.533150,-0.524081,-0.715029,1.856536
1407,-0.867982,-1.492197,-0.906333,1.010707,-1.010707,0.443207,-0.443207,0.966139,-0.966139,-1.551221,...,-0.722413,-0.892500,1.918177,0.900875,-0.511954,-0.562455,-0.533150,1.908103,-0.715029,-0.538638


SMOTE for sampling 

In [27]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=2)

# SMOTE to the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the resampled training data
scaler.fit(X_train_resampled)

# Transform the resampled training data
X_train_resampled_standardized = scaler.transform(X_train_resampled)

# Transform the testing data using the fitted scaler
X_test_standardized = scaler.transform(X_test)

# Convert the results back to DataFrames
X_train_standardized_df = pd.DataFrame(X_train_resampled_standardized, columns=X_train.columns)
X_test_standardized_df = pd.DataFrame(X_test_standardized, columns=X_test.columns)

# Optionally, you can print the shapes to verify
print("X_train shape:", X_train_standardized_df.shape)
print("X_test shape:", X_test_standardized_df.shape)
print("y_train_resampled shape:", y_train_resampled.shape)
print("y_test shape:", y_test.shape)

X_train shape: (8226, 46)
X_test shape: (1409, 46)
y_train_resampled shape: (8226,)
y_test shape: (1409,)


## Model : Logistic Regression 

In [29]:
# Fit logistic regression on original data
logreg_original = LogisticRegression(random_state=2)

In [30]:
logreg_original.fit(X_train_standardized, y_train)

In [31]:
# Predict on test set
y_pred_original = logreg_original.predict(X_test_standardized)

# Print classification report for original data
print("Classification Report - Original Data:")
print(classification_report(y_test, y_pred_original))

Classification Report - Original Data:
              precision    recall  f1-score   support

          No       0.82      0.95      0.88      1061
         Yes       0.71      0.36      0.48       348

    accuracy                           0.81      1409
   macro avg       0.76      0.66      0.68      1409
weighted avg       0.79      0.81      0.78      1409



In [32]:
# Fit logistic regression on SMOTE-resampled data
logreg_smote = LogisticRegression(random_state=2)
logreg_smote.fit(X_train_resampled, y_train_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
# Predict on test set using SMOTE-resampled data
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_smote = logreg_smote.predict(X_test_resampled)

# Print classification report for SMOTE-resampled data
print("\nClassification Report - SMOTE Data:")
print(classification_report(y_test, y_pred_smote))


Classification Report - SMOTE Data:
              precision    recall  f1-score   support

          No       0.76      0.54      0.63      1061
         Yes       0.25      0.48      0.33       348

    accuracy                           0.52      1409
   macro avg       0.51      0.51      0.48      1409
weighted avg       0.63      0.52      0.56      1409





## MOdel: Random Forest

In [35]:
# Fit Random Forest on original data
rf_original = RandomForestClassifier(random_state=2)
rf_original.fit(X_train_standardized, y_train)

In [36]:
# Predict on test set
y_pred_rf_original = rf_original.predict(X_test_standardized)

# Print classification report for original data
print("Random Forest - Classification Report - Original Data:")
print(classification_report(y_test, y_pred_rf_original))

Random Forest - Classification Report - Original Data:
              precision    recall  f1-score   support

          No       0.82      0.94      0.88      1061
         Yes       0.67      0.39      0.49       348

    accuracy                           0.80      1409
   macro avg       0.75      0.66      0.68      1409
weighted avg       0.79      0.80      0.78      1409



In [37]:
# Fit Random Forest on SMOTE-resampled data
rf_smote = RandomForestClassifier(random_state=2)
rf_smote.fit(X_train_resampled, y_train_resampled)

In [38]:
# Predict on test set using SMOTE-resampled data
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_rf_smote = rf_smote.predict(X_test_resampled)

# Print classification report for SMOTE-resampled data
print("\nRandom Forest - Classification Report - SMOTE Data:")
print(classification_report(y_test, y_pred_rf_smote))


Random Forest - Classification Report - SMOTE Data:
              precision    recall  f1-score   support

          No       0.89      0.72      0.80      1061
         Yes       0.46      0.72      0.56       348

    accuracy                           0.72      1409
   macro avg       0.67      0.72      0.68      1409
weighted avg       0.78      0.72      0.74      1409





## Model: Gradient Booster

In [40]:
# Fit GradientBoostingClassifier on original data
gb_original = GradientBoostingClassifier(random_state=2)
gb_original.fit(X_train_standardized, y_train)


In [41]:
# Predict on test set
y_pred_gb_original = gb_original.predict(X_test_standardized)

# classification report for original data
print("Gradient Boosting - Classification Report - Original Data:")
print(classification_report(y_test, y_pred_gb_original))

Gradient Boosting - Classification Report - Original Data:
              precision    recall  f1-score   support

          No       0.83      0.94      0.88      1061
         Yes       0.68      0.41      0.51       348

    accuracy                           0.81      1409
   macro avg       0.75      0.67      0.70      1409
weighted avg       0.79      0.81      0.79      1409



In [42]:
# Fit GradientBoostingClassifier on SMOTE-resampled data
gb_smote = GradientBoostingClassifier(random_state=2)
gb_smote.fit(X_train_resampled, y_train_resampled)


In [43]:
# Predict on test set using SMOTE-resampled data
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_gb_smote = gb_smote.predict(X_test_resampled)

# Print classification report for SMOTE-resampled data
print("\nGradient Boosting - Classification Report - SMOTE Data:")
print(classification_report(y_test, y_pred_gb_smote))



Gradient Boosting - Classification Report - SMOTE Data:
              precision    recall  f1-score   support

          No       0.83      0.91      0.86      1061
         Yes       0.59      0.42      0.49       348

    accuracy                           0.79      1409
   macro avg       0.71      0.66      0.68      1409
weighted avg       0.77      0.79      0.77      1409





In [44]:
# Fit AdaBoostClassifier on original data
ada_original = AdaBoostClassifier(random_state=2)
ada_original.fit(X_train_standardized, y_train)



In [45]:
# Predict on test set
y_pred_ada_original = ada_original.predict(X_test_standardized)

# Print classification report for original data
print("AdaBoost - Classification Report - Original Data:")
print(classification_report(y_test, y_pred_ada_original))

AdaBoost - Classification Report - Original Data:
              precision    recall  f1-score   support

          No       0.83      0.93      0.88      1061
         Yes       0.67      0.41      0.51       348

    accuracy                           0.81      1409
   macro avg       0.75      0.67      0.69      1409
weighted avg       0.79      0.81      0.79      1409



In [46]:
# Fit AdaBoostClassifier on SMOTE-resampled data
ada_smote = AdaBoostClassifier(random_state=2)
ada_smote.fit(X_train_resampled, y_train_resampled)



In [47]:
# Predict on test set using SMOTE-resampled data
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_ada_smote = ada_smote.predict(X_test_resampled)

# Print classification report for SMOTE-resampled data
print("\nAdaBoost - Classification Report - SMOTE Data:")
print(classification_report(y_test, y_pred_ada_smote))


AdaBoost - Classification Report - SMOTE Data:
              precision    recall  f1-score   support

          No       0.90      0.73      0.80      1061
         Yes       0.47      0.75      0.58       348

    accuracy                           0.73      1409
   macro avg       0.68      0.74      0.69      1409
weighted avg       0.79      0.73      0.75      1409





## Parameter Tuning

In [49]:
from sklearn.model_selection import train_test_split, GridSearchCV

param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

grid_search_lr = GridSearchCV(LogisticRegression(max_iter=1000, random_state=2), param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_standardized, y_train)

print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)

# Predict on test set
y_pred_lr = grid_search_lr.predict(X_test_standardized)

# Print classification report for Logistic Regression
print("\nLogistic Regression - Classification Report:")
print(classification_report(y_test, y_pred_lr))

Best parameters for Logistic Regression: {'C': 10, 'penalty': 'l2', 'solver': 'sag'}

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

          No       0.82      0.95      0.88      1061
         Yes       0.71      0.36      0.48       348

    accuracy                           0.81      1409
   macro avg       0.76      0.66      0.68      1409
weighted avg       0.79      0.81      0.78      1409



325 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn

In [50]:
from sklearn.model_selection import GridSearchCV
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['auto', 'sqrt']
}

In [51]:
from sklearn.model_selection import RandomizedSearchCV

random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=2), 
    param_distributions=param_grid_rf, 
    n_iter=10,  # Number of parameter settings that are sampled
    cv=5, 
    scoring='accuracy',
    random_state=2,
    n_jobs=-1  # Use all available CPU cores
)
random_search_rf.fit(X_train_resampled, y_train_resampled)

print("Best parameters for Random Forest:", random_search_rf.best_params_)

# Predict on test set
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_rf = random_search_rf.predict(X_test_resampled)

# Print classification report for Random Forest
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf))

30 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_const

Best parameters for Random Forest: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10}

Random Forest - Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.72      0.80      1061
         Yes       0.47      0.76      0.58       348

    accuracy                           0.73      1409
   macro avg       0.69      0.74      0.69      1409
weighted avg       0.80      0.73      0.75      1409





In [52]:
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=2), 
    param_grid_rf, 
    cv=5, 
    scoring='accuracy', 
    n_jobs=-1  # Use all available CPU cores
)
grid_search_rf.fit(X_train_resampled, y_train_resampled)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Predict on test set
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_rf = grid_search_rf.predict(X_test_resampled)

# Print classification report for Random Forest
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf))

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_cons

Best parameters for Random Forest: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Random Forest - Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.71      0.79      1061
         Yes       0.46      0.76      0.57       348

    accuracy                           0.72      1409
   macro avg       0.68      0.73      0.68      1409
weighted avg       0.79      0.72      0.74      1409





In [53]:
X_train_sub, _, y_train_sub, _ = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.8, random_state=2, stratify=y_train_resampled)

grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=2), 
    param_grid_rf, 
    cv=5, 
    scoring='accuracy', 
    n_jobs=-1  # Use all available CPU cores
)
grid_search_rf.fit(X_train_sub, y_train_sub)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)

# Predict on test set
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_rf = grid_search_rf.predict(X_test_resampled)

# Print classification report for Random Forest
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf))

80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\hajar\bootcamp-files\week_8\Project_Customer_churn_analysis\churn_proj\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_cons

Best parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}

Random Forest - Classification Report:
              precision    recall  f1-score   support

          No       0.90      0.71      0.79      1061
         Yes       0.46      0.75      0.57       348

    accuracy                           0.72      1409
   macro avg       0.68      0.73      0.68      1409
weighted avg       0.79      0.72      0.74      1409



In [54]:
# Tuning Gradient booster

In [55]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the Gradient Boosting classifier
gb_clf = GradientBoostingClassifier(random_state=42)


In [56]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter distributions for RandomizedSearchCV
param_dist = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.5, 1.0]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=gb_clf, param_distributions=param_dist, n_iter=100, cv=5, scoring='f1', verbose=1, n_jobs=-1, random_state=42)

# Perform RandomizedSearchCV
random_search.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan]


In [57]:
# Print the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best F1-score:", random_search.best_score_)

# Evaluate on the test set with the best estimator
best_gb_clf = random_search.best_estimator_
y_pred = best_gb_clf.predict(X_test_standardized)
print("\nClassification Report - Test Set:")
print(classification_report(y_test, y_pred))

Best Parameters: {'subsample': 0.5, 'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'learning_rate': 0.05}
Best F1-score: nan

Classification Report - Test Set:
              precision    recall  f1-score   support

          No       0.92      0.61      0.73      1061
         Yes       0.41      0.84      0.55       348

    accuracy                           0.66      1409
   macro avg       0.67      0.72      0.64      1409
weighted avg       0.80      0.66      0.69      1409





## Model Evaluation

In [59]:
# Logistic Regression with Best Parameters
best_params_lr = {'C': 10, 'penalty': 'l2', 'solver': 'sag'}
log_reg_best = LogisticRegression(**best_params_lr, max_iter=1000, random_state=2)
log_reg_best.fit(X_train_standardized, y_train)

# Predict on test set
y_pred_lr = log_reg_best.predict(X_test_standardized)

# Print classification report for Logistic Regression
print("Logistic Regression - Classification Report - Best Parameters:")
print(classification_report(y_test, y_pred_lr))

# Assume these are the best parameters found for Random Forest
best_params_rf = {
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'auto'
}


Logistic Regression - Classification Report - Best Parameters:
              precision    recall  f1-score   support

          No       0.82      0.95      0.88      1061
         Yes       0.71      0.36      0.48       348

    accuracy                           0.81      1409
   macro avg       0.76      0.66      0.68      1409
weighted avg       0.79      0.81      0.78      1409



In [60]:
# Print confusion matrix for Logistic Regression
print("Logistic Regression - Confusion Matrix - Best Parameters:")
print(confusion_matrix(y_test, y_pred_lr))

Logistic Regression - Confusion Matrix - Best Parameters:
[[1009   52]
 [ 222  126]]


In [61]:
# Random Forest with Best Parameters
best_params_rf = {
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 5,
    'n_estimators': 100
}
rf_best = RandomForestClassifier(**best_params_rf, random_state=2)
rf_best.fit(X_train_resampled, y_train_resampled)

# Predict on test set with best parameters for Random Forest
X_test_resampled = scaler.transform(X_test)  # Transform test data with the same scaler
y_pred_rf = rf_best.predict(X_test_resampled)

# Print classification report for Random Forest
print("\nRandom Forest - Classification Report - Best Parameters:")
print(classification_report(y_test, y_pred_rf))


Random Forest - Classification Report - Best Parameters:
              precision    recall  f1-score   support

          No       0.90      0.71      0.79      1061
         Yes       0.46      0.76      0.57       348

    accuracy                           0.72      1409
   macro avg       0.68      0.73      0.68      1409
weighted avg       0.79      0.72      0.74      1409





In [62]:
# Print confusion matrix for Random Forest
print("Random Forest - Confusion Matrix - Best Parameters:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest - Confusion Matrix - Best Parameters:
[[751 310]
 [ 85 263]]


In [63]:
# Gradient Booster 

In [68]:
from sklearn.metrics import classification_report

best_params_gb = {
    'learning_rate': 0.05,
    'n_estimators': 50,
    'max_depth': 3,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'subsample': 0.5
    }

# Define the Gradient Boosting classifier with best parameters
gb_best = GradientBoostingClassifier(**best_params_gb, random_state=2)

# Fit on the resampled training data
gb_best.fit(X_train_resampled, y_train_resampled)

# Predict on test set with best parameters for Gradient Boosting
X_test_resampled = scaler.transform(X_test)  # Assuming 'scaler' is already fitted
y_pred_gb = gb_best.predict(X_test_resampled)

# Print classification report for Gradient Boosting
print("\nGradient Boosting - Classification Report - Best Parameters:")
print(classification_report(y_test, y_pred_gb))


Gradient Boosting - Classification Report - Best Parameters:
              precision    recall  f1-score   support

          No       0.91      0.61      0.73      1061
         Yes       0.41      0.82      0.55       348

    accuracy                           0.66      1409
   macro avg       0.66      0.72      0.64      1409
weighted avg       0.79      0.66      0.69      1409



