In [1]:
#pip install -U matplotlib


In [2]:
#pip install imblearn

In [3]:
#pip install scikit-learn==1.2.2

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score,ConfusionMatrixDisplay, RocCurveDisplay
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)

In [5]:
df = pd.read_csv("customer_churn.csv")

df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Extracting target variable 'Churn'
y = df['Churn'].map({'Yes': 1, 'No': 0}) # Converting categorical to numerical

# Extracting independent variables
X = df[['tenure', 'SeniorCitizen', 'MonthlyCharges']]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [7]:
# Scaling the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [8]:
# Building the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Predictions
y_pred = model.predict(X_test_scaled)



In [9]:
# Evaluating the model
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

conf_matrix, class_report

(array([[957,  79],
        [192, 181]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.83      0.92      0.88      1036\n           1       0.70      0.49      0.57       373\n\n    accuracy                           0.81      1409\n   macro avg       0.76      0.70      0.72      1409\nweighted avg       0.80      0.81      0.80      1409\n')

In [11]:
# Applying SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)



In [12]:
# Checking the balance after SMOTE
balance = y_train_smote.value_counts()



In [13]:
# Re-building the Logistic Regression model with SMOTE-applied data
model_smote = LogisticRegression()
model_smote.fit(X_train_smote, y_train_smote)

# Predictions with SMOTE model
y_pred_smote = model_smote.predict(X_test_scaled)



In [14]:
# Evaluating the SMOTE model
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote)
class_report_smote = classification_report(y_test, y_pred_smote)

balance, conf_matrix_smote, class_report_smote

(Churn
 0    4138
 1    4138
 Name: count, dtype: int64,
 array([[762, 274],
        [ 85, 288]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.90      0.74      0.81      1036\n           1       0.51      0.77      0.62       373\n\n    accuracy                           0.75      1409\n   macro avg       0.71      0.75      0.71      1409\nweighted avg       0.80      0.75      0.76      1409\n')

In [15]:
# Applying TomekLinks for under-sampling
TL = TomekLinks()
X_train_tl, y_train_tl = TL.fit_resample(X_train_scaled, y_train)



(Churn
 0    3725
 1    1496
 Name: count, dtype: int64,
 array([[914, 122],
        [168, 205]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.84      0.88      0.86      1036\n           1       0.63      0.55      0.59       373\n\n    accuracy                           0.79      1409\n   macro avg       0.74      0.72      0.72      1409\nweighted avg       0.79      0.79      0.79      1409\n')

In [16]:
# Checking the balance after applying TomekLinks
balance_check_tl = y_train_tl.value_counts()



In [17]:
# Re-building the Logistic Regression model with TomekLinks-applied data
model_tl = LogisticRegression()
model_tl.fit(X_train_tl, y_train_tl)

# Predictions with TomekLinks model
y_pred_tl = model_tl.predict(X_test_scaled)



In [18]:
# Evaluating the TomekLinks model
conf_matrix_tl = confusion_matrix(y_test, y_pred_tl)
class_report_tl = classification_report(y_test, y_pred_tl)

balance_check_tl, conf_matrix_tl, class_report_tl

(Churn
 0    3725
 1    1496
 Name: count, dtype: int64,
 array([[914, 122],
        [168, 205]], dtype=int64),
 '              precision    recall  f1-score   support\n\n           0       0.84      0.88      0.86      1036\n           1       0.63      0.55      0.59       373\n\n    accuracy                           0.79      1409\n   macro avg       0.74      0.72      0.72      1409\nweighted avg       0.79      0.79      0.79      1409\n')