### Import required libraries

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score  
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read data
df = pd.read_csv('/home/saif/Desktop/Customer Churn Analysis and Prediction/top_features.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7010 entries, 0 to 7009
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   TotalCharges      7010 non-null   float64
 1   MonthlyCharges    7010 non-null   float64
 2   tenure            7010 non-null   int64  
 3   Contract          7010 non-null   int64  
 4   PaymentMethod     7010 non-null   int64  
 5   InternetService   7010 non-null   int64  
 6   OnlineBackup      7010 non-null   int64  
 7   OnlineSecurity    7010 non-null   int64  
 8   gender            7010 non-null   int64  
 9   TechSupport       7010 non-null   int64  
 10  PaperlessBilling  7010 non-null   int64  
 11  MultipleLines     7010 non-null   int64  
 12  Churn             7010 non-null   int64  
dtypes: float64(2), int64(11)
memory usage: 712.1 KB


In [4]:
df.describe()

Unnamed: 0,TotalCharges,MonthlyCharges,tenure,Contract,PaymentMethod,InternetService,OnlineBackup,OnlineSecurity,gender,TechSupport,PaperlessBilling,MultipleLines,Churn
count,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0,7010.0
mean,2290.353388,64.888666,32.520399,8.838374,2.317404,1.225963,0.131098,0.072611,0.50428,0.076177,0.593153,0.326248,0.264907
std,2266.820832,30.064769,24.520441,9.54659,1.150581,0.7776,0.737334,0.70504,0.500017,0.70719,0.491281,0.643333,0.441315
min,18.8,18.25,1.0,1.0,1.0,0.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,0.0
25%,408.3125,35.75,9.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1403.875,70.4,29.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
75%,3807.8375,89.9,56.0,12.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,8684.8,118.75,72.0,24.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
df.columns

Index(['TotalCharges', 'MonthlyCharges', 'tenure', 'Contract', 'PaymentMethod',
       'InternetService', 'OnlineBackup', 'OnlineSecurity', 'gender',
       'TechSupport', 'PaperlessBilling', 'MultipleLines', 'Churn'],
      dtype='object')

In [6]:
df.sample(5)

Unnamed: 0,TotalCharges,MonthlyCharges,tenure,Contract,PaymentMethod,InternetService,OnlineBackup,OnlineSecurity,gender,TechSupport,PaperlessBilling,MultipleLines,Churn
5446,219.5,19.95,10,12,3,0,-1,-1,1,-1,1,0,0
1574,3512.15,65.2,56,12,3,1,0,1,0,0,0,1,0
1266,2879.2,53.6,53,12,1,1,1,1,0,1,1,-1,0
6366,7475.1,106.15,70,12,1,2,1,0,1,1,1,0,0
2031,165.35,19.45,7,24,2,0,-1,-1,1,-1,0,0,0


## Applying min max scaling on TotalCharges, MonthlyCharges,tenure, and Contract columns

In [7]:
# apply min max scaling on TotalCharges, MonthlyCharges, and tenure

scaler = MinMaxScaler()
df['TotalCharges'] = scaler.fit_transform(df[['TotalCharges']])
df['MonthlyCharges'] = scaler.fit_transform(df[['MonthlyCharges']])
df['tenure'] = scaler.fit_transform(df[['tenure']])
df['Contract'] = scaler.fit_transform(df[['Contract']])

In [8]:
df.sample(5)

Unnamed: 0,TotalCharges,MonthlyCharges,tenure,Contract,PaymentMethod,InternetService,OnlineBackup,OnlineSecurity,gender,TechSupport,PaperlessBilling,MultipleLines,Churn
6943,0.492199,0.534826,0.859155,1.0,4,1,1,1,0,0,1,0,0
6770,0.214684,0.704975,0.267606,0.0,3,2,0,1,0,0,1,1,0
6695,0.514909,0.516915,0.887324,1.0,4,1,1,1,0,1,1,0,1
5522,0.118278,0.602985,0.183099,0.0,1,2,1,1,0,0,0,0,0
3680,0.82361,0.900498,0.929577,1.0,4,2,0,1,0,1,1,1,1


# Task 2 : Data Split

In [9]:
# split data into train and test
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5608, 12), (1402, 12), (5608,), (1402,))

# Task 4 : Model Selection
I will train these model in order from 1 to 7 

**1.Logistic Regression (Baseline Model)**

**2.Decision Tree Classifier**

**3.Random Forest Classifier**

**4.XGBoost**

**5.LightGBM**

**6.CatBoost**

**7.Support Vector Machine (SVM)**

### Logistic regression

In [12]:
# training logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [13]:
# predict on test data
y_pred_lr = lr.predict(X_test)

In [16]:
# evaluate model accuracy,precision, recall, F1-score, and ROC-AUC.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_pred_lr)
print('Accuracy:', lr_accuracy)
print('Precision:', lr_precision)
print('Recall:', lr_recall)
print('F1-score:', lr_f1)
print('ROC-AUC:', lr_roc_auc)


Accuracy: 0.8259629101283881
Precision: 0.6410256410256411
Recall: 0.5451713395638629
F1-score: 0.5892255892255891
ROC-AUC: 0.7272572701519592


**1.Accuracy (82.6%)** → The model is mostly correct, but that doesn’t mean it’s perfect.

**2.Precision (64.1%)** → When the model predicts a customer will leave, it’s right 64% of the time.

**3.Recall (54.5%)** → The model is missing almost **half** of the actual churners, which is a problem if you want to save them.

**4.F1-Score (58.9%)** → A balance between catching churners and being correct when predicting churn. Not great, but not terrible.

**5.ROC-AUC (72.7%)** → The model is better than random guessing but needs improvement.

**Key Takeaway:**
- it fails to catch **many real churners**. The goal is to stop customers from leaving,so  model should improve recall.

### Decision Tree Classifier

In [18]:
# Training Decision Tree Classifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [19]:
# prediction on test data
y_pred_dt = dt.predict(X_test)

In [20]:
# evaluate model accuracy,precision, recall, F1-score, and ROC-AUC.
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_roc_auc = roc_auc_score(y_test, y_pred_dt)
print('Accuracy:', dt_accuracy)
print('Precision:', dt_precision)
print('Recall:', dt_recall)
print('F1-score:', dt_f1)
print('ROC-AUC:', dt_roc_auc)

Accuracy: 0.7425106990014265
Precision: 0.4470899470899471
Recall: 0.5264797507788161
F1-score: 0.4835479256080114
ROC-AUC: 0.6665701251581407


**Logistic Regression model is better than the Decision Tree in almost every way**. Here's why:

**1.Logistic Regression is more accurate (82.6% vs. 74.3%)**, meaning it makes fewer mistakes overall.

**2.It gives better precision (64.1% vs. 44.7%)**, so when it predicts a customer will leave, it's right more often.

**3.Both models have similar recall (~54% vs. ~52%)**, meaning they catch about the same number of actual churners.

**4.Logistic Regression has a better balance (F1-score 58.9% vs. 48.4%)**, so it’s more reliable.

**5.It also does a better job of separating churners from non-churners (ROC-AUC 72.7% vs. 66.7%)**.

#### What This Means
Logistic Regression is the better model right now because it makes fewer mistakes and is more reliable.