In [2]:
# data loading

import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/All Datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv")


In [3]:
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
print(df.isnull().values.any())

False


In [6]:
# Data Preprocessing / Cleaning

import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# convert total charges to numeric
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"],errors="coerce")
df.dropna(inplace=True)

# encode categorical values
le = LabelEncoder()
df["Churn"] = le.fit_transform(df["Churn"])

#separate features and target
X = df.drop("Churn", axis=1)
y = df["Churn"]

# scale numerical features
num_cols = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [7]:
# Feature engineering

df["AvgChargesPerMonth"] = df["TotalCharges"] / (df["tenure"] + 1)
print(df[["AvgChargesPerMonth", "Churn"]].corr())


                    AvgChargesPerMonth     Churn
AvgChargesPerMonth            1.000000  0.070992
Churn                         0.070992  1.000000


In [9]:
X = X.drop("customerID", axis=1)


In [10]:
# One-hot encode all categorical variables
X = pd.get_dummies(X, drop_first=True)


In [13]:
# Model Training

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42, class_weight="balanced")

model.fit(X_train, y_train)

In [14]:
# model evalution

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.783226723525231

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86      1033
           1       0.63      0.45      0.52       374

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.69      1407
weighted avg       0.77      0.78      0.77      1407


Confusion Matrix:
 [[934  99]
 [206 168]]


In [16]:
# Hyper Pramater tuning

from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5, 10]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring="accuracy")
grid.fit(X_train, y_train)

print("Best Parameters:", grid.best_params_)


Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}


In [18]:
# model deploytment

import joblib
# save model
joblib.dump(grid.best_estimator_,"churn_model.pkl")

# load model
model = joblib.load("churn_model.pkl")

sample = X_test.iloc[0:1]
prediction = model.predict(sample)[0]

if prediction == 1:
    print("Customer is likely to CHURN")
else:
    print("Customer is NOT likely to churn")


Customer is NOT likely to churn
