In [284]:
import pandas as pd 
file_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(file_path)


In [285]:
print("Dataset overview:")
print(df.info())



Dataset overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 n

In [286]:
df.drop(columns=["customerID", "PhoneService"], inplace=True)


In [287]:

print("random 5 samples:")
print(df.sample(5))



random 5 samples:
      gender  SeniorCitizen Partner Dependents  tenure     MultipleLines  \
2518    Male              0      No         No      24               Yes   
2858  Female              0     Yes         No      10                No   
6518    Male              0      No         No       1                No   
5648  Female              1     Yes        Yes      28  No phone service   
6218  Female              0     Yes        Yes      70  No phone service   

     InternetService       OnlineSecurity         OnlineBackup  \
2518     Fiber optic                   No                  Yes   
2858              No  No internet service  No internet service   
6518             DSL                   No                   No   
5648             DSL                   No                  Yes   
6218             DSL                   No                  Yes   

         DeviceProtection          TechSupport          StreamingTV  \
2518                   No                   No           

In [288]:
numeric_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object']).columns

print("Numeric Cols:",numeric_cols)
print("Categorical cols:",categorical_cols)

print()

Numeric Cols: Index(['SeniorCitizen', 'tenure', 'MonthlyCharges'], dtype='object')
Categorical cols: Index(['gender', 'Partner', 'Dependents', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'TotalCharges', 'Churn'],
      dtype='object')



In [289]:
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [290]:
df.drop_duplicates(inplace=True)

In [291]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7021.0,7021.0,7021.0
mean,0.162512,32.469449,64.851894
std,0.368947,24.534965,30.069001
min,0.0,0.0,18.25
25%,0.0,9.0,35.75
50%,0.0,29.0,70.4
75%,0.0,55.0,89.9
max,1.0,72.0,118.75


In [292]:
print("Unique values in Categorical columns:")
for col in categorical_cols:
    print(f"\n{col}: {df[col].unique()}")

Unique values in Categorical columns:

gender: ['Female' 'Male']

Partner: ['Yes' 'No']

Dependents: ['No' 'Yes']

MultipleLines: ['No phone service' 'No' 'Yes']

InternetService: ['DSL' 'Fiber optic' 'No']

OnlineSecurity: ['No' 'Yes' 'No internet service']

OnlineBackup: ['Yes' 'No' 'No internet service']

DeviceProtection: ['No' 'Yes' 'No internet service']

TechSupport: ['No' 'Yes' 'No internet service']

StreamingTV: ['No' 'Yes' 'No internet service']

StreamingMovies: ['No' 'Yes' 'No internet service']

Contract: ['Month-to-month' 'One year' 'Two year']

PaperlessBilling: ['Yes' 'No']

PaymentMethod: ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']

TotalCharges: ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']

Churn: ['No' 'Yes']


In [293]:
#Binary Encoding:(Yes/No -> 1/0)
binary_cols = ['Partner','Dependents','PaperlessBilling','Churn']
df[binary_cols]



Unnamed: 0,Partner,Dependents,PaperlessBilling,Churn
0,Yes,No,Yes,No
1,No,No,No,No
2,No,No,Yes,Yes
3,No,No,No,No
4,No,No,Yes,Yes
...,...,...,...,...
7038,Yes,Yes,Yes,No
7039,Yes,Yes,Yes,No
7040,Yes,Yes,Yes,No
7041,Yes,No,Yes,Yes


In [294]:
for col in binary_cols:
    df[col] = df[col].map({'Yes':1,'No':0})
df[binary_cols]

Unnamed: 0,Partner,Dependents,PaperlessBilling,Churn
0,1,0,1,0
1,0,0,0,0
2,0,0,1,1
3,0,0,0,0
4,0,0,1,1
...,...,...,...,...
7038,1,1,1,0
7039,1,1,1,0
7040,1,1,1,0
7041,1,0,1,1


In [295]:
# Handling "No phone service" and "No internet service"
replace_cols = ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

df[replace_cols]

Unnamed: 0,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,No phone service,No,Yes,No,No,No,No
1,No,Yes,No,Yes,No,No,No
2,No,Yes,Yes,No,No,No,No
3,No phone service,Yes,No,Yes,Yes,No,No
4,No,No,No,No,No,No,No
...,...,...,...,...,...,...,...
7038,Yes,Yes,No,Yes,Yes,Yes,Yes
7039,Yes,No,Yes,Yes,No,Yes,Yes
7040,No phone service,Yes,No,No,No,No,No
7041,Yes,No,No,No,No,No,No


In [296]:
for col in replace_cols:
    df[col] = df[col].replace({'No internet service': 'No', 'No phone service': 'No'})
    df[col] = df[col].map({'Yes': 1, 'No': 0})
df[replace_cols]

Unnamed: 0,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,0,0,1,0,0,0,0
1,0,1,0,1,0,0,0
2,0,1,1,0,0,0,0
3,0,1,0,1,1,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
7038,1,1,0,1,1,1,1
7039,1,0,1,1,0,1,1
7040,0,1,0,0,0,0,0
7041,1,0,0,0,0,0,0


In [297]:
#Ordinal Encoding (Contract: Month-to-Month < One Year < Two Year)
df['Contract'] = df['Contract'].map({'Month-to-month':0, 'One year':1, 'Two year':2})

df['Contract']                            

0       0
1       1
2       0
3       1
4       0
       ..
7038    1
7039    1
7040    0
7041    0
7042    2
Name: Contract, Length: 7021, dtype: int64

In [298]:

#One-Hot Encoding for nomical categories:
df = pd.get_dummies(df,columns=['InternetService','PaymentMethod','gender'],drop_first=True)
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,gender_Male
0,0,1,0,1,0,0,1,0,0,0,...,1,29.85,29.85,0,False,False,False,True,False,False
1,0,0,0,34,0,1,0,1,0,0,...,0,56.95,1889.5,0,False,False,False,False,True,True
2,0,0,0,2,0,1,1,0,0,0,...,1,53.85,108.15,1,False,False,False,False,True,True
3,0,0,0,45,0,1,0,1,1,0,...,0,42.3,1840.75,0,False,False,False,False,False,True
4,0,0,0,2,0,0,0,0,0,0,...,1,70.7,151.65,1,True,False,False,True,False,False


In [299]:
bool_cols = df.select_dtypes(bool).columns  # Identify boolean columns
df[bool_cols] = df[bool_cols].astype("uint8")  # Convert to unsigned 8-bit int (0-255)
df.head()


Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,...,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,InternetService_Fiber optic,InternetService_No,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,gender_Male
0,0,1,0,1,0,0,1,0,0,0,...,1,29.85,29.85,0,0,0,0,1,0,0
1,0,0,0,34,0,1,0,1,0,0,...,0,56.95,1889.5,0,0,0,0,0,1,1
2,0,0,0,2,0,1,1,0,0,0,...,1,53.85,108.15,1,0,0,0,0,1,1
3,0,0,0,45,0,1,0,1,1,0,...,0,42.3,1840.75,0,0,0,0,0,0,1
4,0,0,0,2,0,0,0,0,0,0,...,1,70.7,151.65,1,1,0,0,1,0,0


In [300]:
import numpy as np 
# Convert 'TotalCharges' to numeric, replacing empty strings with NaN
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].replace(" ", np.nan), errors="coerce")

# Fill NaN values with the median or mean (choose based on distribution)
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)

# Confirm that 'TotalCharges' is now numeric
print(df["TotalCharges"].dtype)  # Should print: float64


float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


In [301]:
df["Churn"].value_counts()

Churn
0    5164
1    1857
Name: count, dtype: int64

In [302]:
df.shape

(7021, 22)

In [303]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

X = df.drop(columns=['Churn'])
y = df['Churn']

print("Original Class Distribution:",Counter(y))

Original Class Distribution: Counter({0: 5164, 1: 1857})


In [304]:

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Convert back to DataFrame (optional)
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled = pd.Series(y_resampled, name="Churn")

print("✅ SMOTE oversampling completed successfully!")
print("New class distribution:\n", y_resampled.value_counts())


✅ SMOTE oversampling completed successfully!
New class distribution:
 Churn
0    5164
1    5164
Name: count, dtype: int64


In [305]:
X_resampled["TotalCharges_log"] = np.log1p(X_resampled["TotalCharges"])


In [306]:
X_resampled["tenure_group"] = pd.cut(X_resampled["tenure"], bins=[0, 12, 24, 48, 72], labels=[1, 2, 3, 4])


In [307]:
X_resampled["MonthlyPerTenure"] = X_resampled["MonthlyCharges"] / (X_resampled["tenure"] + 1)
X_resampled["TotalPerTenure"] = X_resampled["TotalCharges"] / (X_resampled["tenure"] + 1)


In [308]:
X_resampled.drop(columns=["TotalCharges", "tenure"], inplace=True)


In [309]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_features = ["MonthlyCharges", "TotalCharges_log", "MonthlyPerTenure", "TotalPerTenure"]
X_resampled[num_features] = scaler.fit_transform(X_resampled[num_features])


In [310]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_resampled["Contract"] = le.fit_transform(X_resampled["Contract"])


In [311]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  accuracy_score, classification_report

In [312]:
param_grid = {
    "criterion": ["gini", "entropy", "log_loss"],
    "max_depth": [5, 10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": ["sqrt", "log2", None],
    "class_weight": [None, "balanced"]
}


In [313]:
dt = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5,scoring="accuracy",n_jobs=-1,verbose=1)
grid_search.fit(X_resampled,y_resampled)
print("Best Parameters:",grid_search.best_params_)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits
Best Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [314]:
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_resampled)
print("Accuracy:",accuracy_score(y_resampled,y_pred))
print("classification_report:",classification_report(y_resampled,y_pred))

Accuracy: 0.874419054996127
classification_report:               precision    recall  f1-score   support

           0       0.91      0.83      0.87      5164
           1       0.84      0.92      0.88      5164

    accuracy                           0.87     10328
   macro avg       0.88      0.87      0.87     10328
weighted avg       0.88      0.87      0.87     10328

