In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's white-grid theme to every figure produced below.
sns.set_theme(style="whitegrid")

# Ensure the output folder for saved figures exists (no-op if already present).
os.makedirs(name="../images", exist_ok=True)


In [2]:
# Load the churn dataset from disk, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="../DATA/churn_data.csv")
data.head(n=5)


Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Print each column's dtype and non-null count for the loaded frame.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   tenure            7043 non-null   int64  
 2   PhoneService      7043 non-null   object 
 3   Contract          7043 non-null   object 
 4   PaperlessBilling  7043 non-null   object 
 5   PaymentMethod     7043 non-null   object 
 6   MonthlyCharges    7043 non-null   float64
 7   TotalCharges      7043 non-null   object 
 8   Churn             7043 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 495.3+ KB


In [4]:
# TotalCharges is loaded as `object`; convert it to a numeric dtype.
# errors="coerce" turns any value that cannot be parsed into NaN instead of
# raising, so missing values can appear in this column after this step.
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")

# Drop the identifier column before analysis.
# errors="ignore" keeps this cell idempotent: re-running it on the same kernel
# (after customerID has already been removed) no longer raises KeyError.
data = data.drop(columns=["customerID"], errors="ignore")

# Column groups reused by the plotting/summary cells below.
numeric_cols = data.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = data.select_dtypes(include=["object"]).columns




In [5]:
# Report the frame's dimensions and the per-column count of missing values.
missing_per_column = data.isna().sum()
print("Shape:", data.shape)
print("\nMissing:\n", missing_per_column)

# Descriptive statistics for the numeric columns (rendered as the cell output).
data.loc[:, numeric_cols].describe()


Shape: (7043, 8)

Missing:
 tenure               0
PhoneService         0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7032.0
mean,32.371149,64.761692,2283.300441
std,24.559481,30.090047,2266.771362
min,0.0,18.25,18.8
25%,9.0,35.5,401.45
50%,29.0,70.35,1397.475
75%,55.0,89.85,3794.7375
max,72.0,118.75,8684.8


In [6]:
# Save one histogram (with a KDE overlay) per numeric column to ../images.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    fig.savefig(f"../images/{col}_hist.png")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)


In [7]:
# Save one box plot per numeric column, grouped by the Churn label.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=data, x="Churn", y=col, ax=ax)
    ax.set_title(f"{col} vs Churn")
    fig.savefig(f"../images/{col}_vs_churn.png")
    # Close explicitly so figures do not accumulate across loop iterations.
    plt.close(fig)


In [8]:
# For each categorical feature, print the share of No/Yes churn within each
# category (normalize="index" makes every row sum to 1).
for col in ("Contract", "PaymentMethod", "PaperlessBilling"):
    ct = pd.crosstab(index=data[col], columns=data["Churn"], normalize="index")
    print(ct)
   


Churn                 No       Yes
Contract                          
Month-to-month  0.572903  0.427097
One year        0.887305  0.112695
Two year        0.971681  0.028319
Churn                            No       Yes
PaymentMethod                                
Bank transfer (automatic)  0.832902  0.167098
Credit card (automatic)    0.847569  0.152431
Electronic check           0.547146  0.452854
Mailed check               0.808933  0.191067
Churn                   No       Yes
PaperlessBilling                    
No                0.836699  0.163301
Yes               0.664349  0.335651


In [9]:
# Correlation heatmap of the numeric columns, saved to ../images.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    data[numeric_cols].corr(),
    annot=True,
    cmap="coolwarm",
    # Pin the colour scale to the full correlation range so the diverging
    # colormap's neutral midpoint corresponds to r = 0; otherwise the scale is
    # inferred from the data and the colours are not comparable to zero.
    vmin=-1,
    vmax=1,
    ax=ax,
)
# Title added for consistency with the other saved figures in this notebook.
ax.set_title("Correlation of numeric features")
# bbox_inches="tight" keeps long tick labels from being clipped in the PNG.
fig.savefig("../images/corr_numeric.png", bbox_inches="tight")
plt.close(fig)
