# Exploratory Data Analysis (EDA)

In [47]:
!pip install seaborn scikit-learn plotly

Collecting plotly
  Downloading plotly-6.0.0-py3-none-any.whl.metadata (5.6 kB)
Downloading plotly-6.0.0-py3-none-any.whl (14.8 MB)
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
   -- ------------------------------------- 0.8/14.8 MB 3.7 MB/s eta 0:00:04
   ---- ----------------------------------- 1.6/14.8 MB 4.2 MB/s eta 0:00:04
   ------- -------------------------------- 2.6/14.8 MB 4.1 MB/s eta 0:00:03
   --------- ------------------------------ 3.4/14.8 MB 4.1 MB/s eta 0:00:03
   ----------- ---------------------------- 4.2/14.8 MB 4.1 MB/s eta 0:00:03
   ------------- -------------------------- 5.0/14.8 MB 3.9 MB/s eta 0:00:03
   --------------- ------------------------ 5.8/14.8 MB 4.0 MB/s eta 0:00:03
   ------------------ --------------------- 6.8/14.8 MB 4.0 MB/s eta 0:00:03
   -------------------- ------------------- 7.6/14.8 MB 4.0 MB/s eta 0:00:02
   ----------------------- ---------------- 8.7/14.8 MB 4.0 MB/s eta 0:00:02
   -------------------------

In [48]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.graph_objects as go

# Set visualization style
sns.set(style="whitegrid")

### Loading dataset

In [49]:
# 📥 Load the dataset
df = pd.read_csv('../data/raw/Telco-Customer-Churn.csv')

# Check for missing values
print("\n Missing Values:\n", df.isnull().sum())

# Display basic information
print("\n Data Types:\n", df.dtypes)


 Missing Values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

 Data Types:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod  

In [50]:
print("\n Dataset Shape:", df.shape)
print("\n First 5 Rows:\n")
df.head()


 Dataset Shape: (7043, 21)

 First 5 Rows:



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Handling missing values

In [51]:
# Replace empty strings with NaN
df.replace(" ", np.nan, inplace=True)

# Check unique values per column
print("\n Unique Values per Column:\n", df.nunique())

# Drop CustomerID (not useful for modeling)
df.drop("customerID", axis=1, inplace=True)


 Unique Values per Column:
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6530
Churn                  2
dtype: int64


In [41]:
# Check missing values after replacement
print("\n Missing Values After Cleaning:\n", df.isnull().sum())

# Handle missing values in "TotalCharges" (convert to numeric & fill missing)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


 Missing Values After Cleaning:
 gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


### Summary Statistics

In [52]:
# Convert categorical columns to numeric (Label Encoding)
categorical_cols = df.select_dtypes(include=["object"]).columns

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Summary Statistics
print("\n Summary Statistics:\n")
df.describe()


 Summary Statistics:



Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,0.504756,0.162147,0.483033,0.299588,32.371149,0.903166,0.940508,0.872923,0.790004,0.906432,0.904444,0.797104,0.985376,0.992475,0.690473,0.592219,1.574329,64.761692,3266.994463,0.26537
std,0.500013,0.368612,0.499748,0.45811,24.559481,0.295752,0.948554,0.737796,0.859848,0.880162,0.879949,0.861551,0.885002,0.885091,0.833755,0.491457,1.068104,30.090047,1888.707571,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,0.0,0.0
25%,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.5,1609.0,0.0
50%,1.0,0.0,0.0,0.0,29.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,70.35,3259.0,0.0
75%,1.0,0.0,1.0,1.0,55.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,89.85,4911.5,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,118.75,6530.0,1.0


### Visualizations

In [64]:
# Churn Distribution

fig_churn = px.histogram(df, x="Churn", color="Churn",
                         title="Churn Distribution",
                         template="plotly_dark", 
                         width=600, height=400)
fig_churn.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
# 🎯 **Monthly Charges vs. Total Charges (Scatter Plot)**
fig_scatter = px.scatter(df, x="MonthlyCharges", y="TotalCharges", color="Churn",
                         title="Monthly Charges vs. Total Charges",
                         template="plotly_dark", 
                         width=700, height=500, 
                         opacity=0.7)
fig_scatter.show()


In [None]:
# 🎯 **Churn Rate by Contract Type (Bar Chart)**
fig_contract = px.bar(df, x=df["Contract"].value_counts().index, 
                      y=df["Contract"].value_counts().values,
                      color=df["Contract"].value_counts().index,
                      title="Number of Customers by Contract Type",
                      template="plotly_dark", width=700, height=500)
fig_contract.show()

In [None]:
# 🎯 **Churn Rate by Internet Service Type (Pie Chart)**
fig_pie = px.pie(df, names="InternetService", 
                 title="Internet Service Distribution",
                 template="plotly_dark", hole=0.4, width=700, height=500)
fig_pie.show()

In [None]:
# 🎯 **Correlation Heatmap**
import plotly.figure_factory as ff

corr_matrix = df.select_dtypes(include=[np.number]).corr()
fig_heatmap = ff.create_annotated_heatmap(
    z=corr_matrix.values, 
    x=list(corr_matrix.columns), 
    y=list(corr_matrix.index), 
    colorscale="coolwarm"
)
fig_heatmap.update_layout(title="Feature Correlation Heatmap", width=800, height=600)
fig_heatmap.show()

### Insights

In [46]:
# Insights
print("\n Insights:")
print("- Customers with month-to-month contracts churn the most.")
print("- Higher MonthlyCharges are correlated with churn.")
print("- TotalCharges shows a weak correlation with churn.")


 Insights:
- Customers with month-to-month contracts churn the most.
- Higher MonthlyCharges are correlated with churn.
- TotalCharges shows a weak correlation with churn.


### Save cleaned data

In [None]:
# Save cleaned dataset
df.to_csv("../data/preprossed/cleaned_telco_churn.csv", index=False)