In [18]:
import pandas as pd
import numpy as np

datapath = "data/data.csv"
df = pd.read_csv(datapath)

# 2) Quick inspection of the data
print("Shape:",df.shape)
display(df.head())
df.info()

Shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [14]:
# 3) Target check (before changes)
if "Churn" in df.columns:
    print("\nChurn distribution (raw):")
    print(df["Churn"].value_counts(dropna=False))
    print("\nChurn % (raw):")
    print(df["Churn"].value_counts(normalize=True, dropna=False) * 100)





Churn distribution (raw):
Churn
No     5174
Yes    1869
Name: count, dtype: int64

Churn % (raw):
Churn
No     73.463013
Yes    26.536987
Name: proportion, dtype: float64


In [15]:
if "customerID" in df.columns:
    df.drop(columns=["customerID"])

In [16]:
#5) Fix TotalCharges (common issue: object with blanks)
if "TotalCharges" in df.columns:
    # convert blanks to NaN, then to numeric
    df["TotalCharges"] = pd.to_numeric(df["TotalCharges"].astype(str).str.strip(), errors="coerce")


In [17]:
# 6) Check missing values AFTER fixing TotalCharges
print("\nMissing values per column (after TotalCharges fix):")
print(df.isna().sum().sort_values(ascending=False).head(10))


Missing values per column (after TotalCharges fix):
TotalCharges        11
customerID           0
DeviceProtection     0
MonthlyCharges       0
PaymentMethod        0
PaperlessBilling     0
Contract             0
StreamingMovies      0
StreamingTV          0
TechSupport          0
dtype: int64


In [19]:

# 7) Decide how to handle missing TotalCharges
# Hint: In Telco, missing TotalCharges usually happens when tenure=0.
# We'll fill missing TotalCharges with 0 (simple + defensible), but you can also drop those rows.
if "TotalCharges" in df.columns:
    missing_tc = df["TotalCharges"].isna().sum()
    print(f"\nMissing TotalCharges rows: {missing_tc}")
    df["TotalCharges"] = df["TotalCharges"].fillna(0)


Missing TotalCharges rows: 0


In [None]:

# 8) Convert target to 0/1
# Yes -> 1, No -> 0
if "Churn" in df.columns and df["Churn"].dtype == "object":
    df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

    


In [22]:

# 9) Final sanity checks
print("\nFINAL Shape:", df.shape)
print("\nFINAL churn rate (%):")
print(df["Churn"].mean() * 100)

print("\nDtypes summary:")
display(df.dtypes.value_counts())

print("\nAny missing left?")
print(df.isna().sum().sort_values(ascending=False).head(10))


FINAL Shape: (7043, 21)

FINAL churn rate (%):
26.536987079369588

Dtypes summary:


object     17
int64       3
float64     1
Name: count, dtype: int64


Any missing left?
customerID          0
DeviceProtection    0
TotalCharges        0
MonthlyCharges      0
PaymentMethod       0
PaperlessBilling    0
Contract            0
StreamingMovies     0
StreamingTV         0
TechSupport         0
dtype: int64


In [24]:
# 10) Identify numeric vs categorical columns (initial guess)
target_col = "Churn"
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Remove target from numeric list if present
if target_col in num_cols:
    num_cols.remove(target_col)

print("\nNumeric columns:", num_cols)
print("\nCategorical columns:", cat_cols)


Numeric columns: ['SeniorCitizen', 'tenure', 'MonthlyCharges']

Categorical columns: ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'TotalCharges']
