In [4]:
# Import required libraries
import pandas as pd
import numpy as np
import re

In [2]:
# Read the raw CSV export into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

CLEANING

In [8]:
# 1) Standardize column names: lowercase ASCII alphanumerics joined by "_"
def to_snake_case(s: str) -> str:
    """Return ``s`` lowercased, with each run of non-alphanumerics replaced by one ``_``.

    Only ASCII letters and digits are kept. Everything else (whitespace,
    punctuation, existing underscores, non-ASCII characters) acts as a
    separator, and separators at either end are dropped. Case boundaries are
    not split, so ``"customerID"`` becomes ``"customerid"``.
    """
    # Splitting on separator runs leaves empty strings at the edges when the
    # input starts or ends with a separator; discard them before joining.
    pieces = [piece for piece in re.split(r"[^0-9a-zA-Z]+", s) if piece]
    return "_".join(pieces).lower()

# Normalize every column label with to_snake_case.
df.columns = list(map(to_snake_case, df.columns))

# 2) Strip surrounding whitespace in every object column and turn blanks into NaN.
# The cast to str can spell missing values as the text "nan" / "None", so
# those spellings are mapped back to NaN together with the empty string.
obj_cols = df.select_dtypes(include=["object"]).columns
for col in obj_cols:
    stripped = df[col].astype(str).str.strip()
    df[col] = stripped.mask(stripped.isin(["", "nan", "None"]), np.nan)

# 3) Coerce the numeric columns; any value that cannot be parsed becomes NaN.
# Each column is converted only if it is present in the frame.
for numeric_col in ("tenure", "monthlycharges", "totalcharges"):
    if numeric_col in df.columns:
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Impute missing totalcharges, but only when all three columns are available:
#   - a missing total on a row with tenure == 0 becomes 0
#   - any total still missing is approximated as monthlycharges * tenure
#     (this stays NaN if either factor is itself missing)
if {"tenure", "monthlycharges", "totalcharges"}.issubset(df.columns):
    zero_tenure_gap = df["totalcharges"].isna() & df["tenure"].eq(0)
    df.loc[zero_tenure_gap, "totalcharges"] = 0

    # Recompute the mask: the zero-tenure rows above are no longer missing.
    missing = df["totalcharges"].isna()
    estimate = df.loc[missing, "monthlycharges"] * df.loc[missing, "tenure"]
    df.loc[missing, "totalcharges"] = estimate

# 4) Make customerid a clean string key.
# Present IDs are cast to str and stripped; missing IDs stay missing.
# A bare astype(str) over the whole column would rewrite NaN as the literal
# text "nan", which no longer registers as missing and looks like one
# repeated, valid-seeming key.
if "customerid" in df.columns:
    id_missing = df["customerid"].isna()
    # mask() with no replacement value puts NaN back on the originally-missing rows.
    df["customerid"] = df["customerid"].astype(str).str.strip().mask(id_missing)

# 5) Convert Yes/No columns to 0/1 where appropriate
# NOTE: do NOT convert columns with values like "No internet service" (multi-category).
# A column qualifies only if it has object dtype, has at least one non-missing
# value, and every non-missing value is exactly "Yes" or "No".
yes_no_cols = []
for c in df.columns:
    if df[c].dtype == "object":
        vals = set(df[c].dropna().unique())
        # An all-missing column gives an empty set, which is trivially a subset
        # of {"Yes", "No"}; it carries no Yes/No evidence, so leave it untouched.
        if vals and vals.issubset({"Yes", "No"}):
            yes_no_cols.append(c)

for c in yes_no_cols:
    # Nullable Int64 keeps missing answers as <NA> instead of forcing floats.
    df[c] = df[c].map({"No": 0, "Yes": 1}).astype("Int64")

# 6) Store seniorcitizen as a nullable integer column.
# Unparseable entries become <NA>; the values are not restricted to 0/1 here.
if "seniorcitizen" in df.columns:
    senior_numeric = pd.to_numeric(df["seniorcitizen"], errors="coerce")
    df["seniorcitizen"] = senior_numeric.astype("Int64")

# 7) Keep only the first occurrence of each fully identical row.
df = df.loc[~df.duplicated()]


✅ Saved: customer_churn_clean.csv
Shape: (7043, 21)


SAVE CLEAN VERSION

In [9]:
# Write the cleaned frame to disk without the index column, then report
# the file name and the frame's (rows, columns) shape.
df.to_csv(path_or_buf="customer_churn_clean.csv", index=False)

n_rows, n_cols = df.shape
print("Saved: customer_churn_clean.csv")
print("Shape:", (n_rows, n_cols))

Saved: customer_churn_clean.csv
Shape: (7043, 21)
