In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("C:/Users/Rhitik9579/Desktop/churn-prediction/data/raw/Telco-Customer-Churn.csv")

df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.fillna({"TotalCharges": df["TotalCharges"].median()}, inplace=True)



In [3]:
df.drop("customerID", axis=1, inplace=True)



In [4]:
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [5]:
# tenure group
def tenure_group(t):
    if t <= 6: return "0-6"
    elif t <= 12: return "6-12"
    elif t <= 24: return "12-24"
    elif t <= 48: return "24-48"
    else: return "48+"

df["tenure_group"] = df["tenure"].apply(tenure_group)

# total number of services
service_cols = ['PhoneService','MultipleLines','InternetService','OnlineSecurity',
                'OnlineBackup','DeviceProtection','TechSupport','StreamingTV',
                'StreamingMovies']

df["total_services"] = (df[service_cols] != "No").sum(axis=1)

# long term contract
df["is_long_term"] = df["Contract"].apply(lambda x: 1 if x != "Month-to-month" else 0)


In [6]:
cat_cols = df.select_dtypes(include="object").columns

df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

df_encoded.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,total_services,is_long_term,gender_Male,Partner_Yes,Dependents_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_12-24,tenure_group_24-48,tenure_group_48+,tenure_group_6-12
0,0,1,29.85,29.85,0,3,0,False,True,False,...,False,False,True,False,True,False,False,False,False,False
1,0,34,56.95,1889.5,0,4,1,True,False,False,...,True,False,False,False,False,True,False,True,False,False
2,0,2,53.85,108.15,1,4,0,True,False,False,...,False,False,True,False,False,True,False,False,False,False
3,0,45,42.3,1840.75,0,5,1,True,False,False,...,True,False,False,False,False,False,False,True,False,False
4,0,2,70.7,151.65,1,2,0,False,False,False,...,False,False,True,False,True,False,False,False,False,False


In [7]:
df_encoded.to_csv("C:/Users/Rhitik9579/Desktop/churn-prediction/data/processed/processed_churn.csv", index=False)
df_encoded.shape


(7043, 37)

### Feature Engineering Summary
- Cleaned TotalCharges and converted it to numeric.
- Dropped unnecessary customerID column.
- Converted Churn → binary labels.
Created new features:
- tenure_group
- total_services
- is_long_term
- Encoded all categorical variables using OneHotEncoding.
- Saved processed dataset to data/processed/processed_churn.csv.