In [10]:
import pandas as pd
from pandas.api.types import CategoricalDtype

## 1. Load & Preview Data

In [11]:
# Read the cleansed Telco churn dataset from the project's data folder.
# NOTE(review): the rendered preview appears to contain an "Unnamed: 0" column,
# presumably a previously saved index - confirm whether it should be dropped
# (or read with index_col=0) before modelling.
csv_path = "../data/Telco_Customer_Churn_cleansed.csv"
df = pd.read_csv(csv_path)

# Show the first five rows as the cell output
df.head(5)

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 2. 🔧 Feature Engineering

In [12]:
# Churn by Dependents and by Partner looked similar, so both are folded into a
# single 0/1 flag: 1 when either column equals 'Yes', otherwise 0.
df['HasPartnerOrDependents'] = (
    df[['Partner', 'Dependents']].eq('Yes').any(axis=1).astype(int)
)


In [13]:
# Give the senior-citizen column an "Is..." style name.
df = df.rename({'SeniorCitizen': 'IsSeniorCitizen'}, axis='columns')


In [14]:
# Spot-check the engineered flag next to the renamed senior-citizen column
df.loc[:, ['HasPartnerOrDependents', 'IsSeniorCitizen']].head()


Unnamed: 0,HasPartnerOrDependents,IsSeniorCitizen
0,1,0
1,0,0
2,0,0
3,0,0
4,0,0


### 📝 Justification for `TenureGroup` Feature

Based on the `churn_by_tenure_histogram.png` from the EDA phase, we identified three distinct behavioral patterns among customers at different stages of their lifecycle:

- **New (tenure ≤ 6 months)**: High churn rates likely driven by onboarding or early experience issues.
- **Early (7–18 months)**: Customers still in the decision-making period, where churn remains elevated but begins to stabilize.
- **Loyal (more than 18 months)**: Customers who are more stable and significantly less likely to churn.

Segmenting tenure in this way enables us to capture potential retention opportunities and improve model interpretability by aligning with real customer lifecycle trends.

In [17]:
def categorize_tenure(tenure, new_max=6, early_max=18):
    """Map a customer's tenure (in months) to a lifecycle segment label.

    Parameters
    ----------
    tenure : int or float
        Tenure of a single customer. Missing values (``None``, ``NaN``,
        ``pd.NA``) are accepted.
    new_max : int or float, default 6
        Inclusive upper bound of the 'New' segment.
    early_max : int or float, default 18
        Inclusive upper bound of the 'Early' segment.

    Returns
    -------
    str or None
        'New' when ``tenure <= new_max``, 'Early' when
        ``new_max < tenure <= early_max``, 'Loyal' when
        ``tenure > early_max``, and ``None`` when ``tenure`` is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing tenure would fall through to the last branch and be labelled
    # 'Loyal'. Returning None keeps it missing after the categorical cast.
    if pd.isna(tenure):
        return None
    if tenure <= new_max:
        return 'New'
    if tenure <= early_max:
        return 'Early'
    return 'Loyal'

# Ordered dtype so the segments sort and compare as New < Early < Loyal
tenure_cat_type = CategoricalDtype(categories=['New', 'Early', 'Loyal'], ordered=True)

# Label every customer's tenure, then cast straight to the ordered categorical
df['TenureGroup'] = df['tenure'].apply(categorize_tenure).astype(tenure_cat_type)
