In [45]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd

In [47]:
# Read the raw customer table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Data.csv')

# Bare last expression so Jupyter renders the frame as rich output.
data

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [49]:
# One stratified shuffle, 20 % of the rows held out, fixed seed for repeatability.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)

In [51]:
# Stratified shuffle split of the raw frame, stratified on the Churn label.
X = data[[
    'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Contract',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
]]
y = data['Churn']

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair of
# positional index arrays; take it directly instead of looping.
train_index, test_index = next(split.split(X, y))

# The index arrays select rows from the full frame, so both subsets keep
# every column of `data`, not just the ones listed in X.
train_set, test_set = data.iloc[train_index], data.iloc[test_index]

print("Train set shape:", train_set.shape)
print("Test set shape:", test_set.shape)
train_set.columns

Train set shape: (5634, 21)
Test set shape: (1409, 21)


Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [53]:
# Clean the training split, then separate the features (X) from the target (y).

# errors="ignore" keeps this cell re-runnable: on a second execution the
# customerID column is already gone and a plain drop would raise KeyError.
train_set = train_set.drop(columns=["customerID"], errors="ignore")

# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN
# and are then filled with the median of the parsed training values.
total_charges = pd.to_numeric(train_set["TotalCharges"], errors="coerce")
train_set["TotalCharges"] = total_charges.fillna(total_charges.median())

# Target as 0/1 ("No" -> 0, "Yes" -> 1); everything except Churn is a feature.
y = train_set["Churn"].map({"No": 0, "Yes": 1})
X = train_set.drop("Churn", axis=1)


In [55]:
# Defining groups of columns, one list per preprocessing treatment.

# Categorical columns listed as binary.
binary_features = [
    "gender",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]

# Categorical columns listed as multi-class.
multi_features = [
    "Contract",
    "PaymentMethod",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

# Numeric columns intended for scaling.
numeric_features = [
    "tenure",
    "MonthlyCharges",
    "TotalCharges",
]

# Numeric column kept as-is.
numeric_pass = [
    "SeniorCitizen",
]


In [57]:
# Using a preprocessor for encoding & scaling: each column group gets its own
# transformer inside a single ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("binary", OneHotEncoder(drop="if_binary"), binary_features),      # binary categoricals -> one 0/1 column each
        ("multi", OneHotEncoder(handle_unknown="ignore"), multi_features), # multi-class categoricals, unseen levels ignored
        ("num", StandardScaler(), numeric_features),                       # scale numerical
        ("pass", "passthrough", numeric_pass)                              # keep SeniorCitizen
    ]
)

# Fit on the training features X and transform them. The result is bound to
# its own name: assigning it to `test_set` would overwrite the held-out split
# with transformed training rows.
X_train_prepared = preprocessor.fit_transform(X)

In [68]:
# Fit the preprocessor on the training features and encode/scale them.
# The matrix gets its own name instead of `test_set`: X holds training rows,
# and rebinding `test_set` here would replace the held-out split.
X_train_prepared = preprocessor.fit_transform(X)

# Rebuild the column labels in the order the transformers are listed in the
# ColumnTransformer: binary one-hot, multi-class one-hot, scaled numerics,
# passthrough.
binary_names = preprocessor.named_transformers_["binary"].get_feature_names_out(binary_features)
multi_names  = preprocessor.named_transformers_["multi"].get_feature_names_out(multi_features)
num_names    = numeric_features
pass_names   = numeric_pass
all_features = np.concatenate([binary_names, multi_names, num_names, pass_names])

# The transformer output may be a sparse matrix (hence the hasattr check);
# densify it before building the DataFrame. Keep X's index so rows stay
# aligned with the target.
X_train_dense = (
    X_train_prepared.toarray()
    if hasattr(X_train_prepared, "toarray")
    else X_train_prepared
)
X_train_prepared_df = pd.DataFrame(X_train_dense, columns=all_features, index=X.index)

# Backward-compatible alias: the following cell refers to this frame as
# `test_set_df`, even though it contains the transformed training features.
test_set_df = X_train_prepared_df


In [70]:
# Display the frame assembled from the preprocessor output.
# NOTE: despite its name, this frame is built from X (the training features),
# not from the held-out test split.
test_set_df

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,PaperlessBilling_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),...,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,tenure,MonthlyCharges,TotalCharges,SeniorCitizen
3738,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.102371,-0.521976,-0.263290,0.0
3151,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,-0.711743,0.337478,-0.504815,0.0
4860,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,-0.793155,-0.809013,-0.751214,0.0
3867,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,-0.263980,0.284384,-0.173700,0.0
3810,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,-1.281624,-0.676279,-0.990851,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6303,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.567778,1.470695,2.373711,0.0
6227,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,-1.240918,-0.626504,-0.975133,0.0
4673,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,-0.304686,1.256662,0.157569,1.0
2710,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,-0.345392,-1.477661,-0.798435,0.0
