In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv("../Dataset/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [9]:
df = df.drop('customerID', axis=1)

In [10]:
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [14]:
# Check data types and missing values
print("Data Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())
print("\nFirst few rows:")
print(df.head())
print("\nUnique values per column:")
for col in df.columns:
    print(f"{col}: {df[col].nunique()} unique values - {df[col].unique()[:5]}")

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null

In [None]:
# Convert TotalCharges to numeric (it's currently a string)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['MonthlyCharges'], inplace=True)

# ===== BINARY FEATURES (Simple Yes/No) =====
# gender, Partner, Dependents, PhoneService, PaperlessBilling
binary_features = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
binary_mapping = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}

for col in binary_features:
    df[col] = df[col].map(binary_mapping)

# ===== FEATURES WITH SERVICE DEPENDENCIES =====
# MultipleLines - depends on PhoneService
df['MultipleLines_Yes'] = (df['MultipleLines'] == 'Yes').astype(int)
df['MultipleLines_NoService'] = (df['MultipleLines'] == 'No phone service').astype(int)

# OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies
# All depend on InternetService
internet_services = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 
                     'TechSupport', 'StreamingTV', 'StreamingMovies']
for col in internet_services:
    df[f'{col}_Yes'] = (df[col] == 'Yes').astype(int)
    df[f'{col}_NoInternet'] = (df[col] == 'No internet service').astype(int)

# Drop original service columns (now replaced with binary features)
df = df.drop(['MultipleLines'] + internet_services, axis=1)

# ===== ONE-HOT ENCODE MULTI-CLASS FEATURES =====
# InternetService, Contract, PaymentMethod
df = pd.get_dummies(df, columns=['InternetService', 'Contract', 'PaymentMethod'], drop_first=True)

print("Encoding complete!")   
print(f"Final shape: {df.shape}")
print(f"\nColumn names ({len(df.columns)} total):")
print(df.columns.tolist())

Encoding complete!
Final shape: (7043, 31)

Column names (31 total):
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'PaperlessBilling', 'MonthlyCharges', 'TotalCharges', 'Churn', 'MultipleLines_Yes', 'MultipleLines_NoService', 'OnlineSecurity_Yes', 'OnlineSecurity_NoInternet', 'OnlineBackup_Yes', 'OnlineBackup_NoInternet', 'DeviceProtection_Yes', 'DeviceProtection_NoInternet', 'TechSupport_Yes', 'TechSupport_NoInternet', 'StreamingTV_Yes', 'StreamingTV_NoInternet', 'StreamingMovies_Yes', 'StreamingMovies_NoInternet', 'InternetService_Fiber optic', 'InternetService_No', 'Contract_One year', 'Contract_Two year', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['MonthlyCharges'], inplace=True)


In [18]:
# Split features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")
print(f"\nTraining set class distribution:\n{y_train.value_counts()}")
print(f"\nTest set class distribution:\n{y_test.value_counts()}")


Training set shape: (5634, 30)
Test set shape: (1409, 30)

Training set class distribution:
Churn
0    4139
1    1495
Name: count, dtype: int64

Test set class distribution:
Churn
0    1035
1     374
Name: count, dtype: int64
