In [1]:
# .venv/bin/python
import pandas as pd
df = pd.read_csv('/Users/seokhyunyoon/Downloads/Churn/WA_Fn-UseC_-Telco-Customer-Churn (1).csv')
print(df.head())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

Data Cleaning

In [2]:
# Checking column types

print(df['TotalCharges'].dtype)
print(df['TotalCharges'].unique()[:10])  # peek at raw values

object
['29.85' '1889.5' '108.15' '1840.75' '151.65' '820.5' '1949.4' '301.9'
 '3046.05' '3487.95']


In [3]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [4]:
print(df.isnull().sum())

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [5]:
df = df.dropna(subset=['TotalCharges'])
df.reset_index(drop=True, inplace=True)

Design Fact & Dimension Tables (Data Warehouse Schema)

In [6]:
dim_customer = df[['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents']].copy()

In [7]:
dim_services = df[['customerID', 'PhoneService', 'MultipleLines',
                   'InternetService', 'OnlineSecurity', 'OnlineBackup',
                   'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']].copy()

In [8]:
dim_contract = df[['customerID', 'Contract', 'PaperlessBilling', 'PaymentMethod']].copy()

In [9]:
fact_customer_activity = df[['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']].copy()

OLAP-Style Exploration with pandas

In [10]:
df['Churn'] = (df['Churn'] == 'Yes').astype(int)
df.groupby('Contract')['Churn'].mean().sort_values(ascending=False) 

Contract
Month-to-month    0.427097
One year          0.112772
Two year          0.028487
Name: Churn, dtype: float64

In [11]:
df.groupby('InternetService')['Churn'].mean().sort_values(ascending=False)

InternetService
Fiber optic    0.418928
DSL            0.189983
No             0.074342
Name: Churn, dtype: float64

In [12]:
df['TenureGroup'] = pd.cut(df['tenure'], bins=[0, 12, 24, 48, 72], labels=['0-12m', '13-24m', '25-48m', '49-72m'])
df.groupby('TenureGroup')['Churn'].mean()

  df.groupby('TenureGroup')['Churn'].mean()


TenureGroup
0-12m     0.476782
13-24m    0.287109
25-48m    0.203890
49-72m    0.095132
Name: Churn, dtype: float64

In [13]:
df.groupby('PaymentMethod')['Churn'].mean().sort_values(ascending=False)

PaymentMethod
Electronic check             0.452854
Mailed check                 0.192020
Bank transfer (automatic)    0.167315
Credit card (automatic)      0.152531
Name: Churn, dtype: float64

In [14]:
df['ChargeBucket'] = pd.cut(df['MonthlyCharges'], bins=[0, 35, 70, 100], labels=['Low', 'Medium', 'High'])
df.groupby('ChargeBucket')['Churn'].mean()

  df.groupby('ChargeBucket')['Churn'].mean()


ChargeBucket
Low       0.109312
Medium    0.239837
High      0.378499
Name: Churn, dtype: float64

In [15]:
df.to_csv('cleaned_telco_churn.csv', index=False)

Preprocessing for Modeling

In [16]:
df_model = df.drop(['customerID'], axis=1)

In [17]:
df_encoded = pd.get_dummies(df_model, drop_first=True)

In [18]:
X = df_encoded.drop('Churn', axis=1).values
y = df_encoded['Churn'].values

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import torch.nn.init as init
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
smote = SMOTE()
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

#3. Train the Model
model = nn.Sequential(
    nn.Linear(X_train_res.shape[1], 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
)

Define & Train Your PyTorch Model

In [20]:
#2. Import PyTorch and Prepare Tensors

# Convert to numpy arrays first if they aren't already
X_train_res = np.array(X_train_res, dtype=np.float32)
X_test = np.array(X_test, dtype=np.float32)
y_train_res = np.array(y_train_res, dtype=np.float32)
y_tests = np.array(y_test, dtype=np.float32)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

X_train_tensor = torch.from_numpy(X_train_scaled).float()
X_test_tensor = torch.from_numpy(X_test_scaled).float()

# Now create tensors
X_train_tensor = torch.from_numpy(X_train_res).float()
y_train_tensor = torch.from_numpy(y_train_res.reshape(-1, 1)).float()
X_test_tensor = torch.from_numpy(X_test).float()
y_test_tensor = torch.from_numpy(y_test.reshape(-1, 1)).float()

# Create DataLoader
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

In [21]:
#3. Define the Neural Network
class ChurnNet(nn.Module):
    def __init__(self, input_dim):
        super(ChurnNet, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

model = ChurnNet(input_dim=X_train.shape[1])

In [22]:
#4. Train the Model
# First, define the neural network model
class EnhancedChurnModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        return torch.sigmoid(self.net(x))
    
# Create the model instance
input_size = X_train_res.shape[1]  # Number of features
model = EnhancedChurnModel(input_size)

# Now define the loss function and optimizer
#criterion = nn.BCELoss() #replace this with the following code
# Calculate class weights
pos_weight = torch.tensor([sum(y_train == 0) / sum(y_train == 1)], dtype=torch.float32)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# (Optional) Weighted Random Sampler for DataLoader
from torch.utils.data import WeightedRandomSampler

weights = torch.where(torch.from_numpy(y_train).float() == 1, 
                     torch.tensor([sum(y_train == 0) / sum(y_train == 1)]), 
                     torch.tensor(1.0))
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
train_loader = DataLoader(train_data, batch_size=64, sampler=sampler)

optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/100, Loss: 93.2702
Epoch 2/100, Loss: 89.0097
Epoch 3/100, Loss: 88.1608
Epoch 4/100, Loss: 87.2554
Epoch 5/100, Loss: 87.5533
Epoch 6/100, Loss: 86.9431
Epoch 7/100, Loss: 87.5652
Epoch 8/100, Loss: 86.6726
Epoch 9/100, Loss: 86.6474
Epoch 10/100, Loss: 86.5001
Epoch 11/100, Loss: 86.4577
Epoch 12/100, Loss: 87.5083
Epoch 13/100, Loss: 87.3151
Epoch 14/100, Loss: 86.4932
Epoch 15/100, Loss: 86.2660
Epoch 16/100, Loss: 87.1266
Epoch 17/100, Loss: 86.9949
Epoch 18/100, Loss: 86.7648
Epoch 19/100, Loss: 86.3999
Epoch 20/100, Loss: 86.6441
Epoch 21/100, Loss: 86.5821
Epoch 22/100, Loss: 86.3198
Epoch 23/100, Loss: 86.9167
Epoch 24/100, Loss: 87.2122
Epoch 25/100, Loss: 87.2029
Epoch 26/100, Loss: 86.5034
Epoch 27/100, Loss: 87.1707
Epoch 28/100, Loss: 86.6036
Epoch 29/100, Loss: 86.3410
Epoch 30/100, Loss: 86.3165
Epoch 31/100, Loss: 86.3758
Epoch 32/100, Loss: 87.0807
Epoch 33/100, Loss: 85.9800
Epoch 34/100, Loss: 85.9807
Epoch 35/100, Loss: 86.6561
Epoch 36/100, Loss: 86.4989
E

Evaluate the Model


In [23]:
#2. Evaluate with Accuracy and Confusion Matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#1. Make Predictions on Test Set
model.eval()
with torch.no_grad():
    y_pred_probs = model(X_test_tensor)
    y_pred_labels = (y_pred_probs > 0.5).float()


# Convert to NumPy
y_true = y_test_tensor.numpy()
y_pred = y_pred_labels.numpy()

# Accuracy
print("Accuracy:", accuracy_score(y_true, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Precision, Recall, F1
print("Classification Report:\n", classification_report(y_true, y_pred, digits=3))

Accuracy: 0.6183368869936035
Confusion Matrix:
 [[524 509]
 [ 28 346]]
Classification Report:
               precision    recall  f1-score   support

         0.0      0.949     0.507     0.661      1033
         1.0      0.405     0.925     0.563       374

    accuracy                          0.618      1407
   macro avg      0.677     0.716     0.612      1407
weighted avg      0.805     0.618     0.635      1407

