In [13]:
# train_model.ipynb

# 1. Install correct version
!pip install scikit-learn==1.2.1 pandas numpy joblib boto3 --quiet


In [14]:
# 2. Imports
import pandas as pd
import numpy as np
import joblib
import boto3
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [15]:
# 3. Load dataset from S3
bucket = "telco-churn-data-rcc"   # <-- your bucket
key = "Telco-Customer-Churn-prepared.csv"  # <-- confirm exact file name

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)
df = pd.read_csv(obj["Body"])

print(df.shape)
df.head()


(7043, 38)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_short,tenure_medium,tenure_long,charge_per_month
0,0,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,True,False,True,False,False,14.925
1,1,0,0,0,34,1,0,56.95,1889.5,0,...,True,False,False,False,False,True,False,True,False,1.627143
2,1,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,True,True,False,False,17.95
3,1,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,True,False,False,False,False,True,False,0.919565
4,0,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,True,False,True,False,False,23.566667


In [16]:
# 4. Train/Validation/Test Split (70/20/10)

# Define features (X) and target (y)
X = df.drop("Churn", axis=1)   # assuming "Churn" is your target column
y = df["Churn"]

# First split: 70% train, 30% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Second split: of the temp set, 20% val, 10% test (2/3 vs 1/3)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.3333, random_state=42
)

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (4930, 37), Val: (1408, 37), Test: (705, 37)


In [17]:
# 5. Scale features (fit on train, apply to val + test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)


In [18]:
# 6. Train model with the Random Forest Classifier.

# Example with tuned hyperparameters (adjust if you have them)
rf_model = RandomForestClassifier(
    n_estimators=197,
    max_depth=7,
    min_samples_split=7,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    class_weight="balanced"
)
rf_model.fit(X_train_scaled, y_train)


In [19]:
# 7. Evaluate on validation + test
y_val_pred = rf_model.predict(X_val_scaled)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))

y_test_pred = rf_model.predict(X_test_scaled)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


Validation Accuracy: 0.7627840909090909
Test Accuracy: 0.7687943262411348
              precision    recall  f1-score   support

           0       0.90      0.76      0.83       514
           1       0.55      0.78      0.65       191

    accuracy                           0.77       705
   macro avg       0.73      0.77      0.74       705
weighted avg       0.81      0.77      0.78       705



In [21]:
# 8. Save model + scaler artifacts for deployment
import os

output_dir = "../app/models"
os.makedirs(output_dir, exist_ok=True)

joblib.dump(rf_model, os.path.join(output_dir, "churn_model.pkl"))
joblib.dump(scaler, os.path.join(output_dir, "scaler.pkl"))

print("✅ Saved churn_model.pkl and scaler.pkl to app/models/")


✅ Saved churn_model.pkl and scaler.pkl to app/models/
