In [7]:
# retrain.py

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Retraining pipeline: load the cleaned churn CSV, one-hot encode the
# categorical features, fit a scaler and a random forest on a stratified
# training split, persist the artifacts, and report hold-out accuracy.

# Name of the label column; every other remaining column is a feature.
TARGET_COL = 'Churn'

# --- Load the data ---
df = pd.read_csv("cleaned_telco_churn.csv")

# --- Drop customerID if it exists ---
# NOTE(review): presumably a per-row identifier with no predictive value — confirm.
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

# Identify categorical (object-dtype) feature columns.
# The target is deliberately excluded: if it were still stored as strings,
# get_dummies(drop_first=True) would replace it with a dummy column and the
# `df.drop(TARGET_COL, ...)` below would fail with a KeyError.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col != TARGET_COL
]
print("Categorical columns:", categorical_cols)

# Use one-hot encoding; drop_first=True drops the first level of each
# categorical column so k levels yield k-1 indicator columns.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Define Features (X) and Target (y)
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# --- Train-test split ---
# 80/20 split, stratified on y so both parts keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Feature scaling ---
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
# NOTE(review): tree ensembles do not need scaled inputs; the scaler is kept
# because it is saved as an artifact below — presumably the consumer of
# model.pkl applies it before predicting. Verify before removing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train model ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# --- Save artifacts ---
# features.pkl stores the post-encoding column names in training order, so the
# same column layout can be rebuilt for new data.
joblib.dump(model, "model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(X.columns.tolist(), "features.pkl")

# --- Evaluate on test set ---
# `score` returns mean accuracy on the held-out 20%.
accuracy = model.score(X_test_scaled, y_test)
print(f"✅ Model retrained successfully. Test accuracy: {accuracy:.4f}")


Categorical columns: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
✅ Model retrained successfully. Test accuracy: 0.7875
