In [None]:

# ✅ IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample

# ✅ SHARED SETTINGS
TARGET_COL = "Churn"
ID_COL = "CustomerID"
categorical_cols = ["Gender", "Subscription Type", "Contract Length"]
numeric_cols = ["Age", "Tenure", "Usage Frequency", "Support Calls", "Payment Delay", "Total Spend", "Last Interaction"]


def _encode_churn(labels: pd.Series) -> pd.Series:
    """Convert raw churn labels to 1.0 / 0.0, leaving NaN where unrecognised.

    Accepts 'Yes'/'No' (any casing, surrounding whitespace ignored) as well as
    labels that are already numeric 0/1 (e.g. 1, '1', 1.0). Anything else
    becomes NaN so the caller can decide what to do with unlabelled rows.
    """
    text = labels.astype(str).str.strip()
    # Explicit float mapping avoids relying on the implicit object->int
    # downcasting of Series.replace, which pandas has deprecated.
    from_text = text.str.lower().map({"yes": 1.0, "no": 0.0}).astype(float)
    from_number = pd.to_numeric(text, errors="coerce")
    from_number = from_number.where(from_number.isin([0, 1]))
    return from_text.fillna(from_number)


def _load_labelled_frame(path: str) -> pd.DataFrame:
    """Read a CSV, strip header whitespace and return rows with a valid 0/1 Churn.

    Rows whose Churn value cannot be interpreted are dropped; the Churn column
    of the returned frame has an integer dtype.
    """
    frame = pd.read_csv(path)
    frame.columns = frame.columns.str.strip()
    frame[TARGET_COL] = _encode_churn(frame[TARGET_COL])
    return frame.dropna(subset=[TARGET_COL]).astype({TARGET_COL: int})


# ✅ LOAD TRAINING DATA
train = _load_labelled_frame("train_file.csv")

# ✅ DROP UNNEEDED COLUMNS
train = train.drop(columns=[ID_COL], errors="ignore")

# ✅ ENCODING
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

# ✅ BALANCING
# Downsample whichever class is actually larger instead of assuming it is the
# churners; sampling without replacement fails if more rows are requested
# than the sampled class contains.
churned = train[train[TARGET_COL] == 1]
retained = train[train[TARGET_COL] == 0]
if len(churned) >= len(retained):
    majority, minority = churned, retained
else:
    majority, minority = retained, churned
if minority.empty:
    raise ValueError("Both churn classes (0 and 1) must be present in the training data to balance it.")
majority_down = resample(majority, replace=False, n_samples=len(minority), random_state=42)
train_balanced = pd.concat([majority_down, minority]).sample(frac=1, random_state=42)

# ✅ SPLIT FEATURES
X = train_balanced.drop(columns=[TARGET_COL])
y = train_balanced[TARGET_COL]

# ✅ SPLIT DATA
# NOTE: the split is taken from the balanced frame, so the validation set is
# class-balanced as well; the test set below keeps its own class distribution.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# Work on explicit copies so the scaled values are written to independent
# frames rather than to possible views of `X`.
X_train = X_train.copy()
X_val = X_val.copy()

# ✅ SCALE
# The scaler is fitted on the training split only and reused everywhere else.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = scaler.transform(X_val[numeric_cols])

# ✅ TRAIN MODEL
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# ✅ VALIDATION
y_val_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print(classification_report(y_val, y_val_pred))
print(confusion_matrix(y_val, y_val_pred))

# ✅ LOAD & PREPROCESS TEST SET
test = _load_labelled_frame("test_file.csv")
y_test = test[TARGET_COL]
test = test.drop(columns=[TARGET_COL]).drop(columns=[ID_COL], errors="ignore")
# Keep every dummy level here: which level `drop_first` removes depends on the
# categories present in *this* file, so dropping independently of the training
# data could remove a different level and mis-encode rows. The reindex below
# selects exactly the dummy columns the model was trained on.
test = pd.get_dummies(test, columns=categorical_cols, drop_first=False)

# ✅ ALIGN & SCALE
# Same columns, same order as training; levels never seen in the test file are
# filled with 0 and levels unknown to the model are discarded.
test = test.reindex(columns=X_train.columns, fill_value=0)
test[numeric_cols] = scaler.transform(test[numeric_cols])

# ✅ PREDICT
y_pred_test = model.predict(test)
print("Final Test Accuracy:", accuracy_score(y_test, y_pred_test))
print(classification_report(y_test, y_pred_test))
print(confusion_matrix(y_test, y_pred_test))
