In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report, log_loss
from scipy.special import expit
import json

file_path = "netflix_customer_churn.csv"
df = pd.read_csv(file_path)
print("Dataset Loaded Successfully ")
print("Shape:", df.shape)
print(df.head())

possible_labels = ['churn', 'Churn', 'Churn?', 'Exited', 'is_churn', 'Cancelled', 'CustomerChurn']
label_col = None
for c in df.columns:
    if c.lower() in [x.lower() for x in possible_labels]:
        label_col = c
        break
if label_col is None:
    for c in df.columns:
        if 'churn' in c.lower() or 'exit' in c.lower() or 'cancel' in c.lower():
            label_col = c
            break
if label_col is None:
    raise ValueError("Could not detect churn label column. Please rename the target column to 'Churn' or similar.")

print("Detected target column:", label_col)


def to_binary(series):
    mapping = {'yes':1,'y':1,'true':1,'t':1,'1':1,'no':0,'n':0,'false':0,'f':0,'0':0}
    s = series.astype(str).str.lower().str.strip()
    return s.map(mapping).fillna(0).astype(int)

y = to_binary(df[label_col])
X = df.drop(columns=[label_col])

id_cols = [c for c in X.columns if 'id' in c.lower() and X[c].nunique()==len(X)]
X = X.drop(columns=id_cols, errors='ignore')

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.select_dtypes(exclude=[np.number]).columns if X[c].nunique() <= 50]
X[num_cols] = X[num_cols].fillna(X[num_cols].median())
X[cat_cols] = X[cat_cols].fillna('missing').astype(str)

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train/Test Split Done ")
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

K = 5
indices = np.arange(len(X_train))
np.random.shuffle(indices)
parts = np.array_split(indices, K)
client_data = [(X_train.iloc[p], y_train.iloc[p]) for p in parts]

classes = np.array([0,1])
global_coef = np.zeros((1, X_train.shape[1]))
global_intercept = np.zeros(1)

n_rounds = 10
local_epochs = 3
alpha = 0.0001

val_accs = []
val_losses = []

for rnd in range(n_rounds):
    client_coefs, client_intercepts, client_sizes = [], [], []
    for Xi, yi in client_data:
        mdl = SGDClassifier(loss='log_loss', alpha=alpha, max_iter=1, tol=None, warm_start=True)
        mdl.partial_fit(Xi.values[:2], yi.values[:2], classes=classes)
        mdl.coef_ = global_coef.copy()
        mdl.intercept_ = global_intercept.copy()
        for _ in range(local_epochs):
            mdl.partial_fit(Xi.values, yi.values, classes=classes)
        client_coefs.append(mdl.coef_)
        client_intercepts.append(mdl.intercept_)
        client_sizes.append(len(yi))
    total_samples = np.sum(client_sizes)
    global_coef = np.sum([c*s for c, s in zip(client_coefs, client_sizes)], axis=0) / total_samples
    global_intercept = np.sum([i*s for i, s in zip(client_intercepts, client_sizes)], axis=0) / total_samples

    logits = X_test.values.dot(global_coef.T).ravel() + global_intercept.ravel()
    probs = expit(logits)
    preds = (probs >= 0.5).astype(int)
    acc = accuracy_score(y_test, preds)
    loss = log_loss(y_test, probs)
    val_accs.append(acc)
    val_losses.append(loss)
    print(f"Round {rnd+1}/{n_rounds}: Accuracy={acc:.4f}, LogLoss={loss:.4f}")

logits = X_test.values.dot(global_coef.T).ravel() + global_intercept.ravel()
probs = expit(logits)
preds = (probs >= 0.5).astype(int)

acc = accuracy_score(y_test, preds)
auc = roc_auc_score(y_test, probs)
print("\n Final Model Performance")
print("Accuracy:", round(acc, 4))
print("ROC AUC:", round(auc, 4))
print("\nClassification Report:\n", classification_report(y_test, preds))

plt.figure(figsize=(7,4))
plt.plot(range(1, n_rounds+1), val_accs, marker='o', label="Validation Accuracy")
plt.xlabel("Federated Round")
plt.ylabel("Accuracy")
plt.title("Federated Round-wise Accuracy")
plt.grid(True)
plt.legend()
plt.show()

plt.figure(figsize=(7,4))
plt.plot(range(1, n_rounds+1), val_losses, marker='o', color='red', label="Validation Log Loss")
plt.xlabel("Federated Round")
plt.ylabel("Log Loss")
plt.title("Validation Log Loss per Round")
plt.grid(True)
plt.legend()
plt.show()

fpr, tpr, _ = roc_curve(y_test, probs)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
plt.plot([0,1],[0,1],'--',color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid(True)
plt.show()

cm = confusion_matrix(y_test, preds)
plt.figure(figsize=(5,4))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix")
plt.colorbar()
plt.xticks([0,1], ["Predicted 0","Predicted 1"])
plt.yticks([0,1], ["Actual 0","Actual 1"])
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i,j], ha="center", va="center", color="black", fontsize=12)
plt.show()


model_info = {
    'coef': global_coef.ravel().tolist(),
    'intercept': global_intercept.ravel().tolist(),
    'feature_names': X.columns.tolist()
}
with open("federated_model_info.json", "w") as f:
    json.dump(model_info, f)
print("Model coefficients saved to federated_model_info.json ")


ModuleNotFoundError: No module named 'pandas'

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, log_loss
from scipy.special import expit
import json

# --- 1. Load Data ---
file_path = "netflix_customer_churn.csv"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Using mock data for demonstration.")
    # Create mock data structure based on common churn datasets for deployment
    data = {
        'CustomerID': range(1, 1001),
        'Age': np.random.randint(18, 70, 1000),
        'Subscription_Length_Months': np.random.randint(1, 60, 1000),
        'Monthly_Bill': np.random.uniform(10, 150, 1000),
        'Total_Usage_Hours': np.random.uniform(50, 500, 1000),
        'Gender': np.random.choice(['Male', 'Female'], 1000, p=[0.5, 0.5]),
        'Plan': np.random.choice(['Basic', 'Standard', 'Premium'], 1000, p=[0.4, 0.4, 0.2]),
        'Payment_Method': np.random.choice(['Credit Card', 'Bank Transfer', 'PayPal'], 1000, p=[0.5, 0.3, 0.2]),
        'Device_Type': np.random.choice(['Mobile', 'Desktop', 'Smart TV'], 1000, p=[0.5, 0.3, 0.2]),
        'CustomerChurn': np.random.randint(0, 2, 1000) # Target
    }
    df = pd.DataFrame(data)

# --- 2. Identify Target Column and Prepare Data ---
possible_labels = ['churn', 'Churn', 'Churn?', 'Exited', 'is_churn', 'Cancelled', 'CustomerChurn']
label_col = None
for c in df.columns:
    if c.lower() in [x.lower() for x in possible_labels]:
        label_col = c
        break
if label_col is None:
    raise ValueError("Could not detect churn label column. Please rename the target column to 'Churn' or similar.")

def to_binary(series):
    mapping = {'yes':1,'y':1,'true':1,'t':1,'1':1,'no':0,'n':0,'false':0,'f':0,'0':0}
    s = series.astype(str).str.lower().str.strip()
    return s.map(mapping).fillna(0).astype(int)

y = to_binary(df[label_col])
X = df.drop(columns=[label_col])

id_cols = [c for c in X.columns if 'id' in c.lower() and X[c].nunique()==len(X)]
X = X.drop(columns=id_cols, errors='ignore')

# --- 3. Preprocessing Metadata Capture ---
num_cols_original = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols_original = [c for c in X.select_dtypes(exclude=[np.number]).columns if X[c].nunique() <= 50]

# Impute missing values (as per original script)
X[num_cols_original] = X[num_cols_original].fillna(X[num_cols_original].median())
X[cat_cols_original] = X[cat_cols_original].fillna('missing').astype(str)

# One-hot encoding (drop_first=True, as per original script)
X_processed = pd.get_dummies(X, columns=cat_cols_original, drop_first=True)

# Capture one-hot mapping for the front-end
one_hot_map = {}
for col in cat_cols_original:
    # Identify which new dummy columns belong to this original column
    dummy_cols = [c for c in X_processed.columns if c.startswith(f'{col}_')]

    # Get the unique categories (excluding the first one, which was dropped)
    categories = X[col].unique().tolist()

    # Store the mapping: Original_Value -> Processed_Column_Name
    # The first value (base case) maps to no column (all zeros)
    base_category = sorted(categories)[0] # Assuming alphabetically first is the dropped one

    # Map all categories except the base one to their respective dummy column
    for cat in categories:
        if cat != base_category:
            dummy_col_name = f"{col}_{cat}"
            if dummy_col_name in dummy_cols:
                one_hot_map[f"{col}__{cat}"] = dummy_col_name
        else:
             # This is the reference category (all dummy columns for this feature are 0)
            one_hot_map[f"{col}__{cat}"] = "REFERENCE"

# --- 4. Scaling (StandardScaler fitted on *all* processed features) ---
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_processed)
X_scaled = pd.DataFrame(X_scaled, columns=X_processed.columns)

# --- 5. Model Training (Federated Averaging Logic) ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

K = 5
indices = np.arange(len(X_train))
np.random.shuffle(indices)
parts = np.array_split(indices, K)
client_data = [(X_train.iloc[p], y_train.iloc[p]) for p in parts]

classes = np.array([0,1])
global_coef = np.zeros((1, X_train.shape[1]))
global_intercept = np.zeros(1)

n_rounds = 10
local_epochs = 3
alpha = 0.0001

for rnd in range(n_rounds):
    client_coefs, client_intercepts, client_sizes = [], [], []
    for Xi, yi in client_data:
        mdl = SGDClassifier(loss='log_loss', alpha=alpha, max_iter=1, tol=None, warm_start=True, random_state=42)
        # partial_fit needs to be called once with all classes to initialize
        mdl.partial_fit(Xi.values[:2], yi.values[:2], classes=classes)
        mdl.coef_ = global_coef.copy()
        mdl.intercept_ = global_intercept.copy()
        for _ in range(local_epochs):
            mdl.partial_fit(Xi.values, yi.values, classes=classes)
        client_coefs.append(mdl.coef_)
        client_intercepts.append(mdl.intercept_)
        client_sizes.append(len(yi))

    total_samples = np.sum(client_sizes)
    global_coef = np.sum([c*s for c, s in zip(client_coefs, client_sizes)], axis=0) / total_samples
    global_intercept = np.sum([i*s for i, s in zip(client_intercepts, client_sizes)], axis=0) / total_samples

    logits = X_test.values.dot(global_coef.T).ravel() + global_intercept.ravel()
    probs = expit(logits)
    acc = accuracy_score(y_test, (probs >= 0.5).astype(int))
    loss = log_loss(y_test, probs)
    print(f"Round {rnd+1}/{n_rounds}: Accuracy={acc:.4f}, LogLoss={loss:.4f}")

# --- 6. Save Model and Metadata ---

# Ensure scaler mean_ and scale_ match the shape of the features
if len(scaler.mean_) != X_scaled.shape[1]:
    raise RuntimeError("Scaler dimensions mismatch with final features.")

model_metadata = {
    'coef': global_coef.ravel().tolist(),
    'intercept': global_intercept.ravel().tolist(),
    'feature_names_processed': X_scaled.columns.tolist(), # The 1D list of all feature names for the model
    'scaler': {
        'mean': scaler.mean_.tolist(),
        'std': scaler.scale_.tolist()
    },
    'original_features': {
        'numerical': num_cols_original,
        'categorical': cat_cols_original
    },
    'categorical_map': one_hot_map
}

with open("model_metadata.json", "w") as f:
    json.dump(model_metadata, f, indent=4)
print("\nFinal federated model coefficients and preprocessing metadata saved to model_metadata.json")

# --- End of Python Script ---


Error: The file 'netflix_customer_churn.csv' was not found. Using mock data for demonstration.
Round 1/10: Accuracy=0.5000, LogLoss=7.3052
Round 2/10: Accuracy=0.5350, LogLoss=5.2290
Round 3/10: Accuracy=0.4750, LogLoss=10.3164
Round 4/10: Accuracy=0.5550, LogLoss=7.4042
Round 5/10: Accuracy=0.4800, LogLoss=7.8545
Round 6/10: Accuracy=0.4700, LogLoss=8.3237
Round 7/10: Accuracy=0.4850, LogLoss=7.1941
Round 8/10: Accuracy=0.4850, LogLoss=7.8153
Round 9/10: Accuracy=0.4900, LogLoss=8.4985
Round 10/10: Accuracy=0.4950, LogLoss=9.4722

Final federated model coefficients and preprocessing metadata saved to model_metadata.json
