# 🌲 Random Forest Model — Player Churn Prediction

Train and evaluate a Random Forest classifier for predicting player churn.  
**Issue #6** requirements: F1 > 0.75, confusion matrix, feature importance, save model.

In [None]:
import os, sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

# Project root is taken to be the parent of the current working directory
# (assumes the kernel was started inside the notebooks folder).
BASE_DIR = os.path.split(os.getcwd())[0]
print(f"Project root: {BASE_DIR}")

## 1. Load & Preprocess Data

In [None]:
# Read the raw dataset from <project root>/data into a DataFrame.
dataset_path = os.path.join(BASE_DIR, "data", "online_gaming_behavior_dataset.csv")
df = pd.read_csv(dataset_path)
print(f"Dataset shape: {df.shape}")
# Trailing expression so the notebook renders the first rows as cell output.
df.head()

In [None]:
# Binary target: a player is labelled as churned when EngagementLevel is "Low".
is_low_engagement = df["EngagementLevel"] == "Low"
df["Churned"] = is_low_engagement.astype(int)
print(f"Churn rate: {df['Churned'].mean():.2%}")

# Integer-encode each categorical column in place. One fitted encoder is kept
# per column so the original category labels can be recovered later.
categorical_cols = ["Gender", "Location", "GameGenre", "GameDifficulty"]
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    df[col] = encoder.fit_transform(df[col])

# Derived features. Columns are added in a fixed order because the column
# order later becomes the model's feature order.
sessions_per_week = df["SessionsPerWeek"]
# The +1 presumably guards against division by zero for players with
# PlayTimeHours == 0 — verify against the data.
play_hours_plus_one = df["PlayTimeHours"] + 1

df["EngagementScore"] = sessions_per_week * df["AvgSessionDurationMinutes"]
df["ProgressionRate"] = df["PlayerLevel"] / play_hours_plus_one
df["PurchaseFrequency"] = df["InGamePurchases"] / play_hours_plus_one
df["IsInactive"] = (sessions_per_week <= 2).astype(int)
df["SessionConsistency"] = (sessions_per_week > 3).astype(int)

# Note: this is the total column count of df, including the ID, the source
# label column and the target.
print(f"Features after engineering: {df.shape[1]}")

In [None]:
# Separate predictors from the target. The player ID, the column the target
# is compared against, and the target itself are not used as features.
non_feature_cols = ["PlayerID", "EngagementLevel", "Churned"]
X = df.drop(columns=non_feature_cols)
y = df["Churned"]

# 80/20 split, seeded for reproducibility and stratified on the target so both
# splits keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise with statistics learned from the training split only, then wrap
# the resulting arrays back into DataFrames with the original labels.
scaler = StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

feature_names = X_train.columns.tolist()
print(f"Train: {X_train_scaled.shape}, Test: {X_test_scaled.shape}")
print(f"Features ({len(feature_names)}): {feature_names}")

## 2. Train Random Forest (n_estimators=100)

In [None]:
# Fit a 100-tree Random Forest on the scaled training split. The seed is fixed
# for reproducibility; n_jobs=-1 lets scikit-learn use all available cores.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)
# Status marker repaired: the check mark was mis-encoded (UTF-8 read as Mac Roman).
print("✅ Model trained")

## 3. Evaluate on Test Set

In [None]:
# Hard class predictions feed the threshold-based metrics; the probability of
# the positive class (column 1) feeds ROC-AUC.
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

acc     = accuracy_score(y_test, y_pred)
prec    = precision_score(y_test, y_pred)
rec     = recall_score(y_test, y_pred)
f1      = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# The requirement is F1 strictly above 0.75, so the failing case is "<= 0.75"
# (the previous label claimed "< 0.75" even when F1 was exactly 0.75).
f1_status = "✅ > 0.75" if f1 > 0.75 else "❌ <= 0.75"

print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}  {f1_status}")
print(f"ROC-AUC:   {roc_auc:.4f}")
print()
print(classification_report(y_test, y_pred, target_names=["Active", "Churned"]))

## 4. Confusion Matrix

In [None]:
# Tick labels in the order of the encoded target: 0 = Active, 1 = Churned.
class_labels = ["Active", "Churned"]

# Pin the label order so the matrix is always 2x2 and lines up with the tick
# labels, even if a class is missing from the test predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens",
            xticklabels=class_labels,
            yticklabels=class_labels)
# Title repaired: the em-dash was mis-encoded and would have been rendered
# as garbage in the saved image.
plt.title("Random Forest — Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()

# Save before plt.show(): some backends clear the figure once it is shown.
plots_dir = os.path.join(BASE_DIR, "notebooks", "plots")
os.makedirs(plots_dir, exist_ok=True)
cm_path = os.path.join(plots_dir, "confusion_matrix_rf.png")
plt.savefig(cm_path)
print(f"Saved to {cm_path}")
plt.show()

## 5. Feature Importance

In [None]:
# Impurity-based importances from the fitted forest, ordered from most to
# least important.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(10, 6))
plt.barh(
    [feature_names[i] for i in indices],
    importances[indices],
    color=sns.color_palette("viridis", len(feature_names)),
)
plt.xlabel("Importance")
# Title repaired: the em-dash was mis-encoded and would have been rendered
# as garbage in the saved image.
plt.title("Random Forest — Feature Importance")
# barh draws the first item at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.tight_layout()

# Create the output directory here as well, so this cell does not depend on
# the confusion-matrix cell having been run first.
plots_dir = os.path.join(BASE_DIR, "notebooks", "plots")
os.makedirs(plots_dir, exist_ok=True)
fi_path = os.path.join(plots_dir, "feature_importance_rf.png")
plt.savefig(fi_path)
print(f"Saved to {fi_path}")
plt.show()

# Print table
print("\nFeature Importance Ranking:")
for rank, i in enumerate(indices, 1):
    print(f"  {rank:2d}. {feature_names[i]:30s} {importances[i]:.4f}")

## 6. Save Results & Model

In [None]:
models_dir = os.path.join(BASE_DIR, "models")
os.makedirs(models_dir, exist_ok=True)

# Save metrics as a small plain-text report (explicit encoding so the file
# does not depend on the platform default).
results_path = os.path.join(models_dir, "rf_results.txt")
with open(results_path, "w", encoding="utf-8") as f:
    f.writelines([
        "Random Forest Evaluation Metrics\n",
        "================================\n",
        f"Accuracy: {acc:.4f}\n",
        f"Precision: {prec:.4f}\n",
        f"Recall: {rec:.4f}\n",
        f"F1 Score: {f1:.4f}\n",
        f"ROC-AUC: {roc_auc:.4f}\n",
    ])
print(f"✅ Results saved to {results_path}")

# Save model
model_path = os.path.join(models_dir, "rf_model.pkl")
joblib.dump(model, model_path)
print(f"✅ Model saved to {model_path}")

# The classifier was fitted on standardised, label-encoded features, so it
# cannot score raw data on its own. Persist the fitted scaler, the per-column
# label encoders and the training column order next to the model so that
# inference can reproduce the exact same preprocessing.
preprocessing_path = os.path.join(models_dir, "rf_preprocessing.pkl")
joblib.dump(
    {
        "scaler": scaler,
        "label_encoders": label_encoders,
        "feature_names": feature_names,
    },
    preprocessing_path,
)
print(f"✅ Preprocessing artifacts saved to {preprocessing_path}")

print(f"\n🎉 All done! F1 = {f1:.4f}")