# In [None]:
# ============================================================
# 📘 Lightweight Model Creator for GitHub & Streamlit Deployment
# ============================================================

import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Output artifacts. The model path is unchanged; the scaler is written next to
# it so that inference code can apply the exact transform used for training.
MODEL_PATH = '../models/credit_score_model_light.pkl'
SCALER_PATH = '../models/credit_score_scaler_light.pkl'

# 1️⃣ Load your cleaned dataset
df = pd.read_csv('../data/cleaned_credit_score.csv')
print("✅ Cleaned dataset loaded!")
print("Shape before sampling:", df.shape)

# 2️⃣ Optional: train on a 10% random sample (fixed seed => reproducible rows)
df_small = df.sample(frac=0.1, random_state=42)
print("Sampled dataset shape:", df_small.shape)

# 3️⃣ Split features and target ('Credit_Score' is the label column)
X = df_small.drop('Credit_Score', axis=1)
y = df_small['Credit_Score']

# 4️⃣ Train-test split (80/20); stratify=y keeps the class proportions of the
#    label similar in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5️⃣ Scale the features. The scaler is fitted on the training split only and
#    then re-used on the test split, so test statistics never influence it.
# NOTE(review): StandardScaler requires numeric columns — assumes the cleaned
# CSV is already fully numeric/encoded; confirm against the cleaning step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6️⃣ Train a smaller RandomForest model
rf_light = RandomForestClassifier(
    n_estimators=50,   # fewer trees to reduce file size
    max_depth=10,      # limit depth for compactness
    random_state=42
)
rf_light.fit(X_train_scaled, y_train)

# 7️⃣ Evaluate performance on the held-out split
y_pred = rf_light.predict(X_test_scaled)
print("\n🔹 Lightweight Model Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 8️⃣ Save the model AND the fitted scaler.
#    The forest was trained on scaled features, so whoever loads the model
#    must apply this same scaler to raw inputs before calling predict().
#    Create the target directory first so a missing folder does not discard
#    the training work with a FileNotFoundError at the very last step.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(rf_light, MODEL_PATH)
print("✅ Lightweight model saved at: '../models/credit_score_model_light.pkl'")
joblib.dump(scaler, SCALER_PATH)
print("✅ Fitted scaler saved at: '../models/credit_score_scaler_light.pkl'")




