In [3]:
# Credit Score Classification Project

import os, re, zipfile, warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
import shap
from fpdf import FPDF

# ---------------------- Load Data ----------------------
for file in ['train.csv', 'test.csv', 'Sample_Output.csv']:
    if not os.path.exists(file):
        raise FileNotFoundError(f"Missing file: {file}")

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('Sample_Output.csv')
train['source'], test['source'], test['Credit_Score'] = 'train', 'test', np.nan
combined = pd.concat([train, test])

# ---------------------- Clean Data ----------------------
def clean_data(df):
    df.replace({'Occupation': '_______', 'SSN': '#F%$D@*&8', 'Payment_Behaviour': '!@9#%8'}, np.nan, inplace=True)
    for col in ['Occupation', 'SSN', 'Payment_Behaviour']:
        df[col] = df.groupby('Customer_ID')[col].transform(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else np.nan))
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
    df.loc[df['Age'] > 85, 'Age'] = np.nan
    df[df.select_dtypes(include=np.number).columns] = df.select_dtypes(include=np.number).apply(lambda col: col.where(col >= 0, np.nan))
    df[df.select_dtypes(include=np.number).columns] = df.groupby('Customer_ID')[df.select_dtypes(include=np.number).columns].transform(lambda x: x.fillna(x.median()))
    df['Credit_History_Age'] = df['Credit_History_Age'].apply(lambda x: float(re.findall(r'(\d+)', x)[0]) + float(re.findall(r'(\d+)', x)[1])/12 if isinstance(x, str) and len(re.findall(r'(\d+)', x)) >= 2 else np.nan)
    df['Payment_of_Min_Amount'] = df['Payment_of_Min_Amount'].replace({'NM': 'No', 'Not Available': np.nan})
    df['Payment_of_Min_Amount'] = df.groupby('Customer_ID')['Payment_of_Min_Amount'].transform(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else np.nan))
    df['Month'] = df['Month'].map({'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12})
    return df

combined = clean_data(combined)

# ---------------------- Encode + Scale ----------------------
combined['Credit_Score'] = combined['Credit_Score'].map({'Poor': 0, 'Standard': 1, 'Good': 2})
combined.drop(columns=['Name'], inplace=True)
le = LabelEncoder()
for col in combined.select_dtypes(include='object').columns.drop('source'):
    combined[col] = le.fit_transform(combined[col].astype(str))
scaler = StandardScaler()
num_cols = combined.select_dtypes(include=np.number).columns.drop(['ID', 'Customer_ID', 'Credit_Score'])
combined[num_cols] = scaler.fit_transform(combined[num_cols])

# ---------------------- Split Data ----------------------
train_df = combined[combined['source'] == 'train'].drop('source', axis=1)
test_df = combined[combined['source'] == 'test'].drop(['source', 'Credit_Score'], axis=1)
X = train_df.drop(['ID', 'Customer_ID', 'Credit_Score'], axis=1)
y = train_df['Credit_Score']
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# ---------------------- Feature Selection ----------------------
X_train = X_train.fillna(0)
mi = pd.Series(mutual_info_classif(X_train, y_train), index=X_train.columns).sort_values(ascending=False)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=15)
rfe.fit(X_train[mi.head(20).index], y_train)
selected = list(X_train[mi.head(20).index].columns[rfe.support_])
X_train, X_val = X_train[selected], X_val[selected]
X_test_final = test_df.drop(['ID', 'Customer_ID'], axis=1)[selected]

# ---------------------- Model Comparison ----------------------
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

best_model, best_score = None, 0
for name, model in models.items():
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    print(f"{name} CV Mean Accuracy: {score:.4f}")
    if score > best_score: best_score, best_model = score, model

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)
print("\nClassification Report:\n", report)
print("Accuracy:", acc)

# ---------------------- SHAP & Visuals ----------------------
explainer = shap.Explainer(best_model, X_train)
shap_vals = explainer(X_val[:100])
shap.summary_plot(shap_vals, X_val[:100], show=False)
plt.savefig("shap_summary.png"); plt.close()

plt.figure(figsize=(6,5))
sns.heatmap(confusion_matrix(y_val, y_pred), annot=True, fmt='d', cmap='Blues', xticklabels=['Poor','Standard','Good'], yticklabels=['Poor','Standard','Good'])
plt.title("Confusion Matrix"); plt.tight_layout()
plt.savefig("confusion_matrix.png"); plt.close()

imp = pd.Series(best_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
sns.barplot(x=imp[:15], y=imp.index[:15], palette="viridis")
plt.title("Top 15 Feature Importances"); plt.tight_layout()
plt.savefig("feature_importance.png"); plt.close()

# ---------------------- Final Output ----------------------
preds = best_model.predict(X_test_final)
preds = [ {0: 'Poor', 1: 'Standard', 2: 'Good'}[i] for i in preds ]
submission = pd.DataFrame({'ID': test_df['ID'], 'Credit_Score': preds})
submission.to_csv('final_predictions.csv', index=False)

# ---------------------- PDF Report ----------------------
pdf = FPDF(); pdf.add_page(); pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="Credit Score Classification Report", ln=True, align='C')
pdf.ln(10); pdf.multi_cell(0, 8, f"Model: {best_model.__class__.__name__}\n\nAccuracy: {acc:.4f}\n\n{report}")
pdf.image("shap_summary.png", w=180); pdf.output("report.pdf")

# ---------------------- ZIP Submission ----------------------
readme = f"""Credit Score Classification Submission\n\nIncluded:\n- final_predictions.csv\n- Sample_Output.csv\n- report.pdf\n- shap_summary.png\n- confusion_matrix.png\n- feature_importance.png\n\nModel: {best_model.__class__.__name__}\nValidation Accuracy: {acc:.4f}\nCV Accuracy: {best_score:.4f}"""
with open("README.txt", "w") as f: f.write(readme)

with zipfile.ZipFile("submission.zip", "w") as z:
    for f in ["final_predictions.csv", "Sample_Output.csv", "README.txt", "report.pdf", "shap_summary.png", "confusion_matrix.png", "feature_importance.png"]:
        z.write(f)

print("\n submission.zip created with predictions, visuals & report")


RandomForest CV Mean Accuracy: 0.7657
XGBoost CV Mean Accuracy: 0.7416
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002953 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2775
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 15
[LightGBM] [Info] Start training from score -1.237928
[LightGBM] [Info] Start training from score -0.631611
[LightGBM] [Info] Start training from score -1.724393
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2781
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 15
[LightGBM] [Info] Start training from score -1.237928
[LightGBM] [Info] Start training from score -0.631611
[LightGBM] [Info]




 submission2.zip created with predictions, visuals & report
