In [None]:
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw training data. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-type inference within a column.
train_df = pd.read_csv("creditscore/train.csv", low_memory=False)

## Data Overview

In [None]:
train_df.shape

In [None]:
train_df.sample(30)

In [None]:
train_df.info()

In [None]:
train_df.nunique()

## Data Preparation

### Missing and duplicated values

In [None]:
train_df.isnull().sum()

In [None]:
train_df[train_df.duplicated()]

## Data Cleaning

In [None]:
# Columns treated as numeric features; clean_data() fills their missing values
# with the column median.
numerical_columns = [
    "Age",
    "Annual_Income",
    "Monthly_Inhand_Salary",
    "Num_Bank_Accounts",
    "Num_Credit_Card",
    "Interest_Rate",
    "Num_of_Loan",
    "Delay_from_due_date",
    "Num_of_Delayed_Payment",
    "Changed_Credit_Limit",
    "Num_Credit_Inquiries",
    "Outstanding_Debt",
    "Credit_Utilization_Ratio",
    "Credit_History_Age",
    "Total_EMI_per_month",
    "Amount_invested_monthly",
    "Monthly_Balance",
]

In [None]:
# Columns that prepare_data() casts to the pandas "category" dtype.
categorical_columns = [
    # Categorical fields read from the source data (Credit_Score is the target).
    "Occupation",
    "Credit_Mix",
    "Payment_of_Min_Amount",
    "Payment_Behaviour",
    "Credit_Score",
    "Month",
] + [
    # Boolean loan-type flags derived from Type_of_Loan by clean_type_of_loan().
    "auto_loan",
    "credit-builder_loan",
    "debt_consolidation_loan",
    "home_equity_loan",
    "mortgage_loan",
    "unspecified_loan",
    "payday_loan",
    "personal_loan",
    "student_loan",
]

In [None]:
def clean_type_of_loan(frame):
    """Expand the free-text ``Type_of_Loan`` column into one boolean flag per loan type.

    A flag is True when the (lower-cased) ``Type_of_Loan`` text contains the
    corresponding phrase. Rows whose ``Type_of_Loan`` is missing get False for
    every flag.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a string ``Type_of_Loan`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, with the nine flag columns added.
    """
    # Flag column name -> phrase to look for in the lower-cased loan description.
    loan_phrases = {
        "auto_loan": "auto loan",
        "credit-builder_loan": "credit-builder loan",
        "debt_consolidation_loan": "debt consolidation loan",
        "home_equity_loan": "home equity loan",
        "mortgage_loan": "mortgage loan",
        "unspecified_loan": "not specified",
        "payday_loan": "payday loan",
        "personal_loan": "personal loan",
        "student_loan": "student loan",
    }
    # Lower-case once instead of once per flag.
    loan_text = frame["Type_of_Loan"].str.lower()
    for flag_column, phrase in loan_phrases.items():
        # na=False: without it a missing value yields NaN, which astype(bool)
        # would turn into True. regex=False: the phrases are plain literals.
        frame[flag_column] = loan_text.str.contains(phrase, regex=False, na=False).astype(bool)
    return frame

In [None]:
def clean_credit_age(age):
    """Convert a credit-history age such as ``"22 Years and 6 Months"`` to years.

    Parameters
    ----------
    age : str or missing value
        Text of the form ``"<Y> Years and <M> Months"``. The string ``'nan'``,
        ``None`` and float NaN are all treated as missing.

    Returns
    -------
    float or object
        ``Y + M / 12`` for a well-formed string, ``numpy.nan`` for a missing
        value, and the input unchanged for anything else (the caller is
        expected to coerce such leftovers afterwards).
    """
    # Real missing values (not yet stringified) are handled explicitly so the
    # substring test below never runs against a non-string.
    if age is None or (isinstance(age, float) and np.isnan(age)):
        return np.nan
    if not isinstance(age, str):
        return age
    if age == 'nan':
        return np.nan
    if "Years" not in age:
        return age
    years, months = age.split(" Years and ")
    months = months.replace(" Months", "")
    return int(years) + int(months) / 12

In [None]:
def clean_outliers(frame):
    """Tame extreme values in ``Age``, ``Num_Bank_Accounts`` and ``Monthly_Balance``.

    ``Age`` values above 65 are set to 65 and ``Num_Bank_Accounts`` values above
    1000 are set to 1000. ``Monthly_Balance`` values above 1e6 are replaced with
    NaN. The frame is modified in place and also returned.
    """
    # Column -> upper cap; values above the cap are replaced by the cap itself.
    upper_caps = {"Age": 65, "Num_Bank_Accounts": 1000}
    for column, cap in upper_caps.items():
        above_cap = frame[column] > cap
        frame.loc[above_cap, column] = cap

    # Balances above one million are blanked out rather than capped.
    implausible_balance = frame["Monthly_Balance"] > 1e6
    frame.loc[implausible_balance, "Monthly_Balance"] = np.nan
    return frame

In [None]:
def clean_data(frame, is_test=False):
    """Clean the raw credit-score frame and impute missing values.

    Steps, in order: strip non-numeric characters from numeric columns stored
    as text, convert ``Credit_History_Age`` to fractional years, derive the
    loan-type flags, cap outliers, blank out placeholder tokens in
    ``Occupation`` / ``Payment_Behaviour``, drop duplicate rows and identifier
    columns, then fill missing values (median for columns listed in
    ``numerical_columns``, mode for everything else).

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw data as loaded from the CSV. It is not modified.
    is_test : bool, default False
        When True, a ``Credit_Score`` column (if present) is left un-imputed.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Work on a copy so the caller's frame is not mutated by the column
    # assignments below.
    frame = frame.copy()

    numeric_object_columns = ['Age', 'Annual_Income', 'Delay_from_due_date', 'Num_of_Loan', 'Num_of_Delayed_Payment',
                              'Changed_Credit_Limit', 'Outstanding_Debt', 'Amount_invested_monthly', 'Monthly_Balance']
    for col in numeric_object_columns:
        # Keep only digits and dots, then parse; anything unparsable becomes NaN.
        # NOTE(review): this pattern also removes a leading '-', so negative
        # values lose their sign. Confirm that is intended for every column here.
        frame[col] = frame[col].astype(str).str.replace(r'[^\d\.]', '', regex=True)
        frame[col] = pd.to_numeric(frame[col], errors="coerce")
    frame["Credit_History_Age"] = frame["Credit_History_Age"].astype(str).apply(clean_credit_age)
    frame["Credit_History_Age"] = pd.to_numeric(frame["Credit_History_Age"], errors="coerce")

    frame = clean_type_of_loan(frame)

    frame = clean_outliers(frame)

    # Placeholder tokens (and the "nan" produced by astype(str)) become empty
    # strings; regex=False because these are literal substrings.
    frame["Occupation"] = frame["Occupation"].astype(str).str.replace("_______", "", regex=False)
    frame["Occupation"] = frame["Occupation"].astype(str).str.replace("nan", "", regex=False)
    frame["Payment_Behaviour"] = frame["Payment_Behaviour"].astype(str).str.replace("!@9#%8", "", regex=False)
    frame["Payment_Behaviour"] = frame["Payment_Behaviour"].astype(str).str.replace("nan", "", regex=False)
    frame = frame.drop_duplicates()

    frame = frame.drop(columns=['ID', 'Customer_ID', 'Name', 'SSN', "Type_of_Loan"])

    # A negative number of bank accounts is treated as missing and imputed below.
    frame.loc[frame["Num_Bank_Accounts"] < 0, "Num_Bank_Accounts"] = np.nan

    for f in frame.columns:
        if is_test and f == "Credit_Score":
            # Skip only the target; the remaining columns still need imputing.
            continue
        if f in numerical_columns:
            fill_value = frame[f].median()
        else:
            modes = frame[f].mode()
            if modes.empty:
                # Column has no non-missing values, so there is nothing to fill with.
                continue
            fill_value = modes[0]
        # Assign the result back instead of using a chained inplace fillna,
        # which does not reliably update the frame.
        frame[f] = frame[f].fillna(fill_value)

    return frame

In [None]:
train_df = clean_data(train_df)

In [None]:
train_df.isnull().sum()

In [None]:
train_df[train_df.duplicated()]

In [None]:
train_df.info()

In [None]:
train_df.nunique()

In [None]:
train_df.sample(20)

## Exploratory Data Analysis

In [None]:
train_df.describe().T.style.bar(subset=['mean'])

In [None]:
_ = train_df.hist(bins=20, figsize=(20, 15))

In [None]:
# Histogram of customer ages in the cleaned frame, saved as a figure file.
fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(train_df["Age"], bins=20, ax=ax)
ax.set_title('Customer Age')
fig.savefig("figures/ch6_credit_age.png", dpi=600)

In [None]:
# Histogram of monthly in-hand salary in the cleaned frame, saved as a figure file.
fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(train_df["Monthly_Inhand_Salary"], bins=30, ax=ax)
ax.set_title('Monthly Inhand Salary')
fig.savefig("figures/ch6_credit_salary.png", dpi=600)

In [None]:
# 3x2 grid of count histograms, one panel per categorical column
# (filled row by row: Occupation, Credit_Mix, Payment_of_Min_Amount,
# Payment_Behaviour, Credit_Score, Month).
fig, axes = plt.subplots(3, 2, figsize=(20, 15))
sns.histplot(train_df['Occupation'], ax=axes[0, 0])
sns.histplot(train_df['Credit_Mix'], ax=axes[0, 1])
sns.histplot(train_df['Payment_of_Min_Amount'], ax=axes[1, 0])
sns.histplot(train_df['Payment_Behaviour'], ax=axes[1, 1])
sns.histplot(train_df['Credit_Score'], ax=axes[2, 0])
sns.histplot(train_df['Month'], ax=axes[2, 1])

In [None]:
# Distribution of the target (Credit_Score) before any resampling, saved as a figure file.
fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(train_df["Credit_Score"], bins=30, ax=ax)
ax.set_title('Credit Score')
fig.savefig("figures/ch6_credit_score.png", dpi=600)

In [None]:
plt.figure(figsize=(12,10))
# Correlations are only defined for numeric data. train_df still holds string
# columns (e.g. Occupation, Month), which make DataFrame.corr raise on recent
# pandas versions, so restrict the frame to numeric and boolean columns first.
sns.heatmap(
    train_df.select_dtypes(include=["number", "bool"]).corr(method="spearman"),
    cmap='coolwarm',
)
plt.title('Correlation Heatmap')
plt.savefig("figures/ch6_credit_correlations.png", dpi=600)

In [None]:
train_df.groupby("Credit_Score")["Annual_Income"].mean().plot.bar()

In [None]:
train_df.groupby("Credit_Score")["Age"].mean().plot.bar()

In [None]:
sns.scatterplot(x='Age',y='Annual_Income',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Age',y='Monthly_Inhand_Salary',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Num_of_Delayed_Payment',y='Credit_History_Age',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Monthly_Inhand_Salary',y='Monthly_Balance',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Delay_from_due_date',y='Outstanding_Debt',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Monthly_Inhand_Salary',y='Outstanding_Debt',hue='Credit_Score',data=train_df)

In [None]:
sns.scatterplot(x='Annual_Income',y='Outstanding_Debt',hue='Credit_Score',data=train_df)

## Data Preparation

In [None]:
def prepare_data(frame, test=False, resample=False, random_state=None):
    """Turn a cleaned frame into a one-hot encoded feature matrix and target.

    Parameters
    ----------
    frame : pandas.DataFrame
        Output of ``clean_data``. It is not modified.
    test : bool, default False
        When True the frame is treated as unlabeled: ``Credit_Score`` is
        neither converted nor separated, and ``None`` is returned as the target.
    resample : bool, default False
        When True (and ``test`` is False) the classes are balanced with SMOTE.
    random_state : int or None, default None
        Seed passed to SMOTE so the resampling can be reproduced.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the dummy-encoded features and ``y`` the
        ``Credit_Score`` labels of ``frame`` (``None`` when ``test`` is True).
    """
    to_category = [
        feature for feature in categorical_columns
        if not (test and feature == 'Credit_Score')
    ]
    # astype returns a new frame, so the caller's frame keeps its dtypes.
    frame = frame.astype({feature: "category" for feature in to_category})

    if test:
        X_prep = frame
        y = None
    else:
        X_prep = frame.drop(columns=["Credit_Score"])
        # Take the target from the frame that was passed in, not from a global.
        y = frame["Credit_Score"]

    X_dummies = pd.get_dummies(X_prep)
    if not test and resample:
        smote = SMOTE(sampling_strategy='auto', random_state=random_state)
        return smote.fit_resample(X_dummies, y)
    return X_dummies, y

In [None]:
X, y = prepare_data(train_df, resample=True)

In [None]:
y.value_counts(normalize=True)

In [None]:
# Distribution of the target labels returned by prepare_data(..., resample=True).
fig, ax = plt.subplots(figsize=(12, 10))
sns.histplot(y, bins=30, ax=ax)
ax.set_title('Credit Score')
fig.savefig("figures/ch6_credit_score_normal.png", dpi=600)

## Modeling

In [None]:
def cross_validate_with_smote(val_model, X_cross, y_cross, verbose=True, n_splits=5, random_state=None):
    """K-fold cross-validation with SMOTE applied to each training fold only.

    For every fold the training part is oversampled with SMOTE, the model is
    fitted on the oversampled data and then scored on the untouched
    validation part.

    Parameters
    ----------
    val_model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; it is refitted on every fold.
    X_cross : pandas.DataFrame
        Feature matrix (not resampled).
    y_cross : pandas.Series or array-like
        Target labels aligned with ``X_cross``.
    verbose : bool, default True
        Print per-fold progress and scores.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed passed to SMOTE so the resampling can be reproduced.

    Returns
    -------
    tuple of numpy.ndarray
        ``(accuracies, f1_scores)`` with one entry per fold; F1 is macro-averaged.
    """
    X_cross = X_cross.values
    # Plain ndarray so fancy indexing and the element-wise comparison below
    # behave the same for Series, Categorical and array inputs.
    y_cross = np.asarray(y_cross)
    kf = KFold(n_splits=n_splits)
    accuracies = []
    f1_scores = []
    for fold, (train_index, test_index) in enumerate(kf.split(X_cross), 1):
        if verbose:
            print(f'Fold {fold}:')
        X_train = X_cross[train_index]
        y_train = y_cross[train_index]
        X_val = X_cross[test_index]
        y_val = y_cross[test_index]
        # Oversample the training fold only; the validation fold stays as-is.
        smote = SMOTE(random_state=random_state)
        X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
        if verbose:
            print("Fitting model")
        val_model.fit(X_train_resampled, y_train_resampled)
        y_pred = val_model.predict(X_val)

        # Accuracy from the predictions already computed, rather than a second
        # prediction pass through val_model.score().
        accuracy = np.mean(np.asarray(y_pred) == y_val)
        f1 = f1_score(y_val, y_pred, average="macro")
        accuracies.append(accuracy)
        f1_scores.append(f1)

        if verbose:
            print(f'Accuracy: {accuracy}')
            print(f'F1 score: {f1}')
    return np.array(accuracies), np.array(f1_scores)


In [None]:
# Cross-validate on the original (non-resampled) data. cross_validate_with_smote()
# already applies SMOTE inside every training fold; feeding it data that was
# resampled beforehand would put synthetic samples into the validation folds.
X, y = prepare_data(train_df, resample=False)

# Baseline 1: a single decision tree with default hyperparameters.
tree = DecisionTreeClassifier()
scores, f1_scores = cross_validate_with_smote(tree, X, y)
print(f"Accuracy: {scores.mean()}")
print(f"F1: {f1_scores.mean()}")

In [None]:
# Baseline 2: a random forest with default hyperparameters, evaluated with the
# same cross-validation routine; the mean accuracy and macro F1 over the folds are printed.
forest = RandomForestClassifier()
scores, f1_scores = cross_validate_with_smote(forest, X, y)
print(f"Accuracy: {scores.mean()}")
print(f"F1: {f1_scores.mean()}")

In [None]:
# Baseline 3: LightGBM with default hyperparameters (verbose=-1 silences its
# logging), evaluated with the same cross-validation routine.
lgbm = lgb.LGBMClassifier(force_row_wise=True, verbose = -1)
scores, f1_scores = cross_validate_with_smote(lgbm, X, y)
print(f"Accuracy: {scores.mean()}")
print(f"F1: {f1_scores.mean()}")

## Parameter Optimization

In [None]:
X, y = prepare_data(train_df, resample=False) # we need the original data (without resampling) for cross validation

In [None]:
def objective(trial):
    """Optuna objective: mean macro-F1 of a LightGBM classifier under cross-validation.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``LGBMClassifier`` with it and evaluates it with
    ``cross_validate_with_smote`` on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean macro-F1 over the folds (the study maximizes this value).
    """
    # Each suggestion is bound to a plain scalar; a trailing comma after the
    # call would silently wrap the value in a 1-tuple.
    boosting_type = trial.suggest_categorical("boosting_type", ["dart", "gbdt"])
    lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True)
    lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
    num_leaves = trial.suggest_int('num_leaves', 2, 256)
    feature_fraction = trial.suggest_float('feature_fraction', 0.4, 1.0)
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.4, 1.0)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log=True)
    # step is passed by keyword so the call does not rely on positional order.
    max_bin = trial.suggest_int("max_bin", 128, 512, step=32)
    n_estimators = trial.suggest_int("n_estimators", 40, 400, step=20)

    model = lgb.LGBMClassifier(
        force_row_wise=True,
        boosting_type=boosting_type,
        n_estimators=n_estimators,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        num_leaves=num_leaves,
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        bagging_freq=bagging_freq,
        min_child_samples=min_child_samples,
        learning_rate=learning_rate,
        max_bin=max_bin,
        verbose=-1)
    # X and y are the notebook-level (non-resampled) features and labels.
    scores, f1_scores = cross_validate_with_smote(model, X, y, verbose=False)
    return f1_scores.mean()

In [None]:
# Hyperparameter search: 50 trials of objective(), maximizing its return value.
# NOTE(review): the sampler is created without a seed, so repeated runs will
# presumably explore different configurations - confirm whether that is acceptable.
sampler = optuna.samplers.TPESampler()
# NOTE(review): objective() never calls trial.report() or trial.should_prune(),
# so this pruner presumably has no opportunity to stop a trial early - TODO confirm.
pruner = optuna.pruners.HyperbandPruner(
    min_resource=20, max_resource=400, reduction_factor=3)

study = optuna.create_study(
    direction='maximize', sampler=sampler,
    pruner=pruner
)
# n_jobs=-1 requests parallel trial execution.
study.optimize(objective, n_trials=50, n_jobs=-1)

In [None]:
print(study.best_trial)

## Train model using best results

In [None]:
# Final model with hard-coded hyperparameters.
# NOTE(review): these values are presumably copied from an earlier study run;
# they are not read from study.best_trial - confirm they are still the best ones.
model = lgb.LGBMClassifier(
        force_row_wise=True,
        boosting_type='gbdt',
        n_estimators=200,
        lambda_l1=0.0003,
        lambda_l2=1.4418,
        num_leaves=58,
        feature_fraction=0.6628,
        bagging_fraction=0.6651,
        bagging_freq=3,
        min_child_samples=58,
        learning_rate=0.013,
        max_bin=128,
        verbose=-1
)
# Estimate performance with the per-fold SMOTE cross-validation and print the fold means.
scores, f1_scores = cross_validate_with_smote(model, X, y, verbose=False)
print(f"Accuracy: {scores.mean()}")
print(f"F1: {f1_scores.mean()}")
# Refit on the complete SMOTE-resampled training set. This rebinds the
# notebook-level X and y to the resampled data.
X, y = prepare_data(train_df, resample=True)
model = model.fit(X, y)

In [None]:
lgb.plot_importance(model, figsize=(15, 12))

### Saving and loading the model

In [None]:
X, y = prepare_data(train_df, resample=False)

In [None]:
joblib.dump(model, "lgb_credit_score_classification.pkl")

In [None]:
def make_predictions(data, model_path="lgb_credit_score_classification.pkl"):
    """Load a persisted model and predict labels for ``data``.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Feature matrix passed unchanged to the model's ``predict``.
    model_path : str, default "lgb_credit_score_classification.pkl"
        Location of the joblib file to load.

    Returns
    -------
    The value returned by the loaded model's ``predict(data)``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    loaded_model = joblib.load(model_path)
    return loaded_model.predict(data)

In [None]:
predictions = make_predictions(X)