In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
  ~ ~ ~ ~ ~ ~ ~ ~ ~  Credit scoring model  ~ ~ ~ ~ ~ ~ ~ ~ ~
      generalized linear model without machine learning
'''
import pandas as pd
import statsmodels.api as sm
# Load the workshop dataset from an Excel workbook into a DataFrame.
# The path is intentionally blank: supply the .xlsx file name before running this cell.
df = pd.read_excel("") # <-------------------- fill here .xlsx
# Preview the first rows to check that the file was read as expected.
print(df.head())
# Column dictionary:
# age: Age in years
# education: Level of education: (1) did not complete high school, (2) high school degree, (3) some college, (4) college degree, (5) postgraduate degree
# employears: Years with current employer
# address: Years at current address
# salary: Salary in thousands
# creddebt: Credit card debt in thousands
# othdebt: Other debt in thousands
# default: Credit default indicator (tabulated below as 0 = no default, 1 = default)

In [None]:
# Tally how many rows fall into each value of the 'default' column
default_counts_raw = df['default'].value_counts()
total = default_counts_raw.sum()

# Look up both classes in one pass; a class absent from the data counts as 0
no_default_count, default_count = (
    default_counts_raw.get(label, 0) for label in (0.0, 1.0)
)

# Express each class as a percentage of all counted rows
no_default_pct, default_pct = (
    (count / total) * 100 for count in (no_default_count, default_count)
)

# Assemble the summary: one row per class plus a closing total row
default_table = pd.DataFrame(
    {'Status': ['No Default', 'Default', 'Total']}
).assign(
    Count=[no_default_count, default_count, total],
    Percentage=[no_default_pct, default_pct, 100.0],
)
print(default_table)

In [None]:
# Regressors (X) and binary target (y) for the credit-default model.
X = df[["age", "education", "employears", "salary", "creddebt", "othdebt"]]
y = df["default"]

# Add a constant column so the model estimates an intercept.
X = sm.add_constant(X)

# Fit a logistic regression model.
# The model is bound to a single name that the rest of this cell uses, so the
# cell runs on a fresh kernel (previously the summary line referenced a name
# that was only defined in a later cell).
logit_model = sm.Logit(y, X).fit()

# Predicted probabilities of default on the same data, stored as "score".
df["score"] = logit_model.predict(X)

# Last expression of the cell: rendered as the regression summary.
logit_model.summary()

In [None]:
import statsmodels.formula.api as smf

# Store the target column as integers
df["default"] = df["default"].astype(int)

# Formula-based logistic regression: education is wrapped in C(), so it enters
# as a categorical regressor; all remaining terms enter as plain columns.
logit_formula = "default ~ age + C(education) + employears + salary + creddebt + othdebt"
logit_model = smf.logit(logit_formula, data=df).fit()

# Print the text version of the estimation summary
print(logit_model.summary())


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Predicted probability of default for every applicant, from the fitted logit model.
df["probab_score"] = logit_model.predict(df)

# Map the log-odds of default onto a points scale: score = 319 - 72.13 * log-odds,
# so a higher predicted probability of default gives a lower credit score.
# NOTE(review): 72.13 is approximately 50 / ln(2), which would mean 50 points per
# doubling of the odds -- confirm the intended scaling.
df["credit_score"] = (319 - 72.13 * np.log(df["probab_score"] / (1 - df["probab_score"]))).round(0).astype(int)

# Build a scorecard based on log-odds: one row per estimated coefficient.
scorecard = pd.DataFrame({
    "Variable": logit_model.params.index,
    "Coefficient": logit_model.params.values
})
# NOTE(review): the factor here (20) differs from the 72.13 used for credit_score,
# so these points do not add up to credit_score -- confirm which one is intended.
scorecard["Points"] = -scorecard["Coefficient"] * 20  # Scaling factor


def _plot_histogram(values, title, xlabel):
    """Draw a 30-bin histogram of `values` with the given title and x-axis label."""
    plt.figure(figsize=(10, 6))
    plt.hist(values, bins=30, edgecolor="black", alpha=0.7)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Number of Applicants")
    plt.grid(True)
    plt.show()


# Plot distribution of predicted default probabilities
_plot_histogram(
    df["probab_score"],
    "Distribution of Predicted Default Probabilities",
    "Predicted Probability of Default",
)

# Plot distribution of credit scores
_plot_histogram(df["credit_score"], "Distribution of Credit Scores", "Credit Scores")

# Generate a scatterplot: predicted probability vs. credit score
plt.figure(figsize=(10, 6))
plt.scatter(df["credit_score"], df["probab_score"], alpha=0.5)
plt.title("Credit Score vs. Predicted Default Probability")
plt.xlabel("Credit Score")
plt.ylabel("Predicted Probability of Default")
plt.grid(True)
plt.show()

# Create a table of score bins and corresponding average default probability.
# The outer bins are open-ended: the score formula is unbounded, and with fixed
# edges (0 and 850) any score outside that range got no bin and silently
# disappeared from the table. Interior cut points are unchanged.
df["score_bin"] = pd.cut(df["credit_score"],
                         bins=[-np.inf, 550, 600, 650, 700, np.inf],
                         labels=["Very Poor", "Poor", "Fair", "Good", "Excellent"])

# Calculate applicant count and mean default probability per score bin
score_prob_table = df.groupby("score_bin", observed=True).agg(
    Number_of_Applicants=("credit_score", "count"),
    Average_Default_Probability=("probab_score", "mean")
).reset_index()

print(score_prob_table)

In [None]:
from sklearn.metrics import roc_curve, auc

# True labels and predicted probabilities.
# NOTE(review): "score" is the column written by the first regression cell, not
# the "probab_score" column produced from the formula-based model -- confirm
# this is the model that should be evaluated here.
y_true = df["default"]
y_scores = df["score"]

# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the diagonal reference line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver Operating Characteristic (ROC) Curve")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Gini = 2 * AUC - 1, computed from the unrounded AUC so that rounding the AUC
# first cannot shift the reported Gini. Values are rounded only when printed,
# and roc_auc keeps its full precision for any later use.
gini = 2 * roc_auc - 1
print(f"AUC: {roc_auc:.2f}")
print(f"Gini: {gini:.2f}")
