In [19]:
# 1. Install required libraries
!pip install -q -U torch torchvision torchaudio transformers accelerate bitsandbytes

print("✅ Installation complete.")
print("⚠️ NOW: Go to the top menu, click 'Runtime' > 'Restart Session', then run Cell 2 below.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

data = {
    'Decision Tree':  {'ROC-AUC': 0.65, 'Recall': 0.11, 'F1-score': 0.15},
    'Neural Network': {'ROC-AUC': 0.79, 'Recall': 0.39, 'F1-score': 0.42},
    'XGBoost':        {'ROC-AUC': 0.91, 'Recall': 0.82, 'F1-score': 0.53},
    'Random Forest':  {'ROC-AUC': 0.77, 'Recall': 0.56, 'F1-score': 0.25},
    'kNN':            {'ROC-AUC': 0.82, 'Recall': 0.56, 'F1-score': 0.26}
}

df_results = pd.DataFrame.from_dict(data, orient='index').reset_index()
df_results.rename(columns={'index': 'Model Name'}, inplace=True)

# Dictionary of the Top 3 features found for EACH model
feature_importance = {
    "Decision Tree": ["debt_to_income", "credit_score", "payment_punctuality"],
    "Neural Network": ["credit_score", "savings_ratio", "debt-to-income"],
    "XGBoost": ["credit_score", "business", "feedback"],
    "Random Forest": ["credit_score", "savings_ratio", "debt_to_income"],
    "kNN": ["credit_score", "savings_ratio", "debt_to_income"]
}

styled_table = df_results.style\
    .format({'ROC-AUC': '{:.2f}', 'Recall': '{:.2f}', 'F1-score': '{:.2f}'})\
    .set_caption("Sterling Financial: Model Performance Summary")

print("Performance Table:\n")
display(styled_table)


Performance Table:



Unnamed: 0,Model Name,ROC-AUC,Recall,F1-score
0,Decision Tree,0.65,0.11,0.15
1,Neural Network,0.79,0.39,0.42
2,XGBoost,0.91,0.82,0.53
3,Random Forest,0.77,0.56,0.25
4,kNN,0.82,0.56,0.26


In [2]:
import torch
import os
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig
from google.colab import userdata
from huggingface_hub import login

hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

# 1. LOAD MISTRAL 7B (4-bit quantization for T4 GPU)
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

torch.cuda.empty_cache()

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quant_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Model Ready!")

# 2. DEFINE THE PROMPT FUNCTION
def generate_interpretation_prompt(df_results, feature_importance):
    prompt = f"""
[INST]
SYSTEM ROLE: Chief Risk Officer at Sterling Financial.

CONTEXT:
We are evaluating 5 credit risk models. Our primary concern is the "Cost of Missed Defaults."
A model with high accuracy but low Recall is a failure for our business.

INPUT DATA:
{df_results.to_string(index=False)}

FEATURE IMPORTANCE:
{feature_importance}

TASK:
1. SUMMARY: Compare the models. Explain why XGBoost stands out in the 0.91 ROC-AUC vs. the 0.77-0.82 range of the others.
2. INSIGHTS: Interpret the F1-score and Recall. Why is XGBoost's 82% Recall the "gold standard" for our bank compared to the Decision Tree's 11%?
3. FEATURE IMPORTANCE: Explain how 'business' and 'feedback' features help XGBoost find risk that models focusing only on 'credit_score' miss.
4. FINAL VERDICT: Formally recommend the best model for deployment.
[/INST]
"""
    return prompt

# 3. RUN INTERPRETATION
final_prompt = generate_interpretation_prompt(df_results, feature_importance)
inputs = tokenizer(final_prompt, return_tensors="pt").to("cuda")

print("Analyzing Results...")

gen_config = GenerationConfig(
    max_new_tokens=2000,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)
outputs = model.generate(
    **inputs,
    generation_config=gen_config
)
interpretation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\n" + "="*60)
print("FINAL STRATEGIC REPORT (GENERATED BY MISTRAL 7B)")
print("="*60)
print(interpretation.split("[/INST]")[-1].strip())

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model Ready!
Analyzing Results...

FINAL STRATEGIC REPORT (GENERATED BY MISTRAL 7B)
1. Summary:
The XGBoost model stands out from the others due to its high ROC-AUC score of 0.91, which is significantly higher than the range of the other models (0.77-0.82). This indicates that XGBoost is highly accurate in predicting credit risk and has a lower false negative rate, making it less likely to miss potential defaults.
2. Insights:
The F1-score and Recall values provide insights into the performance of each model. XGBoost's F1-score of 0.53 is higher than the range of the other models (0.25-0.42), indicating that it has a good balance between precision and recall. However, its Recall value of 82% is particularly noteworthy, as it is significantly higher than the Decision Tree's 11%. This means that XGBoost is more effective at identifying potential defaults and reducing the cost of missed defaults.
3. Feature Importance:
The 'business' and 'feedback' features in XGBoost help it identify ris