In [None]:
from utils import inference_validator, reasoning, feature_engineering
import pandas as pd
# Lift every pandas display limit so frames are shown in full: passing None
# removes the cap on rows, columns and cell text, and lets pandas auto-detect
# the output width.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [None]:
import pickle, json
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load artifacts that were produced by this project's own training run;
# never point these paths at files from an untrusted source.
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as f:
        return pickle.load(f)


def _load_json(path):
    """Return the parsed JSON document stored in the file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Fitted preprocessing objects and the trained model (load order kept as in
# the original cell so a missing file fails at the same point).
nimputer = _load_pickle('saved/nimputer.pkl')
cimputer = _load_pickle('saved/cimputer.pkl')
xgb = _load_pickle('saved/xgb_model.pkl')

# Column list used to align inference input before feature engineering.
before_columns = _load_json('saved/before_feature.json')

# Only referenced from commented-out code in this notebook; presumably the
# encoded training column list — TODO confirm and drop if unused.
df_train_encoded = _load_json('saved/df_train_encoded.json')

encoder = _load_pickle('saved/encoder.pkl')

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Categorical columns that are passed through the fitted encoder.
c_columns = [
    'HighDebtToIncome',
    'AgeBin',
    'CreditScoreBin',
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'EducationLevel',
    'LoanPurpose',
]

# Output column names the encoder produces for those categorical inputs.
feature_names = encoder.get_feature_names_out(c_columns)

# Applies an already-fitted encoder; nothing is fitted here.
def transform_with_encoder(X, categorical_cols, other_cols, encoder):
    """Encode the categorical columns of ``X`` and re-attach the other columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing both the categorical and the pass-through columns.
    categorical_cols : list of str
        Columns handed to ``encoder.transform``.
    other_cols : sequence of str or None
        Columns copied through unchanged and placed before the encoded
        columns. ``None`` or an empty sequence returns only the encoded frame.
    encoder : object
        Fitted transformer exposing ``transform`` and
        ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        ``X[other_cols]`` followed by the encoded columns, indexed like ``X``.
    """
    encoded = encoder.transform(X[categorical_cols])

    # Some encoders return a SciPy sparse matrix, which pd.DataFrame cannot
    # lay out as labelled columns; densify it first (duck-typed so no extra
    # import is needed).
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index,
    )

    # Explicit None/length check: the truth value of a pandas Index or a
    # NumPy array is ambiguous and raises, unlike a plain list.
    if other_cols is None or len(other_cols) == 0:
        return encoded_df

    return pd.concat([X[other_cols], encoded_df], axis=1)

In [None]:
import os

import groq

# SECURITY: never commit API keys to the notebook. The key is read from the
# GROQ_API_KEY environment variable instead. The key that used to be
# hardcoded in this cell has been exposed and should be revoked and rotated.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError(
        "GROQ_API_KEY is not set; export it in the environment before "
        "running this notebook."
    )

# Initialize Groq client
client = groq.Client(api_key=_groq_api_key)

def get_risk_category_reasoning(
    user_df,
    prediction,
    prediction_proba,
    model="llama-3.2-3b-preview",
    max_tokens=500,
    temperature=0.7,
    llm_client=None,
):
    """Ask the LLM to explain a predicted risk category in plain text.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Applicant features; rendered with ``to_string()`` into the prompt.
    prediction : sequence
        Model predictions; only ``prediction[0]`` is used.
    prediction_proba : sequence
        Class probabilities; only ``prediction_proba[0]`` is used.
    model : str, optional
        Chat model identifier sent to the API.
        NOTE(review): preview model ids may be retired by the provider —
        confirm this default is still served.
    max_tokens : int, optional
        Upper bound on the length of the generated explanation.
    temperature : float, optional
        Sampling temperature passed to the API.
    llm_client : object, optional
        Client exposing ``chat.completions.create``. Defaults to the
        module-level ``client``.

    Returns
    -------
    str
        The stripped explanation text, or ``""`` when the response carries
        no message content.
    """
    if llm_client is None:
        llm_client = client

    # Convert the user_df to a readable format for the LLM
    user_data_str = user_df.to_string()

    # Prepare the prompt for the LLM
    prompt = (
        f"You are a financial risk analyst. A user has applied for a loan, and the machine learning model has predicted their risk category as {prediction[0]} (0-4, where 0 is the lowest risk and 4 is the highest).\n\n"
        f"The prediction probabilities are: {prediction_proba[0]}.\n\n"
        f"Below are the user's details after feature engineering:\n"
        f"{user_data_str}\n\n"
        f"### Instructions:\n"
        f"- Provide a detailed explanation of why this person belongs to the predicted risk category.\n"
        f"- Highlight key features that contributed to the risk level (e.g., high debt-to-income ratio, low credit score, etc.).\n"
        f"- Explain how these features align with the risk category.\n"
        f"- Mention any mitigating factors that could lower the risk or exacerbate it.\n"
        f"- Provide recommendations for the user to improve their risk profile if applicable.\n\n"
        f"### Output Format:\n"
        f"- Output ONLY a plain text explanation without any additional formatting or labels.\n"
    )

    # Call the chat-completions endpoint
    response = llm_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a financial risk analyst. Provide a detailed explanation of the user's risk category based on their data."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    # The message content can be None; guard it so .strip() never raises.
    content = response.choices[0].message.content
    return (content or "").strip()


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Sample loan application used to exercise the inference pipeline.
# NOTE(review): AnnualIncome (9,500,000) is not 12 x MonthlyIncome (130,000);
# confirm whether this inconsistency in the sample is intentional.
user_input = dict(
    ApplicationDate="2024-03-02",
    Age=52,
    AnnualIncome=9500000,
    CreditScore=820,
    EmploymentStatus="Employed",
    EducationLevel="Master",
    Experience=28,
    LoanAmount=15000,
    LoanDuration=36,
    MaritalStatus="Married",
    NumberOfDependents=2,
    HomeOwnershipStatus="Own",
    MonthlyDebtPayments=900,
    CreditCardUtilizationRate=0.12,
    NumberOfOpenCreditLines=3,
    NumberOfCreditInquiries=0,
    DebtToIncomeRatio=0.15,
    BankruptcyHistory=0,
    LoanPurpose="Home",
    PreviousLoanDefaults=0,
    PaymentHistory=36,
    LengthOfCreditHistory=20,
    SavingsAccountBalance=30000,
    CheckingAccountBalance=20000,
    TotalAssets=300000,
    TotalLiabilities=25000,
    MonthlyIncome=130000,
    UtilityBillsPaymentHistory=0.98,
    JobTenure=15,
    NetWorth=275000,
    BaseInterestRate=3.5,
    InterestRate=4.5,
    MonthlyLoanPayment=500,
    TotalDebtToIncomeRatio=0.2,
)
# --- 1. Validate the raw application and frame it as a single row ----------
user_input = inference_validator(user_input)
# A one-element list of dicts always yields a one-row DataFrame, so no
# Series-to-frame conversion is needed.
user_df = pd.DataFrame([user_input])

# --- 2. Align to the pre-feature-engineering column list -------------------
# Columns missing from the input are created as NaN.
user_df = user_df.reindex(columns=before_columns, fill_value=np.nan)

# --- 3. Impute ---------------------------------------------------------------
# Each imputer is applied only to the features it was fitted on that are
# present in the frame.
c_features = [f for f in cimputer.feature_names_in_ if f in user_df.columns]
n_features = [f for f in nimputer.feature_names_in_ if f in user_df.columns]

if c_features:
    user_df[c_features] = cimputer.transform(user_df[c_features])
if n_features:
    user_df[n_features] = nimputer.transform(user_df[n_features])

# --- 4. Feature engineering and categorical encoding ------------------------
user_df = feature_engineering(user_df)

# Everything that is not categorical is passed through unchanged.
other_columns = [col for col in user_df.columns if col not in c_columns]
user_df = transform_with_encoder(user_df, c_columns, other_columns, encoder)

# Presumably the label columns, which must not be fed to the model.
# errors='ignore' keeps this step from raising KeyError when they are
# already absent from the frame.
user_df = user_df.drop(columns=['LoanApproved', "RiskScore"], errors='ignore')

# NOTE(review): column order is assumed to match what the model was trained
# on; if it ever drifts, reindex to the model's feature names before
# predicting — TODO confirm.

# --- 5. Predict and explain --------------------------------------------------
prediction = xgb.predict(user_df)
prediction_proba = xgb.predict_proba(user_df)

print("Prediction:", prediction)
# NOTE(review): this rebinds `reasoning`, shadowing the `reasoning` name
# imported from utils at the top of the notebook. The name is kept so any
# code reading the explanation string still works; consider renaming.
reasoning = get_risk_category_reasoning(user_df, prediction, prediction_proba)
print("Reasoning for Risk Category:", reasoning)
print("Prediction Probability:", prediction_proba)
# print(user_processed)
# 6 → 3 → 7 → 1 → 2 → 0 → 5 → 4