In [None]:
from utils import inference_validator, reasoning, feature_engineering
import pandas as pd
# Remove every pandas display cap: None lifts the row, column and column-width
# limits and makes pandas auto-detect the output width.
for _display_option in ('display.max_rows', 'display.max_columns',
                        'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
import pickle, json
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# NOTE: unpickling runs arbitrary code from the file; only load artifacts
# written by a trusted run.
nimputer = _load_pickle('saved/nimputer.pkl')
cimputer = _load_pickle('saved/cimputer.pkl')
xgb = _load_pickle('saved/xgb_model.pkl')

# JSON side-car files saved next to the pickled objects.
before_columns = _load_json('saved/before_feature.json')
df_train_encoded = _load_json('saved/df_train_encoded.json')

encoder = _load_pickle('saved/encoder.pkl')

In [None]:
import pandas as pd

def manual_one_hot_encode(input_df, encoder_columns):
    """
    One-hot encode the known categorical columns using hard-coded category lists.

    For every known categorical column present in ``input_df``, an integer 0/1
    indicator column named ``<column>_<category>`` is appended for each listed
    category, and the original categorical column is then dropped. A value that
    is not in a column's list (for example ``EmploymentStatus == 'Employed'``)
    gets no indicator of its own, so all indicators of that column are 0 for
    that row. Known columns that are absent from ``input_df`` are skipped.

    Args:
        input_df (pd.DataFrame): Input DataFrame with categorical columns. It is
            copied, never modified.
        encoder_columns (list): Currently ignored; the categories are hard-coded
            below. Kept so existing callers keep working.

    Returns:
        pd.DataFrame: Copy of ``input_df`` with the indicator columns appended
        (in the order of the table below) and the original categorical columns
        removed.
    """
    # (source column, categories that get an indicator). The order fixes the
    # order of the generated columns, so keep it stable.
    category_levels = (
        ('AgeBin', ('30-40', '40-50', '50-60', '60+')),
        ('CreditScoreBin', ('Good', 'Poor')),
        ('EmploymentStatus', ('Self-Employed', 'Unemployed')),
        ('MaritalStatus', ('Married', 'Single', 'Widowed')),
        ('HomeOwnershipStatus', ('Other', 'Own', 'Rent')),
        ('EducationLevel', ('Bachelor', 'Doctorate', 'High School', 'Master')),
        ('LoanPurpose', ('Debt Consolidation', 'Education', 'Home', 'Other')),
    )

    # Work on a copy so the caller's frame is left untouched.
    df = input_df.copy()

    # The flag column is encoded by plain truthiness and comes first in the output.
    if 'HighDebtToIncome' in df.columns:
        df['HighDebtToIncome_True'] = df['HighDebtToIncome'].apply(lambda flag: 1 if flag else 0)

    for column, levels in category_levels:
        if column not in df.columns:
            continue
        for level in levels:
            # Vectorized comparison: missing values compare unequal and become 0,
            # and the result is an integer column even for an empty frame.
            df[f'{column}_{level}'] = (df[column] == level).astype('int64')

    # Drop the original categorical columns that were actually present.
    source_columns = ['HighDebtToIncome'] + [column for column, _ in category_levels]
    return df.drop(columns=[column for column in source_columns if column in df.columns])

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Raw categorical columns, in the order they are handed to the encoder
c_columns = [
    'HighDebtToIncome',
    'AgeBin',
    'CreditScoreBin',
    'EmploymentStatus',
    'MaritalStatus',
    'HomeOwnershipStatus',
    'EducationLevel',
    'LoanPurpose',
]

# Ask the loaded encoder which output column names it produces for them
feature_names = encoder.get_feature_names_out(c_columns)
print(feature_names)

# Apply an already-fitted encoder to new data (only `transform` is called, never `fit`)
def transform_with_encoder(X, categorical_cols, other_cols, encoder):
    """
    Encode the categorical columns of ``X`` and re-attach the remaining columns.

    Args:
        X (pd.DataFrame): Frame containing ``categorical_cols`` and ``other_cols``.
        categorical_cols (list): Columns passed to ``encoder.transform``.
        other_cols (list | pd.Index | str | None): Columns copied through
            unchanged and placed before the encoded ones. Empty or ``None``
            means only the encoded columns are returned.
        encoder: Object exposing ``transform`` and ``get_feature_names_out``.

    Returns:
        pd.DataFrame: ``X[other_cols]`` followed by the encoded columns, indexed
        like ``X``.
    """
    encoded = encoder.transform(X[categorical_cols])

    # Encoders may hand back a sparse matrix, which the DataFrame constructor
    # cannot combine with explicit column names; densify it first.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index,
    )

    # Normalize to a list so an Index/array does not hit an ambiguous truth test.
    if other_cols is None:
        passthrough = []
    elif isinstance(other_cols, str):
        passthrough = [other_cols]
    else:
        passthrough = list(other_cols)

    if not passthrough:
        return encoded_df
    return pd.concat([X[passthrough], encoded_df], axis=1)

In [None]:
import os

import groq

# Credentials must not live in the notebook: the key is read from the
# GROQ_API_KEY environment variable. Any key that was previously committed in
# this cell should be treated as leaked and revoked.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")

# Initialize Groq client
client = groq.Client(api_key=_groq_api_key)

import numpy as np

def get_risk_score_reasoning(user_df, prediction, model="llama-3.2-3b-preview",
                             max_tokens=250, temperature=0.7, llm_client=None):
    """
    Generates a reasoning for the risk score using an LLM.

    Args:
        user_df (pd.DataFrame): User data after feature engineering. Its full
            ``to_string()`` rendering is embedded in the prompt.
        prediction (np.ndarray): The predicted risk score. The first element of
            the flattened value is used, so a scalar, 1-D or 2-D result all work.
        model (str): Chat model name sent to the API.
        max_tokens (int): Upper bound on the length of the generated answer.
        temperature (float): Sampling temperature sent to the API.
        llm_client: Object exposing ``chat.completions.create``. Defaults to the
            module-level ``client``.

    Returns:
        str: The reasoning for the risk score.
    """
    # Flatten first so the ":.2f" format below always receives a scalar.
    risk_score = float(np.asarray(prediction).ravel()[0])
    user_data_str = user_df.to_string()

    # Prepare the prompt for the LLM
    prompt = (
        f"You are a financial risk analyst. A user has applied for a loan, and the machine learning model has assigned them a risk score of {risk_score:.2f}.\n\n"
        "Below are the user's details after feature engineering:\n"
        f"{user_data_str}\n\n"
        "### Instructions:\n"
        "- Provide a detailed explanation of why this person received the given risk score.\n"
        "- Highlight key features that contributed to the risk score (e.g., high debt-to-income ratio, low credit score, late payments, etc.).\n"
        "- Explain how these features impact the user's financial risk.\n"
        "- Mention any factors that could improve or worsen their risk score.\n"
        "- Provide actionable recommendations for the user to reduce their risk score if applicable.\n\n"
        "- Remember risk score between 27 and 84. \n\n"
        "### Output Format:\n"
        "- Output ONLY a plain text explanation without any additional formatting or labels.\n"
    )

    # Fall back to the notebook-wide client unless the caller injected one.
    active_client = client if llm_client is None else llm_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a financial risk analyst."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return response.choices[0].message.content



In [None]:
import numpy as np
import pandas as pd

# Example application scored end to end: validate -> align columns -> impute
# -> feature engineering -> one-hot encoding -> model prediction.
user_input = {
  "ApplicationDate": "2024-03-01",
  "Age": 48,
  "AnnualIncome": 8500000,
  "CreditScore": 800,
  "EmploymentStatus": "Employed",
  "EducationLevel": "Doctorate",
  "Experience": 22,
  "LoanAmount": 12000,
  "LoanDuration": 24,
  "MaritalStatus": "Married",
  "NumberOfDependents": 1,
  "HomeOwnershipStatus": "Own",
  "MonthlyDebtPayments": 700,
  "CreditCardUtilizationRate": 0.1,
  "NumberOfOpenCreditLines": 2,
  "NumberOfCreditInquiries": 1,
  "DebtToIncomeRatio": 0.18,
  "BankruptcyHistory": 0,
  "LoanPurpose": "Home",
  "PreviousLoanDefaults": 0,
  "PaymentHistory": 24,
  "LengthOfCreditHistory": 18,
  "SavingsAccountBalance": 20000,
  "CheckingAccountBalance": 15000,
  "TotalAssets": 250000,
  "TotalLiabilities": 20000,
  "MonthlyIncome": 110000,
  "UtilityBillsPaymentHistory": 0.95,
  "JobTenure": 12,
  "NetWorth": 230000,
  "BaseInterestRate": 3.5,
  "InterestRate": 4.8,
  "MonthlyLoanPayment": 450,
  "TotalDebtToIncomeRatio": 0.22
}
user_input = inference_validator(user_input)

# One application -> one-row DataFrame
user_df = pd.DataFrame([user_input])

# Ensure the user input has the same columns as the training data;
# columns the input does not provide are created and filled with NaN
user_df = user_df.reindex(columns=before_columns, fill_value=np.nan)

# Columns each imputer reports in feature_names_in_, limited to those present
c_features = [f for f in cimputer.feature_names_in_ if f in user_df.columns]
n_features = [f for f in nimputer.feature_names_in_ if f in user_df.columns]

# Apply only if features exist
if c_features:
    user_df[c_features] = cimputer.transform(user_df[c_features])
if n_features:
    user_df[n_features] = nimputer.transform(user_df[n_features])

user_df = feature_engineering(user_df)

# Get non-categorical columns
other_columns = [col for col in user_df.columns if col not in c_columns]

# Encode with the hand-written encoder; transform_with_encoder(user_df, c_columns,
# other_columns, encoder) is the encoder-based alternative.
user_df = manual_one_hot_encode(user_df, c_columns)

# The target is not a model input; tolerate frames that never carried it.
user_df = user_df.drop(columns=["RiskScore"], errors="ignore")

# NOTE(review): nothing here forces the column set/order to match what the model
# was trained on — confirm, or reindex to the model's feature names.
prediction = xgb.predict(user_df)

print("Prediction:", prediction)

# Optional LLM explanation of the score:
# risk_reasoning = get_risk_score_reasoning(user_df, prediction)
# print("Reasoning for Risk Category:", risk_reasoning)