<a href="https://colab.research.google.com/github/Pegah1367/DeepFinanceAgent-Explainable-Financial-Risk-AI-Agent/blob/main/Final_project_with_lang_chain_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json

# ---------------------------------------------
# Purpose:
# This script ensures that a feature-to-concept
# mapping exists in the concepts.json file and
# adds a mapping for the "age" feature.
# ---------------------------------------------

# Path to the concepts configuration file
file_path = "/content/concepts.json"

# Step 1: Load the existing JSON file into memory
with open(file_path, "r", encoding="utf-8") as f:
    concepts = json.load(f)

# Step 2: Ensure the "feature_to_concept" dictionary exists
# If it does not exist, initialize it to avoid KeyError
if "feature_to_concept" not in concepts:
    concepts["feature_to_concept"] = {}

# Step 3: Add or update the feature-to-concept mapping
# This links the raw feature name used in the model ("age")
# to a human-readable business concept ("Age")
concepts["feature_to_concept"]["age"] = "Age"

# Step 4: Write the updated configuration back to the JSON file
# - indent=2 makes the file human-readable
# - ensure_ascii=False preserves non-ASCII characters
with open(file_path, "w", encoding="utf-8") as f:
    json.dump(concepts, f, indent=2, ensure_ascii=False)

# Step 5: Log confirmation for traceability
print("'age': 'Age' successfully added to feature_to_concept")


'age': 'Age' successfully added to feature_to_concept


In [None]:
"""
Feature-to-Concept Mapping

This dictionary defines a strict mapping between raw dataset feature names
and standardized domain concepts used throughout the system.

Purpose:
- Decouples raw input data from business and risk concepts
- Ensures consistent terminology across ML, rules, and LLM explanations
- Enables explainability and auditability in regulated environments

How it is used:
- Raw features (e.g., 'credit_score') are mapped to canonical concepts
  (e.g., 'CreditScore')
- Concepts are then used for:
    - Risk model explanations
    - RAG-based concept retrieval
    - Human-readable justifications
    - Schema-validated outputs

Design principle:
- Raw data stays technical
- Concepts stay semantic and business-oriented
"""
concepts["feature_to_concept"]


{'id': 'Id',
 'status': 'Status',
 'credit_score': 'CreditScore',
 'ltv': 'LoanToValue',
 'income': 'Income',
 'loan_amount': 'LoanAmount',
 'rate_of_interest': 'InterestRate',
 'dtir1': 'DebtToIncomeRatio',
 'credit_worthiness': 'CreditWorthiness',
 'loan_type': 'LoanType',
 'loan_purpose': 'LoanPurpose',
 'property_value': 'PropertyValue',
 'region': 'Region',
 'year': 'Year',
 'interest_rate_spread': 'InterestRateSpread',
 'upfront_charges': 'UpfrontCharges',
 'term': 'Term',
 'loan_limit': 'LoanLimit',
 'gender': 'Gender',
 'approv_in_adv': 'ApprovInAdv',
 'open_credit': 'OpenCredit',
 'business_or_commercial': 'BusinessOrCommercial',
 'age': 'Age'}

# ---------- Identifiers & Metadata ----------
    'id': 'Id',                         # Unique application or record identifier
    'status': 'Status',                 # Current processing or approval status
    'year': 'Year',                     # Application or loan issuance year

    # ---------- Borrower Financial Profile ----------
    'credit_score': 'CreditScore',      # Creditworthiness indicator
    'income': 'Income',                 # Applicant's gross income
    'age': 'Age',                       # Applicant's age
    'gender': 'Gender',                 # Applicant's gender (if applicable)

    # ---------- Loan Characteristics ----------
    'loan_amount': 'LoanAmount',        # Requested loan principal
    'loan_type': 'LoanType',             # Type of loan (e.g., mortgage, personal)
    'loan_purpose': 'LoanPurpose',       # Intended use of the loan
    'term': 'Term',                     # Loan duration
    'loan_limit': 'LoanLimit',           # Maximum allowed loan amount

    # ---------- Risk & Affordability Metrics ----------
    'ltv': 'LTV',                        # Loan-to-Value ratio
    'dtir1': 'DTIR1',                    # Debt-to-Income ratio (variant 1)
    'dti': 'DebtToIncomeRatio',          # General DTI metric
    'credit_worthiness': 'CreditWorthiness',  # Aggregated credit risk assessment

    # ---------- Interest & Cost Structure ----------
    'rate_of_interest': 'InterestRate',          # Base interest rate
    'interest_rate_spread': 'InterestRateSpread',# Risk-based rate adjustment
    'upfront_charges': 'UpfrontCharges',          # Fees paid at loan initiation

    # ---------- Contextual & Operational Features ----------
    'property_value': 'PropertyValue',   # Value of collateral property
    'region': 'Region',                  # Geographic region
    'open_credit': 'OpenCredit',          # Existing open credit lines
    'business_or_commercial': 'BusinessOrCommercial',  # Loan usage category
    'approve_in_adv': 'ApproveInAdv'      # Pre-approval indicator
}

In [None]:
!pip -q install langchain langchain-community langchain-text-splitters faiss-cpu sentence-transformers joblib


In [None]:
import json, pandas as pd
from pathlib import Path

CSV_PATH = "/content/Loan_Default_Cleaned.csv"
CONCEPTS_PATH = "/content/concepts.json"
DICT_PATH = "/content/data_dictionary.csv"
SCHEMA_PATH = "/content/response_schema.md"
META_PATH = "/content/model_meta.json"

df = pd.read_csv(CSV_PATH)
concepts = json.load(open(CONCEPTS_PATH, "r", encoding="utf-8"))
data_dict = pd.read_csv(DICT_PATH)
schema_md = open(SCHEMA_PATH, "r", encoding="utf-8").read()
model_meta = json.load(open(META_PATH, "r", encoding="utf-8"))

print("df shape:", df.shape)
print("df columns:", len(df.columns))
print("concept keys:", list(concepts.keys()))
print("schema lines:", len(schema_md.splitlines()))
print("model_meta:", model_meta)
df.head()


df shape: (120488, 34)
df columns: 34
concept keys: ['risk_buckets', 'offer_types', 'conditions_catalog', 'feature_to_concept', 'policy_topics']
schema lines: 29
model_meta: {'sklearn': '1.8.0'}


Unnamed: 0,id,year,loan_limit,gender,approv_in_adv,loan_type,loan_purpose,credit_worthiness,open_credit,business_or_commercial,...,credit_type,credit_score,co-applicant_credit_type,age,submission_of_application,ltv,region,security_type,status,dtir1
0,24890,2019,cf,sex not available,nopre,type1,p1,l1,nopc,nob/c,...,exp,758.0,cib,25-34,to_inst,98.728814,south,direct,1.0,45.0
1,24891,2019,cf,male,nopre,type2,p1,l1,nopc,b/c,...,equi,552.0,exp,55-64,to_inst,75.152439,north,direct,1.0,39.0
2,24892,2019,cf,male,pre,type1,p1,l1,nopc,nob/c,...,exp,834.0,cib,35-44,to_inst,80.019685,south,direct,0.0,46.0
3,24893,2019,cf,male,nopre,type1,p4,l1,nopc,nob/c,...,exp,587.0,cib,45-54,not_inst,69.3769,north,direct,0.0,42.0
4,24894,2019,cf,joint,pre,type1,p1,l1,nopc,nob/c,...,crif,602.0,exp,25-34,not_inst,91.886544,north,direct,0.0,39.0


In [None]:
TARGET = "status"

# 1) target to int
df[TARGET] = df[TARGET].astype(int)

# 2) required columns for your agent
REQUIRED_FOR_AGENT = ["income", "credit_score", "loan_amount", "ltv", "dtir1", "age"]
missing = [c for c in REQUIRED_FOR_AGENT if c not in df.columns]
print("Missing required columns:", missing)

# 3) quick type normalization for numeric columns
for c in ["income","credit_score","loan_amount","ltv","dtir1","age"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

print(df[REQUIRED_FOR_AGENT + [TARGET]].isna().mean().sort_values(ascending=False).head(10))


Missing required columns: []
age             1.0
credit_score    0.0
income          0.0
loan_amount     0.0
ltv             0.0
dtir1           0.0
status          0.0
dtype: float64


In [None]:
"""
Logistic Regression Risk Model (Scikit-Learn Pipeline)

This block trains a baseline binary classification model (Logistic Regression)
to predict TARGET (e.g., default = 1, non-default = 0).

Key design goals:
1) Prevent data leakage by fitting preprocessing only on the training split.
2) Handle mixed data types correctly:
   - Numerical: impute missing values + standardize
   - Categorical: impute missing values + one-hot encode
3) Evaluate with ROC-AUC (threshold-independent ranking metric).
"""

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# ------------------------
# 1) Separate features (X) and label (y)
# ------------------------
# X contains all predictors; y is the binary target variable.
X = df.drop(columns=[TARGET])
y = df[TARGET].astype(int)

# ------------------------
# 2) Detect numeric vs categorical columns
# ------------------------
# Numeric columns will be scaled; categorical columns will be one-hot encoded.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# ------------------------
# 3) Define preprocessing pipelines
# ------------------------
# Numeric pipeline:
# - median imputation handles missing numeric values robustly (less sensitive to outliers)
# - standardization improves optimization stability for Logistic Regression
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical pipeline:
# - most_frequent fills missing categories
# - one-hot encoding converts categories to binary indicator columns
# - handle_unknown="ignore" prevents inference-time crashes if new categories appear
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# ColumnTransformer applies the correct preprocessing to each column group
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ]
)

# ------------------------
# 4) Define the model + full training pipeline
# ------------------------
# max_iter increased to ensure convergence on larger / sparse one-hot feature spaces
model = LogisticRegression(max_iter=2000, n_jobs=None)

# Full pipeline: preprocessing -> model
# This guarantees preprocessing is fitted only on training data.
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", model)
])

# ------------------------
# 5) Train-test split
# ------------------------
# stratify=y preserves class ratio (important for imbalanced datasets)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------
# 6) Fit + evaluate
# ------------------------
pipe.fit(X_train, y_train)

# Predict probability of the positive class (class 1)
p = pipe.predict_proba(X_test)[:, 1]

# ROC-AUC measures how well the model ranks positives above negatives
auc = roc_auc_score(y_test, p)
print("ROC-AUC:", auc)

"""
Note about the warning you see:

"Skipping features without any observed values: ['age']"

Meaning:
- In the TRAIN split, the 'age' column had no usable (non-missing) values.
- Median imputation cannot compute a median if all values are missing.
- Scikit-learn therefore skips that feature during fitting.

Fix options (choose one):
1) Drop columns that are entirely missing before training.
2) Use SimpleImputer(add_indicator=True) to keep a missingness flag.
3) Ensure 'age' is correctly loaded/parsed (not all NaN due to bad parsing).
"""




ROC-AUC: 0.860401268974365




In [None]:
import joblib, json

joblib.dump(pipe, "risk_model.joblib")

artifact = {
    "target": TARGET,
    "input_columns": list(X.columns),
    "required_for_agent": REQUIRED_FOR_AGENT,
    "risk_thresholds": {"low": 0.30, "high": 0.70},
}
json.dump(artifact, open("agent_artifacts.json","w"), indent=2)

print("Saved: risk_model.joblib, agent_artifacts.json")


Saved: risk_model.joblib, agent_artifacts.json


In [None]:
"""
RAG Knowledge Preparation Pipeline

This block prepares structured project knowledge to be used by a
Retrieval-Augmented Generation (RAG) system.

It:
1) Wraps multiple structured sources into LangChain Document objects
2) Attaches metadata to preserve source traceability
3) Splits large documents into overlapping chunks for semantic retrieval

These chunks will later be embedded and indexed (e.g., FAISS) so the LLM
can answer ONLY from approved, auditable sources.
"""

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
import json

# --------------------------------------------------
# 1. Create source documents
# --------------------------------------------------
# Each Document represents an authoritative knowledge source.
# Metadata is used later to identify where an answer came from.

docs = [
    # Core domain concepts and mappings
    Document(
        page_content=json.dumps(concepts, ensure_ascii=False, indent=2),
        metadata={"source": "concepts_json"}
    ),

    # Output format contract (JSON schema / response rules)
    Document(
        page_content=schema_md,
        metadata={"source": "response_schema"}
    ),

    # Feature definitions and field-level descriptions
    Document(
        page_content=data_dict.to_csv(index=False),
        metadata={"source": "data_dictionary"}
    )
]

# --------------------------------------------------
# 2. Configure the text splitter
# --------------------------------------------------
# RecursiveCharacterTextSplitter:
# - Preserves semantic structure where possible
# - Falls back gracefully to smaller separators if needed
# - Overlap ensures important context is not lost between chunks

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,           # Max characters per chunk
    chunk_overlap=120,        # Overlap to maintain continuity
    separators=["\n\n", "\n", ". ", " ", ""]
)

# --------------------------------------------------
# 3. Split documents into retrievable chunks
# --------------------------------------------------
# Each chunk keeps the original metadata so we can trace answers
# back to their authoritative source.

chunks = splitter.split_documents(docs)

# --------------------------------------------------
# 4. Sanity check / debugging output
# --------------------------------------------------
# Confirms how many source documents and final chunks were created
# Prints a preview to verify content and metadata correctness.

print("docs:", len(docs), "chunks:", len(chunks))
print(chunks[0].metadata, chunks[0].page_content[:200])


docs: 3 chunks: 7
{'source': 'concepts_json'} {
  "risk_buckets": [
    "Low",
    "Medium",
    "High"
  ],
  "offer_types": [
    "Approve",
    "Approve_with_conditions",
    "Decline",
    "ManualReview"
  ],
  "conditions_catalog": [
    "Re


In [None]:
# --------------------------------------------------
# Create embeddings and FAISS vector index
# --------------------------------------------------
# This block converts text chunks into dense vector embeddings
# and stores them in a FAISS index for fast semantic retrieval.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Initialize the embedding model
# - Uses a lightweight sentence-transformer
# - Optimized for semantic similarity search
# - Suitable for production RAG systems
emb = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Build the FAISS index from document chunks
# - Each chunk is embedded into a vector
# - FAISS enables efficient nearest-neighbor search
vs = FAISS.from_documents(chunks, emb)

# Create a retriever interface
# - Returns the top-k most similar chunks for a query
# - Used later by the RAG pipeline
retriever = vs.as_retriever(search_kwargs={"k": 4})

# Save the FAISS index for reuse (e.g., Streamlit app)
# - Avoids recomputing embeddings on every run
vs.save_local("faiss_index")
print("Saved FAISS index to: faiss_index/")


Saved FAISS index to: faiss_index/


In [None]:
# --------------------------------------------------
# RAG retrieval helper
# --------------------------------------------------
# This function retrieves the top-k most relevant text chunks
# from the FAISS vector store for a given query.
# It is used to supply grounded context to the LLM.

def rag_fetch(query: str, k: int = 4):

    # NOTE:
    # The standard LangChain retriever interface raised an AttributeError
    # for get_relevant_documents in this environment.
    # To avoid abstraction issues and keep behavior explicit and stable,
    # we directly call FAISS's similarity_search method.

    # similarity_search:
    # - embeds the query
    # - performs vector similarity matching
    # - returns the k closest document chunks
    return vs.similarity_search(query, k=k)


# --------------------------------------------------
# Example usage / sanity check
# --------------------------------------------------
# This query tests whether the RAG index correctly retrieves
# domain knowledge related to DTIR1.

hits = rag_fetch("What is DTIR1 and how is it used?", k=4)

# Each returned item is a Document containing:
# - metadata: source of the knowledge (traceability)
# - page_content: the retrieved text chunk
# We print a short preview to verify correctness.

[
    (
        h.metadata.get("source"),                # origin of the chunk (e.g., concepts_json)
        h.page_content[:120].replace("\n", " ")  # compact preview for readability
    )
    for h in hits
]



[('response_schema',
  '# Chatbot Output Schema (Always Return This Structure)  ## 1) Risk_Assessment - risk_bucket: Low | Medium | High - confi'),
 ('data_dictionary',
  'co-applicant_credit_type,object,0.0,2,"cib, exp" submission_of_application,object,0.0,2,"to_inst, not_inst" security_typ'),
 ('data_dictionary',
  'loan_purpose,object,0.0,4,"p1, p4, p3" total_units,object,0.0,4,"1u, 2u, 3u" credit_type,object,0.0,4,"exp, equi, crif" '),
 ('response_schema',
  '## 5) Evidence - key_features_used: {feature: value} - cohort_stats: short stats (optional) - notes: any assumptions  ##')]

# ***Final code***

In [None]:
!pip -q uninstall -y numpy scipy scikit-learn
!pip -q install "numpy<2.1" "scipy<1.12" "scikit-learn<1.5"
!pip -q install -U langchain langchain-community langchain-text-splitters faiss-cpu sentence-transformers transformers accelerate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.8/35.8 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
esda 2.8.0 requires scipy>=1.12, but you have scipy 1.11.4 which is incompatible.
giddy 2.3.8 req

In [None]:
import numpy, scipy, sklearn
print("numpy:", numpy.__version__)
print("scipy:", scipy.__version__)
print("sklearn:", sklearn.__version__)


numpy: 2.0.2
scipy: 1.16.3
sklearn: 1.6.1


In [None]:
import os
import re
import json
import math
import joblib
import numpy as np
import pandas as pd

from typing import Dict, Any, List, Optional, Tuple

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document


import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [None]:
import numpy, scipy, sklearn
print("numpy:", numpy.__version__)
print("scipy:", scipy.__version__)
print("sklearn:", sklearn.__version__)
print("torch:", torch.__version__)


numpy: 2.0.2
scipy: 1.16.3
sklearn: 1.6.1
torch: 2.9.0+cu126


In [None]:
# ---------- 1) Paths ----------
CSV_PATH       = "/content/Loan_Default_Cleaned.csv"
CONCEPTS_PATH  = "/content/concepts.json"
DICT_PATH      = "/content/data_dictionary.csv"       # optional (ok if missing)
SCHEMA_PATH    = "/content/response_schema.md"
MODEL_PATH     = "/content/risk_model.joblib"
ART_PATH       = "/content/agent_artifacts.json"
FAISS_DIR      = "/content/faiss_index"

TARGET_COL     = "Status"  # change if your dataset uses 'status' or another label


This code is a utility layer for input cleaning and safety.

It runs before the model, rules, or RAG logic and ensures that:

Files are read safely without crashing if they are missing

User inputs like "$3,500", "45%", or "1,200" are converted into valid numbers

Values are kept within acceptable ranges

Age values written as text or ranges (e.g., "25–34") are converted into usable numeric values

Its role in the system:

Raw user input
→ Input normalization & safety (this code)
→ Feature engineering / rules
→ Model & RAG explanation
→ Reliable output


Without this layer, real human input would easily break the system or lead to incorrect decisions.

In [None]:


# ---------- 2) Small utilities ----------
def _safe_read_text(path: str) -> str:
    if not os.path.exists(path):
        return ""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

def _safe_read_json(path: str) -> Dict[str, Any]:
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return {}

def _to_number(x: Any) -> Optional[float]:
    if x is None:
        return None
    s = str(x).strip().replace(",", "")
    if s == "":
        return None

    # allow $, %
    s = s.replace("$", "").replace("%", "")

    try:
        return float(s)
    except Exception:
        return None

def _clamp(v: float, lo: float, hi: float) -> float:
    return max(lo, min(hi, v))

def age_to_num(x: Any) -> Optional[float]:
    """
    Accepts:
      - "25-34", "25 to 34 years" -> average
      - "35", "35 years"          -> 35
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return None

    s = str(x).strip().lower()

    # range: 25-34 OR 25 to 34
    m = re.search(r"(\d{1,3})\s*(?:-|to)\s*(\d{1,3})", s)
    if m:
        a, b = int(m.group(1)), int(m.group(2))
        return (a + b) / 2.0

    # single number
    m = re.search(r"(\d{1,3})", s)
    if m:
        return float(m.group(1))

    return None


In [None]:
# ---------- 3) Load data + minimal cleaning ----------
df = pd.read_csv(CSV_PATH)

# Normalize target name
if TARGET_COL not in df.columns:
    if "status" in df.columns:
        TARGET_COL = "status"
    elif "Status" in df.columns:
        TARGET_COL = "Status"
    else:
        raise ValueError(
            f"TARGET_COL '{TARGET_COL}' not found. Available columns: {list(df.columns)[:30]} ..."
        )

# Ensure target is numeric
df[TARGET_COL] = pd.to_numeric(df[TARGET_COL], errors="coerce")
df = df.dropna(subset=[TARGET_COL])

# Cast to int
df[TARGET_COL] = df[TARGET_COL].astype(int)

# Normalize target to 0/1 (handles -1/1, 1/2, etc.)
unique_y = sorted(df[TARGET_COL].unique().tolist())
if unique_y == [-1, 1]:
    df[TARGET_COL] = (df[TARGET_COL] == 1).astype(int)
elif unique_y == [1, 2]:
    df[TARGET_COL] = (df[TARGET_COL] == 2).astype(int)
else:
    # If already 0/1, keep; otherwise leave as-is (but warn)
    if set(unique_y) != {0, 1} and len(unique_y) <= 5:
        print(f"Warning: unusual target values: {unique_y}")

# If you have an 'age' column, convert it; otherwise ignore
if "age" in df.columns:
    df["age"] = df["age"].apply(age_to_num).astype("float64")

print("Data shape:", df.shape)
print("Target:", TARGET_COL, "pos rate:", float(df[TARGET_COL].mean()))


Data shape: (120488, 34)
Target: status pos rate: 0.2465390744306487


In [None]:
# ---------- 4) Train (or load) risk model ----------
def train_or_load_model(df: pd.DataFrame, target: str) -> Tuple[Pipeline, Dict[str, Any]]:
    if os.path.exists(MODEL_PATH) and os.path.exists(ART_PATH):
        pipe = joblib.load(MODEL_PATH)
        art = _safe_read_json(ART_PATH)
        return pipe, art

    X = df.drop(columns=[target])
    y = df[target].astype(int)

    # Robust numeric detection
    num_cols = X.select_dtypes(include=["number"]).columns.tolist()
    cat_cols = [c for c in X.columns if c not in num_cols]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    num_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    cat_pipe = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocess = ColumnTransformer(
        transformers=[
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols),
        ]
    )

    model = LogisticRegression(max_iter=2000)

    pipe = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", model)
    ])

    pipe.fit(X_train, y_train)

    input_columns = list(X.columns)

    # REQUIRED fields for chatbot dialog (minimal, but reliable)
    candidates = ["income", "credit_score", "loan_amount", "ltv", "dtir1", "dti", "age"]
    input_cols_lower = {c.lower(): c for c in input_columns}

    required_for_agent = []
    for c in candidates:
        if c in input_cols_lower:
            required_for_agent.append(input_cols_lower[c])

    # Store BOTH: full model columns + minimal chatbot fields (canonical names)
    risk_thresholds = {"low": 0.30, "high": 0.70}

    art = {
        "target": target,
        "input_columns": input_columns,                 # columns used by trained model
        "required_for_agent": candidates,               # canonical names the agent expects
        "column_name_map": input_cols_lower,            # helps map canonical->actual df columns
        "risk_thresholds": risk_thresholds,
    }

    joblib.dump(pipe, MODEL_PATH)
    with open(ART_PATH, "w", encoding="utf-8") as f:
        json.dump(art, f, indent=2, ensure_ascii=False)

    print("Saved:", MODEL_PATH, ART_PATH)
    return pipe, art


pipe, art = train_or_load_model(df, TARGET_COL)

INPUT_COLS: List[str] = art["input_columns"]
REQ: List[str] = art["required_for_agent"]          # canonical keys
COLMAP: Dict[str, str] = art.get("column_name_map", {})
TH: Dict[str, float] = art["risk_thresholds"]

print("Required fields (canonical):", REQ)


Required fields (canonical): ['income', 'credit_score', 'loan_amount', 'ltv', 'dtir1', 'age']


In [None]:
import shutil
shutil.rmtree("/content/faiss_index", ignore_errors=True)
print("FAISS index removed")


FAISS index removed


In [None]:
def concepts_to_documents(concepts: dict) -> List[Document]:
    docs = []

    for key, val in concepts.items():
        # فقط مفاهیم واقعی، نه mappingها
        if not isinstance(val, dict):
            continue
        if "description" not in val:
            continue

        text = f"""
Concept: {key}
Name: {val.get('name', key)}
Description: {val.get('description', '')}
Used in risk model: {val.get('used_in_risk', False)}
Related features: {", ".join(val.get('related_features', []))}
""".strip()

        docs.append(
            Document(
                page_content=text,
                metadata={
                    "source": "concepts.json",
                    "concept": key
                }
            )
        )

    return docs


In [None]:
# ---------- 5) Build / load RAG index ----------
concepts = _safe_read_json(CONCEPTS_PATH)
schema_md = _safe_read_text(SCHEMA_PATH)

dict_text = ""
if os.path.exists(DICT_PATH):
    try:
        dict_df = pd.read_csv(DICT_PATH)
        dict_text = dict_df.head(200).to_csv(index=False)
    except Exception:
        dict_text = _safe_read_text(DICT_PATH)

def _needs_rebuild(index_dir: str, watched_files: List[str]) -> bool:
    """Rebuild if index missing OR any watched file is newer than index."""
    if not os.path.isdir(index_dir):
        return True
    try:
        index_mtime = os.path.getmtime(index_dir)
    except Exception:
        return True

    for p in watched_files:
        if os.path.exists(p):
            try:
                if os.path.getmtime(p) > index_mtime:
                    return True
            except Exception:
                pass
    return False

def build_or_load_faiss(force_rebuild: bool = False) -> FAISS:
    emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    watched = [CONCEPTS_PATH, SCHEMA_PATH, DICT_PATH]

    # Load if OK
    if not force_rebuild and not _needs_rebuild(FAISS_DIR, watched):
        try:
            return FAISS.load_local(
                FAISS_DIR,
                emb,
                allow_dangerous_deserialization=True
            )
        except Exception:
            pass  # rebuild

    # Build docs
    docs: List[Document] = []

    # Concepts
    if isinstance(concepts, dict) and concepts:
        docs.extend(concepts_to_documents(concepts))
    else:
        docs.append(Document(page_content="No concepts loaded.", metadata={"source": "concepts.json"}))

    # Schema (useful for response formatting/policy)
    if schema_md and schema_md.strip():
        docs.append(Document(page_content=schema_md, metadata={"source": "response_schema.md"}))

    # OPTIONAL: data dictionary
    # If you keep it, name the metadata consistently so later filters work.
    # If you don't need it for answering, comment this out.
    if dict_text and dict_text.strip():
        docs.append(Document(page_content=dict_text, metadata={"source": "data_dictionary"}))

    splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=120)
    chunks = splitter.split_documents(docs)

    vs = FAISS.from_documents(chunks, emb)
    os.makedirs(FAISS_DIR, exist_ok=True)
    vs.save_local(FAISS_DIR)

    print("RAG built. Chunks:", len(chunks))
    return vs

vs = build_or_load_faiss(force_rebuild=False)

def rag_fetch(query: str, k: int = 4, allowed_sources: Optional[set] = None) -> List[Document]:
    docs = vs.similarity_search(query, k=k)
    if allowed_sources:
        docs = [d for d in docs if d.metadata.get("source") in allowed_sources]
    return docs


RAG built. Chunks: 3


This code is the system’s “speaking brain”: it loads the FLAN-T5 language model and generates clear, human-readable text responses from a given prompt.

In [None]:
# ---------- 6) Load LLM (FLAN-T5) ----------
LLM_MODEL = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)

device = "cuda" if torch.cuda.is_available() else "cpu"
llm = llm.to(device)

print("LLM loaded:", LLM_MODEL, "on", device)

def llm_generate(prompt: str, max_new_tokens: int = 180, deterministic: bool = True) -> str:
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(device)

    with torch.no_grad():
        if deterministic:
            out = llm.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                num_beams=1
            )
        else:
            out = llm.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                num_beams=1
            )

    return tokenizer.decode(out[0], skip_special_tokens=True).strip()



tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

LLM loaded: google/flan-t5-base on cuda


This code decides what the user is trying to do before taking any action.

It routes each message into one of two intents:

concept → the user is asking for definitions, explanations, policies, or schemas

application → the user is providing numbers or asking for a loan decision

The logic is rule-first:

If the message looks like code → treat it as concept

If it contains two or more numbers → treat it as application

If it contains concept-related keywords → treat it as concept

Otherwise, default to concept (safer and avoids unnecessary data collection)

An LLM-based classifier exists as a fallback, but the system prefers rules to stay fast, stable, and predictable.

In [None]:
# ---------- 7) Intent routing (RULE-FIRST, robust) ----------

# explicit application triggers
APPLICATION_KEYWORDS = [
    "apply for a loan", "loan application", "i want to apply", "apply loan",
    "get a loan", "loan request", "i want a loan", "need a loan", "mortgage application",
]

# definition / concept triggers (but must be loan-domain)
DEF_PREFIX = ("what is", "what does", "define", "explain", "meaning of")

# loan-domain tokens (guard concepts + help detect application)
LOAN_TOKENS = {
    "loan", "mortgage", "credit", "risk", "default",
    "ltv", "dti", "dtir", "dtir1", "income", "credit score", "loan amount", "age",
}

# if user mentions these, it's very likely an application message (even one-by-one)
FIELD_TOKENS = {
    "income", "salary",
    "credit score", "credit_score", "score",
    "loan amount", "loan_amount", "amount", "borrow",
    "ltv", "dti", "dtir", "dtir1",
    "age", "years old", "yo",
}

def looks_like_code(t: str) -> bool:
    t = (t or "").strip().lower()
    return (
        "import " in t or "from " in t or "def " in t or "class " in t
        or t.startswith("!pip") or "```" in t
    )

def _has_any(text: str, terms: set[str]) -> bool:
    t = (text or "").lower()
    return any(term in t for term in terms)

def _starts_with_any(text: str, prefixes: tuple[str, ...]) -> bool:
    t = (text or "").strip().lower()
    return any(t.startswith(p) for p in prefixes)

def classify_intent_llm(user_text: str) -> str:
    # optional fallback; keep deterministic in llm_generate(deterministic=True)
    prompt = f"""
You route messages for a loan risk assistant.

Return ONLY one label: concept OR application OR out_of_scope.

- concept: user asks for meaning/definition/explanation of loan terms like DTI/LTV/DTIR1/credit score.
- application: user provides applicant numbers or loan fields (income, credit score, loan amount, LTV, DTIR1, age) or asks for a decision.
- out_of_scope: anything else.

Message:
{user_text}
""".strip()

    y = llm_generate(prompt, max_new_tokens=8, deterministic=True).lower()
    if "application" in y:
        return "application"
    if "concept" in y:
        return "concept"
    return "out_of_scope"

def route_intent(user_text: str) -> str:
    t = (user_text or "").strip().lower()
    if not t:
        return "out_of_scope"

    # 0) code always treated as out_of_scope (or concept if you prefer), but NOT application
    if looks_like_code(t):
        return "out_of_scope"

    # 1) explicit application keywords
    if any(k in t for k in APPLICATION_KEYWORDS):
        return "application"

    # 2) if it mentions any loan field token -> application (supports step-by-step input)
    #    examples: "my income is 4500" / "credit score 680" / "ltv 75"
    if _has_any(t, FIELD_TOKENS):
        return "application"

    # 3) concept questions only if: definition prompt AND within loan domain
    #    example: "what does dtir1 mean?" -> concept
    if _starts_with_any(t, DEF_PREFIX) and _has_any(t, LOAN_TOKENS):
        return "concept"

    # 4) short single-term concept query: "dtir1" / "ltv" / "credit score"
    if len(t.split()) <= 3 and _has_any(t, {"dtir1", "dtir", "dti", "ltv", "credit score"}):
        return "concept"

    # 5) numbers alone are NOT enough; otherwise random messages with numbers go application
    # If you REALLY want numeric heuristic, gate it with loan domain tokens:
    nums = re.findall(r"\d+(?:\.\d+)?", t)
    if len(nums) >= 2 and _has_any(t, LOAN_TOKENS):
        return "application"

    # 6) fallback: out_of_scope (prevents "langchain?" going to concept/application)
    # If you want: return classify_intent_llm(user_text) instead.
    return "out_of_scope"




This code extracts key loan-related numbers from free-text user input (income, credit score, loan amount, DTI, age), cleans and converts them to proper numeric values, and reduces common extraction mistakes by using strict patterns and selecting the last valid match


In [None]:
# ---------- 8) Field extraction (regex, SAFE) ----------
import re
from typing import Dict, Any, Optional

def extract_fields_regex(text: str) -> Dict[str, Any]:
    t = (text or "").lower()
    out: Dict[str, Any] = {}

    def pick_last_number(pattern: str, group: int) -> Optional[float]:
        matches = list(re.finditer(pattern, t, flags=re.IGNORECASE))
        if not matches:
            return None
        return _to_number(matches[-1].group(group))

    # ---------- income ----------
    out["income"] = pick_last_number(
        r"\b(income|monthly\s*income|salary)\b[^\d]{0,40}(\d[\d,]*(?:\.\d+)?)",
        group=2
    )

    # ---------- credit score ----------
    out["credit_score"] = pick_last_number(
        r"\b(credit\s*score|score)\b[^\d]{0,40}(\d{2,4})",
        group=2
    )

    # ---------- loan amount ----------
    # CRITICAL FIX: never match "loan" alone
    out["loan_amount"] = pick_last_number(
        r"\b("
        r"(requested\s*)?(loan\s*amount|loan\s*amt|loan\s*value|borrow(?:ed)?\s*amount|amount\s*requested)"
        r"|amount\s*(is|=)"
        r")\b[^\d]{0,60}(\d[\d,]*(?:\.\d+)?)",
        group=5
    )

    # ---------- LTV ----------
    out["ltv"] = pick_last_number(
        r"\bltv\b[^\d]{0,40}(\d[\d,]*(?:\.\d+)?)\s*%?",
        group=1
    )

    # ---------- DTIR1 ----------
    out["dtir1"] = pick_last_number(
        r"\bdtir1\b[^\d]{0,40}(\d[\d,]*(?:\.\d+)?)\s*%?",
        group=1
    )

    # ---------- DTI (only if dtir1 not present) ----------
    if out.get("dtir1") is None:
        out["dti"] = pick_last_number(
            r"\bdti\b[^\d]{0,40}(\d[\d,]*(?:\.\d+)?)\s*%?",
            group=1
        )

    # ---------- age ----------
    out["age"] = pick_last_number(
        r"\b(age|years\s*old|yo)\b[^\d]{0,40}(\d{1,3})",
        group=2
    )

    # ---------- type enforcement ----------
    if out.get("credit_score") is not None:
        out["credit_score"] = int(out["credit_score"])
    if out.get("age") is not None:
        out["age"] = int(out["age"])

    # ---------- HARD GUARD (very important) ----------
    # Prevent loan_amount accidentally being income
    if out.get("loan_amount") and out.get("income"):
        if out["loan_amount"] <= out["income"]:
            out["loan_amount"] = None

    return out


This code is the decision engine that turns model output into a clear loan decision.

predict_default_probability
Takes the user’s known inputs, runs them through the trained model, and returns a default probability between 0 and 1.

map_prob_to_bucket
Converts that probability into a risk bucket:

Low risk

Medium risk

High risk
using predefined thresholds.

offer_engine
Translates the risk bucket into a final decision:

Low → Approve

Medium → Approve with conditions

High → Decline

One-line summary:
This code converts model probabilities into risk levels and then into a clear loan approval decision with optional conditions.

In [None]:
# ---------- 9) Risk + offer engine ----------
from typing import Dict, Any
import pandas as pd

def predict_default_probability(known: Dict[str, Any]) -> float:
    # Build a single-row dataframe with EXACT training columns
    row = {c: known.get(c, None) for c in INPUT_COLS}
    X_one = pd.DataFrame([row], columns=INPUT_COLS)

    # Safety: some models may not expose predict_proba
    if not hasattr(pipe, "predict_proba"):
        raise RuntimeError("Model does not support predict_proba(). Train a classifier with probability outputs.")

    p = float(pipe.predict_proba(X_one)[:, 1][0])
    return _clamp(p, 0.0, 1.0)

def map_prob_to_bucket(p: float) -> str:
    p = float(p)
    if p < float(TH["low"]):
        return "Low"
    if p < float(TH["high"]):
        return "Medium"
    return "High"

def offer_engine(bucket: str) -> Dict[str, Any]:
    b = (bucket or "").strip().title()  # "low" -> "Low"
    if b == "Low":
        return {"decision": "Approve", "conditions": []}
    if b == "Medium":
        return {
            "decision": "Approve_with_conditions",
            "conditions": ["Request additional documents", "Reduce loan amount"],
        }
    return {"decision": "Decline", "conditions": []}





This code finds the correct concept definition from concepts.json based on the user’s text.

_normalize_key
Cleans text by lowercasing it and removing spaces/symbols, so matching is consistent.

lookup_concept_from_json
Tries to identify which concept the user is asking about:

Direct match: checks if the normalized concept name (e.g., DTIR1) appears in the user’s text and returns its definition.

Fallback mapping: if no direct match is found, it uses a feature_to_concept mapping to link features (e.g., dti) to a concept and returns that definition.

If nothing matches, it returns None.

One-line summary:
This code maps a user’s question to the right concept in concepts.json and returns its definition in a robust, typo-tolerant way.

In [None]:
import re
from typing import Dict, Any, Optional

def _normalize_key(s: str) -> str:
    # keep only letters/numbers -> stable matching
    return re.sub(r"[^a-z0-9]+", "", str(s).lower())

def _tokenize_norm(s: str) -> set[str]:
    # words (normalized) for boundary-like matching
    raw_tokens = re.findall(r"[a-z0-9]+", str(s).lower())
    return { _normalize_key(tok) for tok in raw_tokens if tok }

def lookup_concept_from_json(user_text: str, concepts: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if not isinstance(concepts, dict) or not user_text:
        return None

    t_norm = _normalize_key(user_text)
    t_tokens = _tokenize_norm(user_text)

    # 1) direct hit on top-level concept objects (e.g., "DTIR1": {...})
    for k, v in concepts.items():
        if not isinstance(v, dict):
            continue
        if not ("description" in v or "name" in v):
            continue

        k_norm = _normalize_key(k)
        # match by substring OR by token (handles "dtir1?" / "what does dtir1 mean")
        if (k_norm and k_norm in t_norm) or (k_norm in t_tokens):
            return {"key": k, "data": v}

    # 2) fallback via feature_to_concept mapping (if present)
    ftc = concepts.get("feature_to_concept")
    if isinstance(ftc, dict):
        for feat, concept_name in ftc.items():
            feat_norm = _normalize_key(feat)
            if not feat_norm:
                continue

            if (feat_norm in t_norm) or (feat_norm in t_tokens):
                # resolve mapped concept object
                if concept_name in concepts and isinstance(concepts[concept_name], dict):
                    return {"key": concept_name, "data": concepts[concept_name]}

                # fallback object if mapping points to missing concept
                return {
                    "key": str(concept_name),
                    "data": {
                        "name": str(concept_name),
                        "description": f"Mapped from feature: {feat}"
                    }
                }

    return None


This function answers loan-related concept questions by first returning an exact definition from concepts.json when available, otherwise safely retrieving relevant documents with RAG and using an LLM to generate a controlled explanation only from approved context, while rejecting out-of-scope questions.

In [None]:
from typing import Dict, Any, List

# ---------- 10) RAG answer (concept questions) ----------
def concept_answer(user_text: str, k: int = 6) -> Dict[str, Any]:
    q = (user_text or "").strip()
    q_low = q.lower()

    # 0) Direct concept lookup (best practice)
    hit = lookup_concept_from_json(q, concepts)
    if hit:
        data = hit["data"] or {}
        name = data.get("name", hit.get("key", "Concept"))
        desc = (data.get("description") or "").strip()
        used = data.get("used_in_risk", data.get("used_in_model", None))
        rel = data.get("related_features", [])

        answer_lines: List[str] = []
        answer_lines.append(f"{name}: {desc}" if desc else f"{name}.")
        if used is not None:
            answer_lines.append(f"It is used in risk assessment: {bool(used)}.")
        if isinstance(rel, list) and rel:
            answer_lines.append(f"Related features: {', '.join(map(str, rel))}.")

        return {
            "type": "rag_answer",
            "answer": " ".join(answer_lines),
            "sources": ["concepts.json"],
        }

    # 1) Domain guard (reject out-of-scope questions)
    # IMPORTANT: include dtir1 explicitly so "What does DTIR1 mean?" never gets rejected
    LOAN_TERMS = [
        "loan", "credit", "ltv", "dti", "dtir", "dtir1",
        "interest", "rate", "apr", "mortgage", "risk", "default"
    ]
    if not any(t in q_low for t in LOAN_TERMS):
        return {"type": "rag_answer", "answer": "Not found in provided context.", "sources": []}

    # 2) RAG retrieval with relevance scores
    # NOTE: FAISS scores are distances; smaller is better. 0.7 can be too strict.
    docs_scores = vs.similarity_search_with_score(q, k=k)

    # keep only reasonably relevant chunks (more forgiving threshold)
    docs = [d for d, score in docs_scores if score is None or score < 1.2]

    # 3) Block unsafe / non-explanatory sources
    BLOCKED_SOURCES = {"sample_data_head.csv", "data_dictionary"}
    docs = [d for d in docs if d.metadata.get("source") not in BLOCKED_SOURCES]

    if not docs:
        return {"type": "rag_answer", "answer": "Not found in provided context.", "sources": []}

    # 4) Build controlled context
    sources = sorted(set(d.metadata.get("source", "unknown") for d in docs))
    context = "\n\n".join(
        f"[{i+1}] ({d.metadata.get('source','unknown')})\n{d.page_content[:900]}"
        for i, d in enumerate(docs)
    )

    # 5) Strict prompt for LLM
    prompt = f"""
You are a loan-domain assistant.

Answer the user's question using ONLY the CONTEXT below.
If the answer is not present, say exactly:
"Not found in provided context."

STRICT RULES:
- Do NOT output raw data rows, CSV lines, or tables.
- Do NOT invent definitions or values.
- Explain concepts in clear, simple English.
- No numbers unless they appear explicitly in the context.

Write 4–8 sentences.

CONTEXT:
{context}

QUESTION:
{q}

ANSWER:
""".strip()

    ans = (llm_generate(prompt, max_new_tokens=220) or "").strip()
    if not ans:
        ans = "Not found in provided context."

    return {"type": "rag_answer", "answer": ans, "sources": sources}


AS a test, each definition you add in concept , the chatbot can get it and aswers the user, if you need any update you do not need to train model just change the concept

In [None]:
import json
import os

# IMPORTANT: use the SAME path you use everywhere else
# If your notebook uses /content/concepts.json, keep it that way.
CONCEPTS_PATH = "/content/concepts.json"  # <-- unify this with your project

# Load existing concepts.json (or start fresh)
if os.path.exists(CONCEPTS_PATH):
    with open(CONCEPTS_PATH, "r", encoding="utf-8") as f:
        concepts = json.load(f)
else:
    concepts = {}

# -------------------------
# Core risk concepts
# -------------------------
concepts["DTIR1"] = {
    "name": "DTIR1 (Debt-to-Income Ratio Tier 1)",
    "description": (
        "DTIR1 measures the ratio of a borrower’s monthly debt obligations "
        "to their gross monthly income. It is used to assess repayment capacity "
        "and credit risk."
    ),
    "used_in_risk": True,
    "related_features": ["income", "total_debt"],
}

concepts["DebtToIncomeRatio"] = {
    "name": "Debt-to-Income Ratio (DTI)",
    "description": (
        "Debt-to-Income Ratio compares total monthly debt payments "
        "to gross monthly income to evaluate affordability."
    ),
    "used_in_risk": True,
    "related_features": ["income", "total_debt"],
}

concepts["LTV"] = {
    "name": "Loan-to-Value Ratio (LTV)",
    "description": (
        "LTV represents the ratio of the loan amount to the value of the property. "
        "Higher LTV values generally indicate higher lending risk."
    ),
    "used_in_risk": True,
    "related_features": ["loan_amount", "property_value"],
}

concepts["CreditScore"] = {
    "name": "Credit Score",
    "description": (
        "A credit score is a numerical measure of a borrower’s creditworthiness, "
        "based on past repayment behavior and credit history."
    ),
    "used_in_risk": True,
    "related_features": ["credit_score"],
}

concepts["Income"] = {
    "name": "Income",
    "description": (
        "Income refers to the borrower’s gross periodic earnings and is a key factor "
        "in assessing repayment ability."
    ),
    "used_in_risk": True,
    "related_features": ["income"],
}

concepts["LoanAmount"] = {
    "name": "Loan Amount",
    "description": "Loan amount is the total amount of money requested by the borrower.",
    "used_in_risk": True,
    "related_features": ["loan_amount"],
}

concepts["Age"] = {
    "name": "Borrower Age",
    "description": (
        "Age represents the borrower’s age at the time of application and may be "
        "used as a supporting risk factor."
    ),
    "used_in_risk": False,
    "related_features": ["age"],
}

# -------------------------
# OPTIONAL: add simple aliases as top-level keys (helps direct matching)
# -------------------------
# These aliases make lookup succeed even if user types "DTI" or "Credit Score" etc.
concepts["DTI"] = concepts["DebtToIncomeRatio"]
concepts["credit score"] = concepts["CreditScore"]
concepts["loan amount"] = concepts["LoanAmount"]

# -------------------------
# Feature-to-concept mapping
# -------------------------
if "feature_to_concept" not in concepts or not isinstance(concepts["feature_to_concept"], dict):
    concepts["feature_to_concept"] = {}

concepts["feature_to_concept"].update({
    # main
    "dtir1": "DTIR1",
    "dti": "DebtToIncomeRatio",
    "ltv": "LTV",
    "credit_score": "CreditScore",
    "income": "Income",
    "loan_amount": "LoanAmount",
    "age": "Age",
    # extra aliases users may type
    "creditscore": "CreditScore",
    "credit score": "CreditScore",
    "loanamount": "LoanAmount",
    "loan amount": "LoanAmount",
})

# Save back to concepts.json
with open(CONCEPTS_PATH, "w", encoding="utf-8") as f:
    json.dump(concepts, f, indent=2, ensure_ascii=False)

print(f"✅ concepts.json updated at: {CONCEPTS_PATH}")


✅ concepts.json updated at: /content/concepts.json


In [None]:
# ---------- 11) Human-like explanation (LLM) ----------
def explanation_with_llm(
    p_default: float,
    bucket: str,
    offer: Dict[str, Any],
    known: Dict[str, Any],
    rag_sources: List[str],  # kept for compatibility; not used in prompt
) -> str:

    def _clean(text: str) -> str:
        # normalize whitespace, remove trailing junk
        t = (text or "").strip()
        # remove duplicated "OUTPUT:" if model repeats it
        t = t.replace("OUTPUT:", "").strip()
        return t

    def _count_bullets(t: str) -> int:
        return sum(1 for ln in (t or "").splitlines() if ln.strip().startswith("- "))

    def _has_two_fields(t: str) -> bool:
        # ensure it mentions at least 2 provided fields by name
        # include both dtir1 and dti to be safe
        names = ["income", "credit_score", "loan_amount", "ltv", "dtir1", "dti", "age"]
        t_low = (t or "").lower()
        mentioned = 0
        for n in names:
            # only count if user actually provided it
            if known.get(n) is not None and n in t_low:
                mentioned += 1
        return mentioned >= 2

    def _fallback() -> str:
        used = []
        for k in ["income", "credit_score", "loan_amount", "ltv", "dtir1", "dti", "age"]:
            if known.get(k) is not None:
                used.append(f"{k}={known.get(k)}")
        used_txt = ", ".join(used[:4]) if used else "the provided application fields"

        para = (
            f"Based on the provided application data ({used_txt}), the estimated default probability is {p_default:.2f}, "
            f"which falls into the {bucket} risk bucket. The recommended decision is {offer.get('decision')}."
        )
        b1 = f"- Risk bucket: {bucket} (estimated default probability: {p_default:.2f})."
        b2 = f"- Decision: {offer.get('decision')}."
        conds = offer.get("conditions", []) or []
        cond_txt = "None" if not conds else ", ".join(map(str, conds))
        b3 = f"- Conditions: {cond_txt}."
        return "\n".join([para, b1, b2, b3])

    prompt = f"""
You are a loan risk assistant. Write an explanation grounded ONLY in the provided fields.

STRICT RULES (must follow):
- Do NOT mention interest rate, pricing, fees, APR, or costs.
- Do NOT invent policies, thresholds, or extra variables.
- Use ONLY these applicant fields if present: income, credit_score, loan_amount, ltv, dtir1, dti, age.
- Mention at least 2 of the provided applicant fields by name (e.g., credit_score, income).
- Output format MUST be EXACTLY:
  1 short paragraph (2–3 sentences)
  then EXACTLY 3 bullet lines, each starting with "- "

MODEL:
default_probability = {p_default:.2f}
risk_bucket = {bucket}
decision = {offer.get("decision")}
conditions = {offer.get("conditions", [])}

APPLICANT:
{json.dumps({k: known.get(k) for k in ["income","credit_score","loan_amount","ltv","dtir1","dti","age"]}, ensure_ascii=False)}

OUTPUT:
""".strip()

    # Try 1
    text = _clean(llm_generate(prompt, max_new_tokens=260))
    ok = (_count_bullets(text) == 3) and _has_two_fields(text)

    # Try 2
    if not ok:
        text = _clean(llm_generate(
            prompt + "\nFINAL REMINDER: Output must be 1 paragraph + EXACTLY 3 bullet lines starting with '- '. Do not output anything else.",
            max_new_tokens=260
        ))
        ok = (_count_bullets(text) == 3) and _has_two_fields(text)

    # Fallback
    if not ok:
        text = _fallback()

    return text


In [None]:
# ---------- 12) Final structured JSON ----------
def build_final_json(
    p_default: float,
    bucket: str,
    offer: Dict[str, Any],
    known: Dict[str, Any],
    rag_sources: List[str],
    explanation: str,
    missing: List[str]
) -> Dict[str, Any]:
    confidence = _clamp(abs(p_default - 0.5) * 2.0, 0.0, 1.0)

    # ---------- normalize explanation bullets ----------
    raw_lines = [l.strip() for l in (explanation or "").splitlines() if l.strip()]
    norm_lines = []
    for l in raw_lines:
        # normalize common bullet formats into "- "
        if l.startswith("• "):
            l = "- " + l[2:].strip()
        elif l.startswith("-") and not l.startswith("- "):
            l = "- " + l[1:].strip()
        norm_lines.append(l)

    bullets = [l for l in norm_lines if l.startswith("- ")]
    paragraph = " ".join([l for l in norm_lines if not l.startswith("- ")]).strip()

    # Guard 1: paragraph must exist
    if not paragraph:
        paragraph = (
            f"Based on the provided fields, the estimated default probability is {p_default:.2f} "
            f"(risk bucket: {bucket}). The recommended decision is {offer.get('decision')}."
        )

    # Guard 2: bullets must be EXACTLY 3
    conditions = offer.get("conditions", [])
    if conditions is None:
        conditions = []
    if not isinstance(conditions, list):
        conditions = [str(conditions)]

    if len(bullets) != 3:
        cond_text = "No additional conditions." if not conditions else "Conditions: " + ", ".join(map(str, conditions))
        bullets = [
            f"- Risk bucket: {bucket} (estimated default probability: {p_default:.2f}).",
            f"- Decision: {offer.get('decision')}.",
            f"- {cond_text}"
        ]

    # unify dti/dtir1 display
    dtir1_val = known.get("dtir1")
    if dtir1_val is None:
        dtir1_val = known.get("dti")

    # REQ safe
    try:
        req_fields = list(REQ)  # if REQ exists
    except Exception:
        req_fields = ["income", "credit_score", "loan_amount", "ltv", "dtir1", "age"]

    return {
        "Risk_Assessment": {
            "risk_bucket": bucket,
            "default_probability": round(float(p_default), 4),
            "confidence": round(float(confidence), 2),
            "short_summary": f"Estimated default probability is {p_default:.2f} (bucket: {bucket})."
        },
        "Offer": {
            "decision": offer.get("decision"),
            "proposed_terms": {"loan_amount": known.get("loan_amount", None)},
            "conditions": conditions
        },
        "Reasons_Data": [
            f"default_probability={p_default:.2f}",
            f"credit_score={known.get('credit_score')}",
            f"income={known.get('income')}",
            f"loan_amount={known.get('loan_amount')}",
            f"ltv={known.get('ltv')}",
            f"dtir1/dti={dtir1_val}",
            f"age={known.get('age')}",
        ],
        "Evidence": {
            "key_fields_used": {k: known.get(k) for k in req_fields},
            "rag_sources": sorted(set(rag_sources or []))
        },
        "Next_Actions": {
            "missing_fields_needed": missing or [],
            "recommended_verifications_or_documents": conditions
        },
        "Explanation": {
            "paragraph": paragraph,
            "bullets": bullets
        }
    }


In [None]:
from typing import Any, Dict, List
import re

# =========================================================
# 1) Intent routing (FINAL)
# =========================================================

APPLICATION_KEYWORDS = [
    "apply for a loan",
    "loan application",
    "i want to apply",
    "apply loan",
    "get a loan",
    "loan request",
    "i want a loan",
    "need a loan",
    "mortgage application",
]

# loan-domain vocabulary
LOAN_TERMS = {
    "loan", "credit", "ltv", "dti", "dtir", "dtir1", "mortgage", "risk", "default",
}

# concept terms you explicitly support (also allow short one-word queries)
SHORT_CONCEPT_TERMS = {
    "dtir1", "dti", "ltv", "credit score", "loan amount", "income", "age",
}

# numeric field signals for application messages
NUMERIC_SIGNALS = {
    "income", "credit score", "credit_score", "loan amount", "loan_amount",
    "ltv", "dti", "dtir", "dtir1", "age",
}

def _req_safe() -> List[str]:
    try:
        return list(REQ)  # type: ignore[name-defined]
    except Exception:
        return ["income", "credit_score", "loan_amount", "ltv", "dtir1", "age"]

def _norm_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip().lower())

def is_loan_domain(text: str) -> bool:
    t = _norm_spaces(text)
    return any(term in t for term in LOAN_TERMS) or any(term in t for term in SHORT_CONCEPT_TERMS)

def is_concept_question(text: str) -> bool:
    """
    Concept if:
      - user asks definition/explanation/meaning, AND loan-domain
      - OR user sends a short concept token alone (e.g., "dtir1", "ltv")
    """
    t = _norm_spaces(text)
    if not t:
        return False

    # direct short token query
    if t in SHORT_CONCEPT_TERMS:
        return True

    definition_prompt = any(t.startswith(x) for x in ("what is", "what does", "define", "explain", "meaning of"))
    questionish = ("?" in t)

    # If it's definition-style AND loan domain -> concept
    if (definition_prompt or questionish) and is_loan_domain(t) and any(tok in t for tok in SHORT_CONCEPT_TERMS.union(LOAN_TERMS)):
        return True

    # "explain dtir1" / "define ltv" variants
    if any(x in t for x in ("define", "explain", "meaning")) and is_loan_domain(t):
        return True

    return False

def looks_like_application(text: str) -> bool:
    """
    Application if:
      - explicit application keywords
      - OR message contains any numeric field words (income/ltv/...) even without numbers
      - OR message contains 2+ numbers (real user behavior)
      - OR regex extraction finds at least one field value
    """
    t = _norm_spaces(text)
    if not t:
        return False

    if any(k in t for k in APPLICATION_KEYWORDS):
        return True

    # 2+ numbers usually means they are providing parameters
    nums = re.findall(r"\d+(?:\.\d+)?", t)
    if len(nums) >= 2:
        return True

    # if they mention any required fields, treat as application
    req = _req_safe()
    if any(f.lower() in t for f in req):
        return True

    # mention of numeric signals is enough to treat as application conversation
    if any(s in t for s in NUMERIC_SIGNALS):
        return True

    # last safety: if your extractor can pull something, it's application
    try:
        extracted = extract_fields_regex(text)  # type: ignore[name-defined]
        if any(v is not None for v in extracted.values()):
            return True
    except Exception:
        pass

    return False

def route_intent(text: str) -> str:
    t = _norm_spaces(text)

    # concept wins first (but only loan-domain concepts)
    if is_concept_question(t):
        return "concept"

    # application second
    if looks_like_application(t):
        return "application"

    return "out_of_scope"


# =========================================================
# 2) Agent step (FINAL)
# =========================================================

def missing_fields(known: Dict[str, Any]) -> List[str]:
    req = _req_safe()
    return [f for f in req if known.get(f) is None]

def _missing_question(state: Dict[str, Any], miss: List[str]) -> str:
    cur = tuple(miss)
    if state.get("last_missing") == cur:
        q = "I still need the same missing fields. Please provide them in one message."
    else:
        q = "Please provide: " + ", ".join(miss)
    state["last_missing"] = cur
    return q

def _run_assessment_with_saved_fields(state: Dict[str, Any]) -> Dict[str, Any]:
    known = state["known_fields"]

    p_default = predict_default_probability(known)  # type: ignore[name-defined]
    bucket = map_prob_to_bucket(p_default)          # type: ignore[name-defined]
    offer = offer_engine(bucket)                   # type: ignore[name-defined]

    policy_docs = rag_fetch("loan risk offer policy schema conditions catalog", k=4)  # type: ignore[name-defined]
    rag_sources = [d.metadata.get("source", "unknown") for d in policy_docs]

    expl = explanation_with_llm(p_default, bucket, offer, known, rag_sources)  # type: ignore[name-defined]

    final_out = build_final_json(  # type: ignore[name-defined]
        p_default=p_default,
        bucket=bucket,
        offer=offer,
        known=known,
        rag_sources=rag_sources,
        explanation=expl,
        missing=[]
    )
    return {"type": "final_answer", "output": final_out}

def agent_step(user_text: str, state: Dict[str, Any]) -> Dict[str, Any]:
    state.setdefault("known_fields", {})
    state.setdefault("history", [])
    state.setdefault("last_missing", None)

    user_text = (user_text or "").strip()

    # HARD OVERRIDE
    if user_text == "__RUN_ASSESSMENT__":
        miss = missing_fields(state["known_fields"])
        if miss:
            q = _missing_question(state, miss)
            state["history"].append({"role": "user", "text": "__RUN_ASSESSMENT__"})
            state["history"].append({"role": "assistant", "text": q})
            return {"type": "missing_fields", "missing": miss, "question": q, "known_fields": dict(state["known_fields"])}

        out = _run_assessment_with_saved_fields(state)
        state["history"].append({"role": "user", "text": "__RUN_ASSESSMENT__"})
        state["history"].append({"role": "assistant", "text": "FINAL"})
        return out

    # log user
    state["history"].append({"role": "user", "text": user_text})

    intent = route_intent(user_text)

    # CONCEPT
    if intent == "concept":
        out = concept_answer(user_text, k=4)  # type: ignore[name-defined]
        state["history"].append({"role": "assistant", "text": out.get("answer", "")})
        return out

    # OUT OF SCOPE
    if intent == "out_of_scope":
        msg = "I can help with loan risk concepts (e.g., LTV/DTI/DTIR1/credit score) or a loan application assessment. Please ask a loan-related question."
        state["history"].append({"role": "assistant", "text": msg})
        return {"type": "out_of_scope", "answer": msg, "sources": []}

    # APPLICATION
    extracted = extract_fields_regex(user_text)  # type: ignore[name-defined]
    state["known_fields"].update({k: v for k, v in extracted.items() if v is not None})

    miss = missing_fields(state["known_fields"])
    if miss:
        q = _missing_question(state, miss)
        state["history"].append({"role": "assistant", "text": q})
        return {"type": "missing_fields", "missing": miss, "question": q, "known_fields": dict(state["known_fields"])}

    out = _run_assessment_with_saved_fields(state)
    state["history"].append({"role": "assistant", "text": "FINAL"})
    return out


In [None]:
!pip install -q gradio


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.7/87.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.4/462.4 kB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[0m

In [None]:
import inspect
print(inspect.signature(build_final_json))


(p_default: float, bucket: str, offer: Dict[str, Any], known: Dict[str, Any], rag_sources: List[str], explanation: str, missing: List[str]) -> Dict[str, Any]


In [None]:
# =========================================================
# 13) Chat UI layer (Gradio) — "the rest"
# - Keeps state across messages (does NOT forget values)
# - After each message: asks ONLY remaining fields
# - "run assessment" triggers __RUN_ASSESSMENT__ sentinel
# - Concept questions are answered from concepts.json (via concept_answer)
# - Out-of-scope questions return a clean message (no field loop)
# =========================================================

import gradio as gr
import json
import re
from typing import Any, Dict, List

# -----------------------------
# A) Small utilities (chat control + rendering)
# -----------------------------
EXIT_WORDS = {"exit", "quit", "q", "stop", "end"}
SMALLTALK_PATTERNS = [
    r"\b(thanks|thank you|thx|ty|merci|tnx|no thanks)\b",
    r"\b(bye|goodbye|see you|cya)\b",
    r"\b(hi|hello|hey|salam|سلام)\b",
]

def normalize_command(text: str) -> str:
    """Normalize common typos/variants so commands don't fall into the wrong branch."""
    t = (text or "").strip().lower()
    t = t.replace("asseement", "assessment")
    t = t.replace("assesment", "assessment")
    t = t.replace("assessement", "assessment")
    t = re.sub(r"\s+", " ", t).strip()
    return t

def is_exit(text_norm: str) -> bool:
    return (text_norm or "").strip().lower() in EXIT_WORDS

def is_smalltalk(text: str) -> bool:
    t = (text or "").strip().lower()
    return any(re.search(p, t) for p in SMALLTALK_PATTERNS)

def is_run_request(text_norm: str) -> bool:
    t = (text_norm or "").strip().lower()
    # explicit commands
    if t in {"run", "assess", "assessment", "rerun", "re-run", "run assessment", "run the assessment"}:
        return True
    # loose match
    return ("run" in t and "assessment" in t) or (t.startswith("run ") and "assess" in t)

def safe_string(x) -> str:
    if x is None:
        return ""
    if isinstance(x, str):
        return x
    try:
        return json.dumps(x, ensure_ascii=False)
    except Exception:
        return str(x)

def render_final_output_as_sentences(output_dict: dict) -> str:
    """
    Renders the final JSON into:
    - explanation paragraph
    - summary line
    - collapsible technical JSON
    """
    ra = output_dict.get("Risk_Assessment", {}) or {}
    off = output_dict.get("Offer", {}) or {}
    expl = output_dict.get("Explanation", {}) or {}

    paragraph = (expl.get("paragraph", "") or "").strip()
    bucket = ra.get("risk_bucket", "Unknown")
    p = ra.get("default_probability", None)
    p_txt = f"{p:.2f}" if isinstance(p, (int, float)) else "N/A"
    decision = off.get("decision", "Unknown")

    if not paragraph:
        paragraph = (
            f"Based on the provided information, the estimated default probability is about {p_txt} "
            f"and the risk bucket is {bucket}. Recommended decision: {decision}."
        )

    summary = f"Summary: decision={decision} | risk_bucket={bucket} | default_probability={p_txt}"

    details = (
        "<details><summary>Show technical JSON</summary>\n\n"
        f"```json\n{json.dumps(output_dict, indent=2, ensure_ascii=False)}\n```\n"
        "</details>"
    )
    return f"{paragraph}\n\n{summary}\n\n{details}"

def smalltalk_reply(text: str, agent_state: dict) -> str:
    t = (text or "").strip().lower()

    # Keep it polite, do not corrupt state
    if any(w in t for w in ["thanks", "thank", "thx", "ty", "merci", "tnx", "no thanks"]):
        return "You're welcome. You can continue the loan application or type 'run assessment' when ready."

    if any(w in t for w in ["bye", "goodbye", "cya", "see you"]):
        return "Goodbye. You can continue later; your current session state remains unless you type 'exit' or press Reset."

    return "Hi. Ask a loan concept definition (e.g., DTIR1, LTV) or start an application (e.g., 'I want to apply for a loan')."


# -----------------------------
# B) Gradio chat function (stateful per session)
# IMPORTANT: requires your agent_step() to be defined earlier (the code you already have)
# -----------------------------
def gradio_chat(user_message, chat_history, agent_state):
    raw = (user_message or "").strip()
    chat_history = chat_history or []

    # Ensure state structure
    agent_state = agent_state or {"known_fields": {}, "history": [], "last_missing": None}
    agent_state.setdefault("known_fields", {})
    agent_state.setdefault("history", [])
    agent_state.setdefault("last_missing", None)

    if not raw:
        return "", chat_history, agent_state

    user_norm = normalize_command(raw)

    # Exit -> clears state
    if is_exit(user_norm):
        agent_state = {"known_fields": {}, "history": [], "last_missing": None}
        chat_history.append({"role": "user", "content": raw})
        chat_history.append({"role": "assistant", "content": "Session ended. Cleared saved application info. Type 'hi' to start again."})
        return "", chat_history, agent_state

    # Small talk -> polite answer, no state change
    if is_smalltalk(raw):
        assistant_msg = smalltalk_reply(raw, agent_state)
        chat_history.append({"role": "user", "content": raw})
        chat_history.append({"role": "assistant", "content": assistant_msg})
        return "", chat_history, agent_state

    # Run assessment command -> sentinel (your agent_step handles it)
    try:
        if is_run_request(user_norm):
            out = agent_step("__RUN_ASSESSMENT__", agent_state)
        else:
            out = agent_step(raw, agent_state)

        out_type = (out.get("type", "") or "").strip()

        if out_type == "missing_fields":
            assistant_msg = safe_string(out.get("question", "Please provide the missing fields."))

        elif out_type == "rag_answer":
            ans = safe_string(out.get("answer", ""))
            sources = out.get("sources", [])
            if isinstance(sources, list) and sources:
                assistant_msg = f"{ans}\n\nSources: {', '.join(map(str, sources))}"
            else:
                assistant_msg = ans

        elif out_type == "final_answer":
            output_dict = out.get("output", {}) or {}
            assistant_msg = render_final_output_as_sentences(output_dict)

        elif out_type == "out_of_scope":
            assistant_msg = safe_string(out.get("answer", "Out of scope."))

        else:
            assistant_msg = safe_string(out)

    except Exception as e:
        assistant_msg = f"Internal error: {e}"

    chat_history.append({"role": "user", "content": raw})
    chat_history.append({"role": "assistant", "content": assistant_msg})
    return "", chat_history, agent_state

def reset_chat():
    return "", [], {"known_fields": {}, "history": [], "last_missing": None}


# -----------------------------
# C) UI
# -----------------------------
with gr.Blocks(title="Loan Risk Agent Demo") as demo:
    gr.Markdown(
        """
# Loan Risk Assistant (Demo)

**You can:**
- Ask **definitions**: “What does DTIR1 mean?” “Define LTV”
- Apply for a loan **step-by-step**: “I want to apply”, then send income, score, etc.
- Or provide many fields in one message.
- Type **run assessment** to force evaluation using saved fields.
- Type **exit** to clear the session.
"""
    )

    chatbot = gr.Chatbot(height=420, type="messages")
    msg = gr.Textbox(placeholder="Type your message here...", show_label=False)

    state = gr.State({"known_fields": {}, "history": [], "last_missing": None})

    with gr.Row():
        send = gr.Button("Send")
        reset = gr.Button("Reset")

    send.click(gradio_chat, inputs=[msg, chatbot, state], outputs=[msg, chatbot, state])
    msg.submit(gradio_chat, inputs=[msg, chatbot, state], outputs=[msg, chatbot, state])
    reset.click(reset_chat, inputs=[], outputs=[msg, chatbot, state])

demo.launch(share=True, debug=True)


  chatbot = gr.Chatbot(height=420, type="messages")


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1f123f9b335ac02323.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1f123f9b335ac02323.gradio.live




In [56]:
!jupyter nbconvert Final_project_with_lang_chain.ipynb \
  --to notebook \
  --ClearMetadataPreprocessor.enabled=True \
  --ClearOutputPreprocessor.enabled=True \
  --output Final_project_with_lang_chain_clean.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr