In [3]:
import os, io, json, math, re, hashlib
from pathlib import Path
from sentence_transformers import SentenceTransformer
import pdfplumber
from tqdm import tqdm
import requests
import faiss
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np # Import numpy
nltk.download('punkt')
nltk.download('punkt_tab')

DATA_DIR = Path("data")
PDF_DIR = DATA_DIR / "documents"
PDF_DIR.mkdir(parents=True, exist_ok=True)
META_FILE = DATA_DIR / "metadata.jsonl"
INDEX_FILE = DATA_DIR / "faiss_index.bin"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# --- Configure your papers list here ---
# For each item: either provide 'url' (direct link to PDF) OR set 'upload': True and
# upload the PDF file when prompted in Colab.
PAPERS = [
    {"id":"arraiz_2015", "title":"Psychometrics as a Tool to Improve Screening and Access to Credit", "url":"https://publications.iadb.org/publications/english/document/Psychometrics-as-a-Tool-to-Improve-Screening-and-Access-to-Credit.pdf"},
    {"id":"alibhai_2023", "title":"Evening the Credit Score - World Bank", "url":"https://documents1.worldbank.org/curated/en/099440203162337439/pdf/IDU046d2582b04f4b047d6086a408f375dfc12ae.pdf"},
    {"id":"fine_2024", "title":"Psychometric-Based Credit Scoring for Underbanked", "url":"https://www.mdpi.com/1911-8074/17/9/423"},
    {"id":"bjorkegren_2018", "title":"Behavior Revealed in Mobile Phone Usage Predicts Loan Repayment", "url":"https://dan.bjorkegren.com/danbjork_grissen_creditscoring.pdf"},
    {"id":"mobile_usage", "title":"Mobile Phone Usage Data for Credit Scoring (working paper)", "url":"https://arxiv.org/abs/1712.05840"},
    {"id":"telecom_airtime_2024", "title":"Telecom Airtime Credit Risk Prediction", "url":"https://www.engineeringpaper.net/archives/2024/vol6issue2/PartA/6-2-4-193.pdf"},
    {"id":"socialcapital_cassar", "title":"The Effect of Social Capital on Group Loan Repayment (Cassar)", "url":"https://www.findevgateway.org/sites/default/files/publications/files/mfg-en-paper-the-effect-of-social-capital-on-group-loan-repayment-evidence-from-artefactual-field-experiments-jun-2005.pdf"},
    {"id":"rezaei_2021", "title":"The Structure of Borrower Social Network Matters (Rezaei)", "url":"https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4597857"},
    {"id":"postelnicu_2019", "title":"External Social Ties and Loan Repayment (Postelnicu)", "url":"https://www.tandfonline.com/doi/full/10.1080/00220388.2018.1464148"},
    {"id":"zhu_2023", "title":"Explainable prediction of loan default (Zhu et al.)", "url":"https://www.sciengine.com/doi/10.1016/j.dsm.2023.04.003"},
    {"id":"zhang_2025", "title":"Data-Driven Loan Default Prediction (Zhang et al.)", "url":"https://www.mdpi.com/2079-8954/13/7/581"},
    {"id":"iitm_mfi_2022", "title":"Loan Default Prediction on Indian MFI Dataset (IITM thesis)", "url":"https://eescholars.iitm.ac.in/sites/default/files/eethesis/ee17b035.pdf"}
]

# Helper: download a url to file (if direct pdf)
def download_pdf(url, save_path):
    try:
        r = requests.get(url, timeout=30, stream=True)
        content_type = r.headers.get('content-type','')
        if r.status_code == 200 and ('pdf' in content_type or url.lower().endswith('.pdf')):
            with open(save_path, 'wb') as f:
                for chunk in r.iter_content(1024*16):
                    f.write(chunk)
            return True, None
        # handle html (some MDPI/journal pages require manual download or html->pdf)
        return False, f"not-a-direct-pdf ({content_type})"
    except Exception as e:
        return False, str(e)

# 1) Download / Ask for manual upload when required
uploaded_files = {}
for p in PAPERS:
    pid = p['id']
    out_path = PDF_DIR / f"{pid}.pdf"
    print(f"Processing [{pid}] {p['title']}")
    if 'url' in p and p['url']:
        ok, err = download_pdf(p['url'], out_path)
        if not ok:
            print(f"  -> could not download directly: {err}. Marking for manual upload.")
            p['needs_upload'] = True
        else:
            p['local_path'] = str(out_path)
            print("  -> downloaded.")
    else:
        p['needs_upload'] = True

# If any need upload, prompt user (Colab files upload)
to_upload = [p for p in PAPERS if p.get('needs_upload')]
if to_upload:
    print("\nSome PDFs could not be downloaded automatically. Please upload them now via Colab file upload widget.")
    from google.colab import files
    uploaded = files.upload()  # user selects files
    for fn in uploaded.keys():
        # map uploaded filename to a paper if names match, else just save
        for p in to_upload:
            expected = f"{p['id']}.pdf"
            if fn == expected:
                dst = PDF_DIR / expected
                with open(dst,'wb') as f:
                    f.write(uploaded[fn])
                p['local_path'] = str(dst)
                p.pop('needs_upload', None)
                print(f"Uploaded and mapped {fn} -> {p['id']}")
    # Any still unmapped remain flagged

# 2) Extract text from PDFs
def extract_text_from_pdf(path):
    text = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                ptext = page.extract_text()
                if ptext:
                    # basic cleaning: remove excessive newlines
                    ptext = re.sub(r'\n{2,}', '\n', ptext)
                    text.append(ptext)
    except Exception as e:
        print("ERROR extracting", path, e)
    return "\n".join(text)

documents = []  # each doc: {'paper':..., 'text':..., 'path':...}
for p in PAPERS:
    lp = p.get('local_path')
    if lp and os.path.exists(lp):
        txt = extract_text_from_pdf(lp)
        documents.append({"id": p['id'], "title": p['title'], "path": lp, "text": txt})
        print(f"Extracted text from {p['id']} (chars={len(txt)})")
    else:
        print(f"Skipping extraction for {p['id']}: no local file found")

# 3) Chunking (sentence-based, ~400 tokens target)
def chunk_text(text, target_words=450, overlap_words=80):
    # naive chunker using sentences
    sents = sent_tokenize(text)
    chunks = []
    cur = []
    cur_words = 0
    for sent in sents:
        wcount = len(word_tokenize(sent))
        if cur_words + wcount > target_words and cur:
            chunks.append(" ".join(cur))
            # overlap: keep last overlap_words worth of tokens as new base
            if overlap_words > 0:
                tail = " ".join(word_tokenize(" ".join(cur))[-overlap_words:])
                cur = [tail]
                cur_words = len(word_tokenize(tail))
            else:
                cur = []
                cur_words = 0
        cur.append(sent)
        cur_words += wcount
    if cur:
        chunks.append(" ".join(cur))
    return chunks

all_chunks = []
for doc in documents:
    chunks = chunk_text(doc['text'], target_words=450, overlap_words=90)
    for i, c in enumerate(chunks):
        chunk_id = hashlib.sha1((doc['id'] + "_" + str(i)).encode()).hexdigest()[:12]
        meta = {
            "chunk_id": chunk_id,
            "paper_id": doc['id'],
            "paper_title": doc['title'],
            "chunk_index": i,
            "text": c[:2000]  # keep up to 2k chars in memory export for metadata file; full text stored in files if desired
        }
        all_chunks.append(meta)
    print(f"Doc {doc['id']} -> {len(chunks)} chunks")

# 4) Embeddings
model = SentenceTransformer(EMB_MODEL_NAME)
batch_size = 64
embeddings = []
for i in tqdm(range(0, len(all_chunks), batch_size), desc="Embedding batches"):
    batch_texts = [c['text'] for c in all_chunks[i:i+batch_size]]
    batch_emb = model.encode(batch_texts, show_progress_bar=False, convert_to_numpy=True)
    embeddings.append(batch_emb)
embeddings = np.vstack(embeddings)

# 5) Build FAISS index & persist metadata
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product (use cosine after normalization)
# normalize vectors for cosine similarity
faiss.normalize_L2(embeddings)
index.add(embeddings)
faiss.write_index(index, str(INDEX_FILE))

# Write metadata.jsonl (full metadata per chunk)
with open(META_FILE, "w", encoding="utf-8") as mf:
    for c in all_chunks:
        json.dump(c, mf, ensure_ascii=False)
        mf.write("\n")

print("Saved metadata:", META_FILE)
print("Saved FAISS index:", INDEX_FILE)

# 6) Basic retrieval test (sample queries)
sample_queries = [
    "Which mobile phone features predict default?",
    "How psychometric tests predict repayment behavior",
    "Group lending peer monitoring and repayment"
]
from sentence_transformers import util
retrieval_report = {}
for q in sample_queries:
    q_emb = model.encode(q, convert_to_numpy=True)
    faiss.normalize_L2(q_emb.reshape(1,-1))
    D, I = index.search(q_emb.reshape(1,-1), k=5)
    hits = []
    for idx in I[0]:
        if idx < len(all_chunks):
            hits.append({
                "paper_id": all_chunks[idx]['paper_id'],
                "paper_title": all_chunks[idx]['paper_title'],
                "chunk_index": all_chunks[idx]['chunk_index'],
                "text_snippet": all_chunks[idx]['text'][:500]
            })
    retrieval_report[q] = hits

with open(DATA_DIR / "retrieval_report.json", "w", encoding="utf-8") as rf:
    json.dump(retrieval_report, rf, ensure_ascii=False, indent=2)

print("Saved retrieval report:", DATA_DIR / "retrieval_report.json")
# ---------- END PIPELINE ----------


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Processing [arraiz_2015] Psychometrics as a Tool to Improve Screening and Access to Credit
  -> could not download directly: not-a-direct-pdf (text/html; charset=UTF-8). Marking for manual upload.
Processing [alibhai_2023] Evening the Credit Score - World Bank
  -> downloaded.
Processing [fine_2024] Psychometric-Based Credit Scoring for Underbanked
  -> could not download directly: not-a-direct-pdf (text/html). Marking for manual upload.
Processing [bjorkegren_2018] Behavior Revealed in Mobile Phone Usage Predicts Loan Repayment
  -> downloaded.
Processing [mobile_usage] Mobile Phone Usage Data for Credit Scoring (working paper)
  -> could not download directly: not-a-direct-pdf (text/html; charset=utf-8). Marking for manual upload.
Processing [telecom_airtime_2024] Telecom Airtime Credit Risk Prediction
  -> downloaded.
Processing [socialcapital_cassar] The Effect of Social Capital on Group Loan Repayment (Cassar)
  -> downloaded.
Processing [rezaei_2021] The Structure of Borrower Soc

Skipping extraction for arraiz_2015: no local file found
Extracted text from alibhai_2023 (chars=14266)
Skipping extraction for fine_2024: no local file found
Extracted text from bjorkegren_2018 (chars=116572)
Skipping extraction for mobile_usage: no local file found
Extracted text from telecom_airtime_2024 (chars=38415)
Extracted text from socialcapital_cassar (chars=54350)
Skipping extraction for rezaei_2021: no local file found
Skipping extraction for postelnicu_2019: no local file found
Skipping extraction for zhu_2023: no local file found
Skipping extraction for zhang_2025: no local file found
Extracted text from iitm_mfi_2022 (chars=49635)
Doc alibhai_2023 -> 7 chunks
Doc bjorkegren_2018 -> 43 chunks
Doc telecom_airtime_2024 -> 18 chunks
Doc socialcapital_cassar -> 33 chunks
Doc iitm_mfi_2022 -> 31 chunks


Embedding batches: 100%|██████████| 3/3 [00:18<00:00,  6.09s/it]

Saved metadata: data/metadata.jsonl
Saved FAISS index: data/faiss_index.bin
Saved retrieval report: data/retrieval_report.json





In [5]:
# ==============================================================
# STEP B – Full Unconventional User Data Schema & Feature Engineering
# SINGLE COLAB CELL VERSION (paste and run)
# ==============================================================

import json, math, random
from datetime import datetime, timedelta
from collections import Counter
from pathlib import Path
from typing import Dict, Any, List
import statistics as stats

# ---------------------------------------------------------------------
# 1. JSON SCHEMA (REFERENCE)
# ---------------------------------------------------------------------

SCHEMA = {
    "user_id": "string",
    "timestamp": "ISO8601 datetime",

    "demographics": {
        "age": "int",
        "gender": "string",
        "marital_status": "string",
        "education_level": "string",
        "occupation": "string",
        "monthly_income": "float",
        "income_source_stability": "float 0-1"
    },

    "mobile_metadata": {
        "avg_daily_calls": "float",
        "avg_daily_sms": "float",
        "unique_contacts_30d": "int",
        "call_diversity_index": "float",
        "night_activity_ratio": "float",
        "airtime_topup_frequency": "float",
        "avg_topup_amount": "float",
        "data_usage_variance": "float",
        "mobility_radius_km": "float",
        "days_inactive_last_30": "int"
    },

    "psychometrics": {
        "conscientiousness_score": "float",
        "honesty_humility_score": "float",
        "risk_aversion_score": "float",
        "impulse_control_score": "float",
        "numerical_ability_score": "float",
        "business_aptitude_score": "float",
        "confidence_bias_score": "float"
    },

    "financial_behavior": {
        "savings_frequency": "float",
        "savings_amount_variance": "float",
        "bill_payment_timeliness": "float",
        "wallet_balance_lows_last_90d": "int",
        "credit_utilization_ratio": "float",
        "purchase_spread_index": "float"
    },

    "social_network": {
        "shg_membership": "bool",
        "peer_monitoring_strength": "float",
        "community_reputation_score": "float",
        "dependents_in_household": "int",
        "consistency_in_group_meetings": "float"
    },

    "loan_history": {
        "previous_loans": "int",
        "previous_defaults": "int",
        "previous_late_payments": "int",
        "avg_repayment_delay_days": "float"
    }
}

# ---------------------------------------------------------------------
# 2. UTILITY HELPERS
# ---------------------------------------------------------------------

def iso_now():
    return datetime.utcnow().isoformat() + "Z"

def validate_profile(profile: Dict[str, Any]):
    """Validate required fields + numeric ranges."""
    errors = []

    required = [
        "user_id", "timestamp", "demographics", "mobile_metadata",
        "psychometrics", "financial_behavior", "social_network", "loan_history"
    ]

    for r in required:
        if r not in profile:
            errors.append(f"Missing field: {r}")

    # demographic checks
    if "demographics" in profile:
        age = profile["demographics"].get("age")
        if age is not None and not (18 <= age <= 100):
            errors.append("Age must be 18–100.")
        income = profile["demographics"].get("monthly_income")
        if income is not None and income < 0:
            errors.append("Income must be >= 0.")

    # psychometric checks
    if "psychometrics" in profile:
        for k, v in profile["psychometrics"].items():
            if v is not None and not (0 <= v <= 1):
                errors.append(f"Psychometric {k} must be 0–1")

    return errors

# ---------------------------------------------------------------------
# 3. MOBILE METADATA FEATURE ENGINEERING
# ---------------------------------------------------------------------

def shannon_diversity(counter):
    total = sum(counter.values())
    if total == 0:
        return 0
    entropy = 0
    for c in counter.values():
        p = c / total
        entropy -= p * math.log(p + 1e-12)
    max_entropy = math.log(len(counter)) if len(counter) else 1
    return entropy / max_entropy if max_entropy else 0

def compute_mobile_features(call_logs, sms_logs, topups, data_usage, as_of=None):
    if as_of is None:
        as_of = datetime.utcnow()

    def last_n_days(dt, n):
        return dt >= as_of - timedelta(days=n)

    # Parse calls
    calls = []
    for c in call_logs:
        try:
            t = datetime.fromisoformat(c["timestamp"].replace("Z",""))
            calls.append({"t": t, "contact": c.get("contact_id")})
        except:
            pass

    sms = []
    for s in sms_logs:
        try:
            t = datetime.fromisoformat(s["timestamp"].replace("Z",""))
            sms.append({"t": t, "contact": s.get("contact_id")})
        except:
            pass

    calls30 = [c for c in calls if last_n_days(c["t"], 30)]
    sms30   = [s for s in sms if last_n_days(s["t"], 30)]

    # core mobile features
    avg_daily_calls = len(calls30) / 30
    avg_daily_sms   = len(sms30) / 30

    unique_contacts = len(
        set(c["contact"] for c in calls30 if c["contact"]) |
        set(s["contact"] for s in sms30 if s["contact"])
    )

    diversity = shannon_diversity(
        Counter(c["contact"] for c in calls30 if c["contact"])
    )

    night_calls = [c for c in calls30 if 0 <= c["t"].hour < 6]
    night_ratio = len(night_calls) / len(calls30) if calls30 else 0

    # topups
    parsed_topups = []
    for t in topups:
        try:
            dt = datetime.fromisoformat(t["timestamp"].replace("Z",""))
            parsed_topups.append({"t": dt, "amount": float(t.get("amount",0))})
        except:
            pass

    topups30 = [t for t in parsed_topups if last_n_days(t["t"], 30)]

    topup_freq = len(topups30) / 30
    avg_topup_amount = (
        sum(t["amount"] for t in topups30) / len(topups30)
        if topups30 else 0
    )

    # data usage variance
    parsed_usage = []
    for u in data_usage:
        try:
            dt = datetime.fromisoformat(u["date"].replace("Z",""))
            if last_n_days(dt, 30):
                parsed_usage.append(u.get("mb_used", 0))
        except:
            pass

    data_var = stats.pvariance(parsed_usage) if len(parsed_usage) > 1 else 0

    # inactivity
    active_days = set(
        [c["t"].date() for c in calls30] +
        [s["t"].date() for s in sms30] +
        [t["t"].date() for t in topups30]
    )
    inactive_days = 30 - len(active_days)

    return {
        "avg_daily_calls": round(avg_daily_calls,4),
        "avg_daily_sms": round(avg_daily_sms,4),
        "unique_contacts_30d": unique_contacts,
        "call_diversity_index": round(diversity,4),
        "night_activity_ratio": round(night_ratio,4),
        "airtime_topup_frequency": round(topup_freq,4),
        "avg_topup_amount": round(avg_topup_amount,2),
        "data_usage_variance": round(data_var,4),
        "mobility_radius_km": 0.0,
        "days_inactive_last_30": inactive_days
    }

# ---------------------------------------------------------------------
# 4. PSYCHOMETRIC SCORING
# ---------------------------------------------------------------------

def psychometric_scoring(responses: Dict[str, int]):
    groups = {
        "conscientiousness":   [k for k in responses if k.startswith("C_")],
        "impulse_control":     [k for k in responses if k.startswith("I_")],
        "honesty_humility":    [k for k in responses if k.startswith("H_")],
        "risk_aversion":       [k for k in responses if k.startswith("R_")],
        "numerical_ability":   [k for k in responses if k.startswith("N_")],
        "business_aptitude":   [k for k in responses if k.startswith("B_")],
        "confidence_bias":     [k for k in responses if k.startswith("CB_")]
    }

    def scale(vals):
        if not vals:
            return None
        return round((sum(vals)/len(vals) - 1) / 4, 4)

    return {
        "conscientiousness_score": scale([responses[k] for k in groups["conscientiousness"]]),
        "honesty_humility_score":  scale([responses[k] for k in groups["honesty_humility"]]),
        "risk_aversion_score":     scale([responses[k] for k in groups["risk_aversion"]]),
        "impulse_control_score":   scale([responses[k] for k in groups["impulse_control"]]),
        "numerical_ability_score": scale([responses[k] for k in groups["numerical_ability"]]),
        "business_aptitude_score": scale([responses[k] for k in groups["business_aptitude"]]),
        "confidence_bias_score":   scale([responses[k] for k in groups["confidence_bias"]]),
    }

# ---------------------------------------------------------------------
# 5. FINANCIAL BEHAVIOR FEATURES
# ---------------------------------------------------------------------

def engineer_financial_behavior(savings_events, bill_payments, wallet_balance_ts):
    now = datetime.utcnow()

    sav_90 = [
        s for s in savings_events
        if datetime.fromisoformat(s["timestamp"].replace("Z","")) >= now - timedelta(days=90)
    ]
    savings_freq = len(sav_90)/3
    amounts = [s["amount"] for s in sav_90]
    savings_var = stats.pvariance(amounts) if len(amounts)>1 else 0

    bills = [
        b for b in bill_payments
        if datetime.fromisoformat(b["timestamp"].replace("Z","")) >= now - timedelta(days=365)
    ]
    on_time = sum(1 for b in bills if b.get("on_time")) / len(bills) if bills else 0

    lows = 0
    for w in wallet_balance_ts:
        dt = datetime.fromisoformat(w["date"])
        if dt >= now - timedelta(days=90) and w["balance"] < 100:
            lows += 1

    return {
        "savings_frequency": round(savings_freq,4),
        "savings_amount_variance": round(savings_var,4),
        "bill_payment_timeliness": round(on_time,4),
        "wallet_balance_lows_last_90d": lows,
        "credit_utilization_ratio": 0.0,
        "purchase_spread_index": 0.0
    }

# ---------------------------------------------------------------------
# 6. ASSEMBLE FINAL PROFILE
# ---------------------------------------------------------------------

def assemble_profile(
    user_id,
    demographics,
    call_logs,
    sms_logs,
    topups,
    data_usage,
    psych_responses,
    savings_events,
    bill_payments,
    wallet_balance_ts,
    social_info,
    loan_history
):
    return {
        "user_id": user_id,
        "timestamp": iso_now(),
        "demographics": demographics,
        "mobile_metadata": compute_mobile_features(call_logs, sms_logs, topups, data_usage),
        "psychometrics": psychometric_scoring(psych_responses),
        "financial_behavior": engineer_financial_behavior(savings_events, bill_payments, wallet_balance_ts),
        "social_network": social_info,
        "loan_history": loan_history
    }

# ---------------------------------------------------------------------
# 7. SYNTHETIC TEST GENERATOR
# ---------------------------------------------------------------------

def rand_poisson(lam):
    L = math.exp(-lam)
    p = 1.0
    k = 0
    while p > L:
        p *= random.random()
        k += 1
    return k - 1

def generate_synthetic_profile():
    now = datetime.utcnow()
    calls, sms, topups, data_usage = [], [], [], []

    for d in range(60):
        day = now - timedelta(days=d)
        data_usage.append({"date": day.date().isoformat(), "mb_used": max(0, random.gauss(50,20))})

        # calls
        for _ in range(rand_poisson(3)):
            t = datetime(day.year, day.month, day.day, random.randint(0,23), random.randint(0,59))
            calls.append({"timestamp": t.isoformat()+"Z", "contact_id": f"c{random.randint(1,30)}"})

        # sms
        for _ in range(max(0, int(random.gauss(1,1)))):
            t = datetime(day.year, day.month, day.day, random.randint(0,23), random.randint(0,59))
            sms.append({"timestamp": t.isoformat()+"Z", "contact_id": f"c{random.randint(1,35)}"})

        # topups
        if random.random() < 0.2:
            t = datetime(day.year, day.month, day.day, 12, random.randint(0,59))
            topups.append({"timestamp": t.isoformat()+"Z", "amount": round(random.gauss(50,20),2)})

    psych = {
        "C_q1":4,"C_q2":3,"C_q3":4,
        "I_q1":3,"I_q2":2,"I_q3":3,
        "H_q1":4,"H_q2":4,
        "R_q1":2,"R_q2":3,
        "N_q1":3,"N_q2":2,
        "B_q1":3,"B_q2":3,
        "CB_q1":4
    }

    savings_events = [
        {"timestamp": (now-timedelta(days=random.randint(1,90))).isoformat()+"Z", "amount": random.randint(100,300)}
        for _ in range(6)
    ]

    bill_payments = [
        {"timestamp": (now-timedelta(days=random.randint(1,365))).isoformat()+"Z",
         "amount": random.randint(300,600), "on_time": random.random()>0.2}
        for _ in range(8)
    ]

    wallet = [
        {"date": (now-timedelta(days=i)).date().isoformat(), "balance": max(0, random.gauss(500,250))}
        for i in range(120)
    ]

    social = {
        "shg_membership": True,
        "peer_monitoring_strength": 0.7,
        "community_reputation_score": 0.6,
        "dependents_in_household": 3,
        "consistency_in_group_meetings": 0.8
    }

    loan_history = {
        "previous_loans":2,
        "previous_defaults":0,
        "previous_late_payments":1,
        "avg_repayment_delay_days":3.5
    }

    return assemble_profile(
        "demo_user_001",
        demographics={
            "age":34,"gender":"female","marital_status":"married",
            "education_level":"secondary","occupation":"vendor",
            "monthly_income":12000,"income_source_stability":0.6
        },
        call_logs=calls,
        sms_logs=sms,
        topups=topups,
        data_usage=data_usage,
        psych_responses=psych,
        savings_events=savings_events,
        bill_payments=bill_payments,
        wallet_balance_ts=wallet,
        social_info=social,
        loan_history=loan_history
    )

# ---------------------------------------------------------------------
# 8. RUN DEMO
# ---------------------------------------------------------------------

demo_profile = generate_synthetic_profile()
errors = validate_profile(demo_profile)

print("Validation errors:", errors)
print("\nSample Profile:\n")
print(json.dumps(demo_profile, indent=2))


Validation errors: []

Sample Profile:

{
  "user_id": "demo_user_001",
  "timestamp": "2025-11-14T12:51:17.675015Z",
  "demographics": {
    "age": 34,
    "gender": "female",
    "marital_status": "married",
    "education_level": "secondary",
    "occupation": "vendor",
    "monthly_income": 12000,
    "income_source_stability": 0.6
  },
  "mobile_metadata": {
    "avg_daily_calls": 3.0667,
    "avg_daily_sms": 0.5333,
    "unique_contacts_30d": 31,
    "call_diversity_index": 0.9669,
    "night_activity_ratio": 0.2283,
    "airtime_topup_frequency": 0.1,
    "avg_topup_amount": 54.04,
    "data_usage_variance": 365.4078,
    "mobility_radius_km": 0.0,
    "days_inactive_last_30": 0
  },
  "psychometrics": {
    "conscientiousness_score": 0.6667,
    "honesty_humility_score": 0.75,
    "risk_aversion_score": 0.375,
    "impulse_control_score": 0.4167,
    "numerical_ability_score": 0.375,
    "business_aptitude_score": 0.5,
    "confidence_bias_score": 0.75
  },
  "financial_behavio

  now = datetime.utcnow()
  return datetime.utcnow().isoformat() + "Z"
  as_of = datetime.utcnow()
  now = datetime.utcnow()


In [11]:
# ===============================================================
# SIMPLE RAG USING GROQ LLMs (NO ML MODEL)
# ===============================================================

!pip install -q groq sentence-transformers faiss-cpu

import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from groq import Groq

# ------------------------------
# 1. Load Embedding Model
# ------------------------------

EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMB_MODEL)

# ------------------------------
# 2. Load FAISS Index + Metadata
# ------------------------------

FAISS_PATH = "data/faiss_index.bin"
META_PATH = "data/metadata.jsonl"

index = faiss.read_index(FAISS_PATH)

# load metadata (aligned with FAISS row order)
metas = []
with open(META_PATH, "r", encoding="utf-8") as f:
    for line in f:
        metas.append(json.loads(line))

# ------------------------------
# 3. GROQ LLM Client
# ------------------------------
# Sign up at https://console.groq.com to get your free API key

GROQ_API_KEY = "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"   # <-- put your key here
client = Groq(api_key=GROQ_API_KEY)

LLM_MODEL = "llama-3.1-8b-instant"     # Updated to a currently supported model


# ------------------------------
# 4. Function: Retrieve Top k Chunks
# ------------------------------

def retrieve_chunks(query, k=5):
    q_emb = embedder.encode(query, convert_to_numpy=True)
    q_emb = q_emb.reshape(1, -1).astype(np.float32)

    # Normalize for cosine similarity
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)

    results = []
    for dist, idx in zip(D[0], I[0]):
        if idx < len(metas):
            m = metas[idx]
            results.append({
                "score": float(dist),
                "paper_id": m["paper_id"],
                "paper_title": m["paper_title"],
                "chunk_index": m["chunk_index"],
                "text_snippet": m["text"]   # limit length for prompt cleanliness
            })
    return results


# ------------------------------
# 5. Build RAG Prompt
# ------------------------------

def build_prompt(query, retrieved):
    context = ""
    for i, r in enumerate(retrieved, start=1):
        context += f"[CHUNK {i}] From: {r['paper_title']} — {r['text_snippet']}\n\n"

    prompt = f"""
You are an expert assistant that answers questions using ONLY the research context provided.

User Query:
{query}

Relevant Research Chunks:
{context}

Instructions:
 - Use only the information in the chunks.
 - If the answer is not found in the chunks, say "The research does not contain this information."
 - Cite chunk numbers like [CHUNK 1], [CHUNK 2].
 - Give a clear, concise answer.

Answer:
"""
    return prompt


# ------------------------------
# 6. RAG Ask Function
# ------------------------------

def rag_answer(query, k=5):
    retrieved = retrieve_chunks(query, k=k)
    prompt = build_prompt(query, retrieved)

    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=600,
    )

    answer = response.choices[0].message.content
    return {
        "query": query,
        "answer": answer,
        "retrieved_chunks": retrieved
    }


# ------------------------------
# 7. TEST IT
# ------------------------------

test_query = "What psychometric traits are strongly linked to loan repayment behavior?"
result = rag_answer(test_query, k=5)

print("USER QUERY:", result["query"])
print("\n--- ANSWER ---\n")
print(result["answer"])

print("\n--- CHUNKS USED ---\n")
for c in result["retrieved_chunks"]:
    print(f"- {c['paper_title']} (chunk {c['chunk_index']})")

USER QUERY: What psychometric traits are strongly linked to loan repayment behavior?

--- ANSWER ---

Based on the provided research chunks, the psychometric traits strongly linked to loan repayment behavior are:

- Business aptitude (as mentioned in [CHUNK 1])
- Honesty (as mentioned in [CHUNK 1])
- Periodicity of usage (as mentioned in [CHUNK 3])
- Slope of usage (as mentioned in [CHUNK 3])
- Correlations in usage (as mentioned in [CHUNK 3])
- Variance (as mentioned in [CHUNK 3])
- Personal trust between group members (as mentioned in [CHUNK 5])
- Peer homogeneity (as mentioned in [CHUNK 5])

These traits are linked to loan repayment behavior through various studies and experiments, including psychometric assessments, mobile phone usage analysis, and social capital research.

--- CHUNKS USED ---

- Evening the Credit Score - World Bank (chunk 2)
- Behavior Revealed in Mobile Phone Usage Predicts Loan Repayment (chunk 13)
- Behavior Revealed in Mobile Phone Usage Predicts Loan Repayme

In [14]:
# ---------------------------
# SIMPLE RAG Q&A + MEMORY (Groq) - Single Colab Cell
# Paste & run in Colab
# ---------------------------

!pip install -q groq sentence-transformers faiss-cpu

import json, os, math, pathlib
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from groq import Groq
from datetime import datetime, UTC # Import UTC

# ---------- CONFIG ----------
GROQ_API_KEY = "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"   # <-- set your Groq key
LLM_MODEL = "llama-3.1-8b-instant"     # change if you prefer other Groq model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
FAISS_PATH = "data/faiss_index.bin"      # path to your FAISS index from ingestion
META_PATH = "data/metadata.jsonl"        # metadata lines aligned with index
MEMORY_PATH = "data/memory.json"         # simple local memory store

# ---------- INIT CLIENTS ----------
client = Groq(api_key=GROQ_API_KEY)
embedder = SentenceTransformer(EMB_MODEL)

# Load FAISS index + metadata
if not os.path.exists(FAISS_PATH) or not os.path.exists(META_PATH):
    raise FileNotFoundError("FAISS index or metadata.jsonl not found. Run ingestion first or change FAISS_PATH/META_PATH.")

index = faiss.read_index(FAISS_PATH)
metas = [json.loads(line) for line in open(META_PATH, "r", encoding="utf-8")]

# ---------- MEMORY HELPERS ----------
def load_memory() -> Dict[str, Any]:
    if not os.path.exists(MEMORY_PATH):
        return {}
    return json.load(open(MEMORY_PATH, "r", encoding="utf-8"))

def save_memory(mem: Dict[str, Any]):
    os.makedirs(os.path.dirname(MEMORY_PATH) or ".", exist_ok=True)
    json.dump(mem, open(MEMORY_PATH, "w", encoding="utf-8"), indent=2)

def add_to_memory(user_id: str, profile: Dict[str, Any]=None, utterance: Dict[str,Any]=None):
    mem = load_memory()
    if user_id not in mem:
        mem[user_id] = {"profile": None, "conversations": []}
    if profile:
        mem[user_id]["profile"] = profile
    if utterance:
        mem[user_id]["conversations"].append(utterance)
    save_memory(mem)

# ---------- RETRIEVER ----------
def retrieve_chunks(query: str, k: int=6) -> List[Dict[str,Any]]:
    q_emb = embedder.encode(query, convert_to_numpy=True).astype(np.float32)
    q_emb = q_emb.reshape(1, -1)
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, k)
    results = []
    for dist, idx in zip(D[0], I[0]):
        if idx < len(metas):
            m = metas[idx]
            results.append({
                "score": float(dist),
                "chunk_text": m.get("text","")[:1000],
                # keep metadata but do NOT surface titles to user later
                "paper_id": m.get("paper_id"),
                "paper_title": m.get("paper_title"),
                "chunk_index": m.get("chunk_index")
            })
    return results

# ---------- PROMPT BUILDERS (internal, DO NOT INCLUDE CITATIONS IN USER ANSWER) ----------
def profile_summary_for_prompt(profile: Dict[str,Any]) -> str:
    # create a short structured summary that the LLM can use to reason
    dem = profile.get("demographics", {})
    mob = profile.get("mobile_metadata", {})
    psych = profile.get("psychometrics", {})
    fin = profile.get("financial_behavior", {})
    soc = profile.get("social_network", {})
    loan = profile.get("loan_history", {})
    parts = []
    parts.append(f"Age: {dem.get('age','unknown')}, Occupation: {dem.get('occupation','unknown')}, Monthly income: {dem.get('monthly_income','unknown')}")
    parts.append(f"Mobile: avg_calls={mob.get('avg_daily_calls','?')}, unique_contacts_30d={mob.get('unique_contacts_30d','?')}, days_inactive_30d={mob.get('days_inactive_last_30','?')}")
    parts.append(f"Psychometrics (0-1): conscientious={psych.get('conscientiousness_score','?')}, impulse_control={psych.get('impulse_control_score','?')}, honesty={psych.get('honesty_humility_score','?')}")
    parts.append(f"Financial behaviour: savings_freq={fin.get('savings_frequency','?')}, on_time_bill_frac={fin.get('bill_payment_timeliness','?')}, wallet_lows_90d={fin.get('wallet_balance_lows_last_90d','?')}")
    parts.append(f"Social: shg_membership={soc.get('shg_membership',False)}, peer_monitoring={soc.get('peer_monitoring_strength','?')}")
    parts.append(f"Loan history: prev_loans={loan.get('previous_loans',0)}, prev_defaults={loan.get('previous_defaults',0)}, prev_late={loan.get('previous_late_payments',0)}")
    return "\n".join(parts)

def build_internal_prompt(user_query: str, profile: Dict[str,Any], retrieved_chunks: List[Dict[str,Any]]) -> str:
    # This prompt instructs the LLM to use the chunks to form an answer but not to reveal sources.
    profile_summary = profile_summary_for_prompt(profile) if profile else "No profile provided."
    chunks_text = "\n\n".join([f"CHUNK {i+1}: {c['chunk_text']}" for i,c in enumerate(retrieved_chunks)])
    prompt = f"""
You are an expert microfinance advisor. You will be given:
1) A short borrower profile summary (structured).
2) A user question.
3) A set of research text CHUNKs (raw excerpts). Use the CHUNKs strictly to ground your reasoning.

Instructions:
- Produce a single JSON object (no extra commentary) with exactly these top-level keys:
  - "eligibility": one of ["eligible","maybe","not_eligible"]
  - "score": float between 0.0 and 1.0 indicating likelihood of default (higher = higher risk). (If you reason the probability of default is p, set score=p.)
  - "verdict_text": a 2-3 sentence plain-language verdict for the user (DO NOT mention or cite article titles or chunk ids).
  - "strong_points": list of 3 short bullet strings (why the user is likely to be creditworthy)
  - "weak_points": list of 3 short bullet strings (risks / reasons for concern)
  - "required_unconventional_data": list of fields (from the checklist) that are MISSING from the profile and needed to improve the assessment
  - "actionable_recommendations": list of 4 concrete, prioritized actions the borrower can take to improve eligibility (very specific)
  - "confidence": one of ["high","medium","low"] indicating how confident you are based on available data
  - "raw_internal_reasoning": one short paragraph that says which CHUNK features informed the decision (this is for audit; keep it generic, do not list paper titles)
- Use only the CHUNK text and profile to decide. Do not invent facts outside them.
- Do NOT include paper titles, DO NOT output CHUNK ids in the user-facing fields; you may reference CHUNKs only inside "raw_internal_reasoning".
- If the profile lacks important unconventional fields, include them in "required_unconventional_data". The recommended required unconventional fields (use these labels):
  ["mobile_call_logs_30d","airtime_topups_30d","psychometric_responses","savings_history_90d","wallet_balance_timeseries_90d","shg_membership_info","transaction_history_180d","sms_patterns_30d"]
- If you cannot answer because no relevant CHUNK supports the query, set "verdict_text" to "The research does not contain this information.", set "eligibility" to "maybe", score to 0.5 and confidence to "low".
- Keep lists short and precise. Prioritize clarity for non-technical users.

Now, here is the input:
Profile summary:
{profile_summary}

User question:
{user_query}

CHUNKS:
{chunks_text}

Return ONLY the JSON object.
"""
    return prompt

# ---------- MAIN RAG Q&A (structured) ----------
def rag_qna_structured(user_query: str, user_id: str=None, profile: Dict[str,Any]=None, k:int=6) -> Dict[str,Any]:
    """
    user_query: user's natural question (e.g., "Am I eligible for a loan of 50k?")
    user_id: optional id for memory
    profile: optional profile dict (if None, function will check memory for user_id)
    returns: structured dict per prompt instructions
    """
    # load profile from memory if not provided
    if profile is None and user_id:
        mem = load_memory()
        profile = mem.get(user_id, {}).get("profile")

    # retrieve
    # create a retrieval query that mixes user question & profile "weak" markers for better retrieval
    retrieval_query = user_query
    if profile:
        # include top signals to help retrieval
        try:
            p_summary = profile_summary_for_prompt(profile)
            retrieval_query = user_query + " | profile summary: " + " ".join(p_summary.splitlines())
        except Exception:
            pass

    retrieved = retrieve_chunks(retrieval_query, k=k)

    # Build prompt for Groq
    prompt = build_internal_prompt(user_query, profile if profile else {}, retrieved)

    # Call Groq LLM
    resp = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[{"role":"user","content":prompt}],
        temperature=0.0,
        max_tokens=700
    )

    llm_text = resp.choices[0].message.content # Changed from resp.choices[0].message["content"]

    # Parse JSON out of the response robustly
    try:
        parsed = json.loads(llm_text.strip())
    except Exception as e:
        # fallback: try to extract first JSON-looking substring
        import re
        m = re.search(r'(\{.*\})', llm_text, flags=re.S)
        if m:
            parsed = json.loads(m.group(1))
        else:
            # return an error-structured object
            parsed = {
                "eligibility":"maybe",
                "score":0.5,
                "verdict_text":"Could not parse model output. Please try again.",
                "strong_points":[],
                "weak_points":[],
                "required_unconventional_data":["mobile_call_logs_30d","psychometric_responses"],
                "actionable_recommendations":[],
                "confidence":"low",
                "raw_internal_reasoning": llm_text[:800]
            }

    # Append conversation to memory for user
    if user_id:
        add_to_memory(user_id, profile=profile, utterance={
            "timestamp": datetime.now(UTC).isoformat(), # Updated to use datetime.now(UTC)
            "query": user_query,
            "response": parsed
        })

    # Package additional debug info (retrieved chunk summaries) but DO NOT show these to final user unless needed
    parsed["_debug"] = {
        "retrieved_count": len(retrieved),
        "retrieved_sample": [ {"text_snippet": r["chunk_text"][:300]} for r in retrieved ]
    }
    return parsed

# ---------- USAGE EXAMPLE ----------
if __name__ == "__main__":
    # demo: load example profile if exists
    demo_profile_path = "/mnt/data/user_profile_example.json"
    demo_profile = None
    if os.path.exists(demo_profile_path):
        demo_profile = json.load(open(demo_profile_path,"r",encoding="utf-8"))
        print("Loaded demo profile from", demo_profile_path)

    # Example query
    q = "Am I likely to be able to get a small business loan of 50,000 INR? What are my strengths and weaknesses?"
    out = rag_qna_structured(q, user_id="demo_user_001", profile=demo_profile, k=6)
    print("\n--- STRUCTURED OUTPUT ---\n")
    print(json.dumps(out, indent=2))


--- STRUCTURED OUTPUT ---

{
  "eligibility": "maybe",
  "score": 0.5,
  "verdict_text": "The research does not contain this information.",
  "strong_points": [],
  "weak_points": [],
  "required_unconventional_data": [
    "mobile_call_logs_30d",
    "airtime_topups_30d",
    "psychometric_responses",
    "savings_history_90d",
    "wallet_balance_timeseries_90d",
    "shg_membership_info",
    "transaction_history_180d",
    "sms_patterns_30d"
  ],
  "actionable_recommendations": [
    "Provide psychometric responses to assess business aptitude and honesty.",
    "Share mobile call logs and airtime top-ups for the last 30 days.",
    "Offer savings history for the last 90 days.",
    "Share wallet balance timeseries for the last 90 days."
  ],
  "confidence": "low",
  "raw_internal_reasoning": "The decision was based on the lack of relevant information in the provided CHUNKs and profile. The CHUNKs discuss psychometric credit scoring, its benefits, and its use in lending to women en

In [15]:
# ---------------------------
# COLAB CELL: Interactive data collection + run RAG decision
# ---------------------------
# Paste this AFTER your previous cells that define:
# - load_memory, save_memory, add_to_memory
# - rag_qna_structured
# - (FAISS, embedder, Groq client etc.)
#
# This cell will collect unconventional data fields (or NA), build a profile
# in your schema, save to memory, call rag_qna_structured, and save/print output.
# ---------------------------

import json, os
from datetime import datetime
from pathlib import Path

# REQUIRED UNCONVENTIONAL FIELDS (labels used in prompts and by the RAG prompt)
REQUIRED_UNCONVENTIONAL_FIELDS = [
    "mobile_call_logs_30d",        # expects list of call dicts or 'NA'
    "airtime_topups_30d",          # expects list of topup dicts or 'NA'
    "psychometric_responses",      # expects dict of Q->Likert or 'NA'
    "savings_history_90d",         # expects list of savings events or 'NA'
    "wallet_balance_timeseries_90d", # expects list of daily balances or 'NA'
    "shg_membership_info",         # expects dict {member:bool,peer_monitoring:float} or 'NA'
    "transaction_history_180d",    # expects list of transaction dicts or 'NA'
    "sms_patterns_30d"             # expects list of sms dicts or 'NA'
]

OUT_DIR = Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _prompt_list_of_dicts(field_name):
    print(f"\nEnter {field_name} as JSON array of objects, or type NA (without quotes) if unavailable.")
    print("Examples:")
    if field_name == "mobile_call_logs_30d":
        print('[{"timestamp":"2025-11-01T08:12:00Z","duration_sec":120,"contact_id":"c1","direction":"out"}, ...]')
    elif field_name == "airtime_topups_30d":
        print('[{"timestamp":"2025-11-05T12:00:00Z","amount":50.0}, ...]')
    elif field_name == "savings_history_90d":
        print('[{"timestamp":"2025-09-10T00:00:00Z","amount":200.0}, ...]')
    elif field_name == "wallet_balance_timeseries_90d":
        print('[{"date":"2025-08-01","balance":450.0}, ...]')
    elif field_name == "transaction_history_180d":
        print('[{"timestamp":"2025-06-01T10:00:00Z","amount":250.0,"type":"sale"}, ...]')
    elif field_name == "sms_patterns_30d":
        print('[{"timestamp":"2025-11-01T09:00:00Z","contact_id":"c1","text_snippet":"paid today"}, ...]')
    raw = input(f"{field_name} > ").strip()
    if raw.upper() == "NA" or raw == "":
        return "NA"
    try:
        parsed = json.loads(raw)
        if not isinstance(parsed, list):
            print("Expected a JSON array (list). Storing as single-element list.")
            parsed = [parsed]
        return parsed
    except Exception as e:
        print("Could not parse JSON — saving as NA. Error:", e)
        return "NA"

def _prompt_dict(field_name):
    print(f"\nEnter {field_name} as JSON object (e.g., psychometric responses), or NA if unavailable.")
    if field_name == "psychometric_responses":
        print('{"C_q1":4,"C_q2":3,"C_q3":4,"I_q1":3,"I_q2":2}')
    if field_name == "shg_membership_info":
        print('{"shg_membership": true, "peer_monitoring_strength": 0.7, "consistency_in_group_meetings": 0.8}')
    raw = input(f"{field_name} > ").strip()
    if raw.upper() == "NA" or raw == "":
        return "NA"
    try:
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            print("Expected a JSON object — storing as NA.")
            return "NA"
        return parsed
    except Exception as e:
        print("Could not parse JSON — saving as NA. Error:", e)
        return "NA"

def collect_unconventional_inputs_interactive():
    """
    Interactive console-based collector for the unconventional fields.
    Returns a dict mapping field -> value ('NA' or parsed value).
    """
    print("COLLECTING UNCONVENTIONAL DATA — type NA if not available.\n(You can paste JSON arrays/objects directly when prompted.)")
    collected = {}
    for f in REQUIRED_UNCONVENTIONAL_FIELDS:
        if f in ["psychometric_responses", "shg_membership_info"]:
            collected[f] = _prompt_dict(f)
        else:
            collected[f] = _prompt_list_of_dicts(f)
    return collected

# Build the profile structure expected by Step-B (minimum fields + unconventional parts)
def build_profile_from_inputs(user_id: str, demographics: dict, unconventional_inputs: dict):
    """
    demographics : minimal dict (age, occupation, monthly_income etc.) or {}
    unconventional_inputs : from collector
    Returns full profile dict compatible with your Step-B schema but may place 'NA' where missing.
    """
    # Minimal structure: fill what we can; map unconventional inputs into schema fields used by pipeline
    profile = {
        "user_id": user_id,
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "demographics": demographics or {},
        # mobile_metadata will be computed if raw call/sms/topup data available; otherwise mark NA
        "mobile_metadata": {},
        "psychometrics": {},
        "financial_behavior": {},
        "social_network": {},
        "loan_history": {}  # keep empty — you can extend to prompt loan history too
    }

    # Map mobile_call_logs_30d / sms_patterns_30d / airtime_topups_30d -> raw buckets (keep raw for reproducibility)
    profile["_raw_unconventional"] = unconventional_inputs  # store raw inputs for audits

    # If available, perform minimal aggregation so RAG prompt has summary fields
    # avg_daily_calls, unique_contacts_30d, days_inactive_last_30, airtime_topup_frequency, avg_topup_amount, data_usage_variance
    try:
        calls = unconventional_inputs.get("mobile_call_logs_30d")
        sms = unconventional_inputs.get("sms_patterns_30d")
        topups = unconventional_inputs.get("airtime_topups_30d")
        wallet_ts = unconventional_inputs.get("wallet_balance_timeseries_90d")
        savings = unconventional_inputs.get("savings_history_90d")
        psych = unconventional_inputs.get("psychometric_responses")
        shg = unconventional_inputs.get("shg_membership_info")

        # helper safe checks
        def is_available(x):
            return x != "NA" and x is not None

        # simple call stats
        if is_available(calls):
            # compute avg daily calls in last 30 days
            from datetime import datetime, timedelta
            now = datetime.utcnow()
            last30 = [c for c in calls if "timestamp" in c and datetime.fromisoformat(c["timestamp"].replace("Z","")) >= (now - timedelta(days=30))]
            profile["mobile_metadata"]["avg_daily_calls"] = round(len(last30)/30.0,4)
            contacts = set([c.get("contact_id") for c in last30 if c.get("contact_id")])
            profile["mobile_metadata"]["unique_contacts_30d"] = len(contacts)
            # days inactive
            days_with_activity = set([datetime.fromisoformat(c["timestamp"].replace("Z","")).date() for c in last30])
            profile["mobile_metadata"]["days_inactive_last_30"] = max(0, 30 - len(days_with_activity))
        else:
            profile["mobile_metadata"]["avg_daily_calls"] = "NA"
            profile["mobile_metadata"]["unique_contacts_30d"] = "NA"
            profile["mobile_metadata"]["days_inactive_last_30"] = "NA"

        # airtime
        if is_available(topups):
            from datetime import datetime
            now = datetime.utcnow()
            topups_30 = [t for t in topups if "timestamp" in t and datetime.fromisoformat(t["timestamp"].replace("Z","")) >= (now - timedelta(days=30))]
            profile["mobile_metadata"]["airtime_topup_frequency"] = round(len(topups_30)/30.0,4)
            profile["mobile_metadata"]["avg_topup_amount"] = round(sum([t.get("amount",0) for t in topups_30])/len(topups_30),2) if len(topups_30)>0 else 0.0
        else:
            profile["mobile_metadata"]["airtime_topup_frequency"] = "NA"
            profile["mobile_metadata"]["avg_topup_amount"] = "NA"

        # wallet & savings -> financial_behavior
        if is_available(savings):
            profile["financial_behavior"]["savings_frequency"] = round(len(savings)/3.0,4)  # per month proxy
            amounts = [s.get("amount",0) for s in savings]
            import statistics as stats
            profile["financial_behavior"]["savings_amount_variance"] = round(stats.pvariance(amounts) if len(amounts)>1 else 0.0,4)
        else:
            profile["financial_behavior"]["savings_frequency"] = "NA"
            profile["financial_behavior"]["savings_amount_variance"] = "NA"

        if is_available(wallet_ts):
            # wallet lows last 90d threshold at 100 currency units
            lows = 0
            from datetime import datetime
            now = datetime.utcnow()
            for w in wallet_ts:
                try:
                    d = datetime.fromisoformat(w.get("date"))
                    if d >= (now - timedelta(days=90)) and w.get("balance",0) < 100:
                        lows += 1
                except:
                    pass
            profile["financial_behavior"]["wallet_balance_lows_last_90d"] = lows
        else:
            profile["financial_behavior"]["wallet_balance_lows_last_90d"] = "NA"

        # psychometrics mapping (normalize if Likert 1-5)
        if is_available(psych) and isinstance(psych, dict):
            # map a few common questions if present; otherwise compute mean
            vals = [v for v in psych.values() if isinstance(v,(int,float))]
            if vals:
                # naive "conscientiousness" proxy = mean normalized
                mean = sum(vals)/len(vals)
                profile["psychometrics"]["conscientiousness_score"] = round((mean-1)/4.0,4)
                # keep full responses for audit
                profile["_raw_unconventional"]["psychometric_responses_full"] = psych
            else:
                profile["psychometrics"]["conscientiousness_score"] = "NA"
        else:
            profile["psychometrics"]["conscientiousness_score"] = "NA"

        # shg info
        if is_available(shg) and isinstance(shg, dict):
            profile["social_network"]["shg_membership"] = bool(shg.get("shg_membership", False))
            profile["social_network"]["peer_monitoring_strength"] = shg.get("peer_monitoring_strength", "NA")
            profile["social_network"]["consistency_in_group_meetings"] = shg.get("consistency_in_group_meetings", "NA")
        else:
            profile["social_network"]["shg_membership"] = "NA"
            profile["social_network"]["peer_monitoring_strength"] = "NA"
            profile["social_network"]["consistency_in_group_meetings"] = "NA"

    except Exception as e:
        print("Warning: error while aggregating unconventional inputs:", e)

    return profile

def interactive_run_and_save(user_id: str = None):
    """
    Interactive flow:
    - collect demographics minimal fields
    - collect unconventional inputs interactively
    - build profile
    - save to memory
    - call rag_qna_structured and show/save output
    """
    if user_id is None:
        user_id = input("Enter a user_id (e.g., user_001) > ").strip()
    if user_id == "":
        user_id = "user_"+datetime.utcnow().strftime("%Y%m%d%H%M%S")

    print("\nEnter basic demographics (press Enter to skip / NA):")
    age_raw = input("Age (int) > ").strip()
    try:
        age = int(age_raw) if age_raw and age_raw.upper()!="NA" else None
    except:
        age = None
    occupation = input("Occupation > ").strip() or None
    income_raw = input("Monthly income (numeric) > ").strip()
    try:
        monthly_income = float(income_raw) if income_raw and income_raw.upper()!="NA" else None
    except:
        monthly_income = None

    demographics = {
        "age": age,
        "occupation": occupation,
        "monthly_income": monthly_income
    }

    print("\nNow provide unconventional data (you can paste full JSON arrays/objects). Type NA to skip any.")
    unconventional_inputs = collect_unconventional_inputs_interactive()

    # Build profile
    profile = build_profile_from_inputs(user_id, demographics, unconventional_inputs)

    # Save to memory (profile)
    try:
        add_to_memory(user_id, profile=profile, utterance=None)
        print(f"\nSaved profile to memory for user_id={user_id}")
    except Exception as e:
        print("Warning: could not save to memory (missing add_to_memory). Exception:", e)

    # Run RAG structured Q&A
    print("\nNow enter the user's question (e.g., 'Am I eligible for a ₹50,000 loan?'):")
    user_question = input("Question > ").strip()
    if user_question == "":
        user_question = "Am I eligible for a microloan of 50,000 INR?"

    # Ensure rag_qna_structured exists
    if "rag_qna_structured" not in globals():
        raise RuntimeError("rag_qna_structured is not defined in this notebook. Run the earlier RAG cell that provides it before executing this cell.")

    print("\nRunning RAG-based decision. Please wait...")
    result = rag_qna_structured(user_question, user_id=user_id, profile=profile, k=6)

    # Save output
    out_path = OUT_DIR / f"last_output_{user_id}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    print(f"\nStructured output saved to: {out_path}\n")
    print("---- START OF OUTPUT ----")
    print(json.dumps(result, indent=2))
    print("---- END OF OUTPUT ----")
    return result

# If running as script/cell, call interactive_run_and_save()
print("Run interactive_run_and_save() to start interactive input collection and RAG decision.")
# Example: uncomment to run immediately:
# interactive_run_and_save("demo_user_001")


Run interactive_run_and_save() to start interactive input collection and RAG decision.


In [16]:
interactive_run_and_save()


Enter a user_id (e.g., user_001) > user_1

Enter basic demographics (press Enter to skip / NA):
Age (int) > 45
Occupation > street vendor
Monthly income (numeric) > 20000

Now provide unconventional data (you can paste full JSON arrays/objects). Type NA to skip any.
COLLECTING UNCONVENTIONAL DATA — type NA if not available.
(You can paste JSON arrays/objects directly when prompted.)

Enter mobile_call_logs_30d as JSON array of objects, or type NA (without quotes) if unavailable.
Examples:
[{"timestamp":"2025-11-01T08:12:00Z","duration_sec":120,"contact_id":"c1","direction":"out"}, ...]
mobile_call_logs_30d > NA

Enter airtime_topups_30d as JSON array of objects, or type NA (without quotes) if unavailable.
Examples:
[{"timestamp":"2025-11-05T12:00:00Z","amount":50.0}, ...]
airtime_topups_30d > NA

Enter psychometric_responses as JSON object (e.g., psychometric responses), or NA if unavailable.
{"C_q1":4,"C_q2":3,"C_q3":4,"I_q1":3,"I_q2":2}
psychometric_responses > NA

Enter savings_hist

UnboundLocalError: cannot access local variable 'datetime' where it is not associated with a value