## Part 1: Dataset Simulation

In [25]:
# Library imports

import pandas as pd
import numpy as np
from datetime import datetime
import random
from google import genai
from google.genai import types
from collections import Counter
import os
import math
import time
import json
from dotenv import load_dotenv
load_dotenv()

# Helper: robustly convert various SDK response types to plain string for JSON parsing
def _extract_text(obj):
    """Flatten an SDK response-like object into a plain string.

    Resolution order (first match wins):
      1. ``None`` -> empty string; ``str`` -> returned unchanged.
      2. ``bytes``/``bytearray`` -> decoded as UTF-8 (``str(obj)`` if that fails).
      3. Objects with a non-empty ``.parts`` -> each part extracted recursively
         and joined with newlines.
      4. Lists/tuples -> each item extracted recursively and joined with newlines.
      5. The first truthy attribute among ``text``, ``content``, ``output``,
         ``value``, ``data`` -> extracted recursively.
      6. ``dict`` -> ``json.dumps``.
      7. Objects with a callable ``.json()`` or ``.dict()`` -> serialized via those.
      8. Anything else -> ``str(obj)``.
    """
    if obj is None:
        return ""
    if isinstance(obj, str):
        return obj
    if isinstance(obj, (bytes, bytearray)):
        try:
            return obj.decode('utf-8')
        except Exception:
            return str(obj)

    # Multi-part containers (e.g. GenAI SDK content objects): flatten every part.
    # Any failure while reading or flattening parts falls through to the next strategy.
    try:
        parts = getattr(obj, 'parts', None)
        if parts:
            return "\n".join([_extract_text(part) for part in parts])
    except Exception:
        pass

    # Plain sequences are flattened item by item.
    if isinstance(obj, (list, tuple)):
        return "\n".join([_extract_text(item) for item in obj])

    # Probe the usual "payload" attributes in priority order.
    for name in ("text", "content", "output", "value", "data"):
        try:
            candidate = getattr(obj, name, None)
        except Exception:
            candidate = None
        if candidate:
            return _extract_text(candidate)

    # Mappings are serialized as JSON when possible.
    if isinstance(obj, dict):
        try:
            return json.dumps(obj)
        except Exception:
            pass

    # Model-like objects (e.g. pydantic) exposing .json() ...
    try:
        to_json = getattr(obj, 'json', None)
        if callable(to_json):
            return to_json()
    except Exception:
        pass

    # ... or .dict()
    try:
        to_dict = getattr(obj, 'dict', None)
        if callable(to_dict):
            return json.dumps(to_dict())
    except Exception:
        pass

    # Last resort: generic string representation.
    return str(obj)


In [26]:
# Methods used in the simulation

# Seed both random number generators (stdlib and NumPy) so runs are repeatable
random.seed(42)
np.random.seed(42)

# Transaction category labels. Order matters: code that slices this list
# (e.g. `[:-3]`) relies on "rent", "salary" and "transfer" being the last three.
TRANSACTION_CATEGORIES = [
    "groceries",
    "utilities",
    "entertainment",
    "dining",
    "transport",
    "healthcare",
    "shopping",
    "education",
    "rent",
    "salary",
    "transfer",
]

def _parse_feedback_response(resp_text, batch_start):
    """Parse one Gemini reply into a ``{customer_id: feedback}`` dict.

    resp_text: raw text returned by the model for one batch.
    batch_start: index of the first summary in the batch (used in error messages only).
    Raises ValueError if the text does not contain a JSON list, or if any list
    item is not an object carrying both an id and a feedback string.
    """
    text = resp_text.strip()
    # Remove a surrounding markdown code fence (``` or ```json) if present.
    if text.startswith('```') and text.endswith('```'):
        text = text[3:-3]
        if text.startswith('json'):
            text = text[4:]
        text = text.strip()

    parsed = None
    try:
        parsed = json.loads(text)
    except ValueError:
        # Fall back to the outermost [...] span in case the model added commentary.
        start = text.find('[')
        end = text.rfind(']')
        if start != -1 and end > start:
            try:
                parsed = json.loads(text[start:end + 1])
            except ValueError:
                parsed = None

    if not isinstance(parsed, list):
        raise ValueError(f"Gemini response for batch starting at index {batch_start} did not parse to a JSON list.\nResponse:\n{resp_text}")

    feedback = {}
    for obj in parsed:
        if not isinstance(obj, dict):
            raise ValueError(f"Invalid object in Gemini JSON output: {obj}")
        # Accept a few key spellings the model might use instead of the requested ones.
        cid = obj.get('customer_id') or obj.get('ID') or obj.get('id')
        fb = obj.get('feedback') or obj.get('note') or obj.get('text')
        if not cid or not fb:
            raise ValueError(f"Invalid object in Gemini JSON output: {obj}")
        feedback[str(cid)] = str(fb)
    return feedback


def generate_feedback_gemini_batch(summaries, n_requests=10, model="gemini-2.5-flash", max_retries=3, retry_delay=5, batch_delay_seconds=4):
    """Send summaries to Gemini in n_requests batches and return feedback map.

    summaries: list of dicts; each must include 'customer_id', 'total_credits',
        'total_debits', 'ending_balance' and 'top_category'.
    n_requests: desired number of Gemini calls (e.g., 10 to respect quotas); must be >= 1.
    model: Gemini model name passed to the SDK.
    max_retries: maximum attempts per batch for the API call itself.
    retry_delay: base delay in seconds; doubled after every failed attempt.
    batch_delay_seconds: seconds to sleep between consecutive batch calls to avoid bursts.
    Returns: {customer_id: feedback_text}

    Raises:
        ValueError: n_requests < 1, GOOGLE_API_KEY is not set, or a reply cannot be
            parsed into the expected JSON list (parse failures are NOT retried).
        RuntimeError: the GenAI SDK is not available, or a batch call fails or
            returns empty text on every attempt.
    """
    if n_requests < 1:
        raise ValueError(f"n_requests must be a positive integer, got {n_requests!r}.")

    # Strict preconditions for Gemini-only mode
    if 'genai' not in globals() or genai is None or 'types' not in globals() or types is None:
        raise RuntimeError("Google GenAI SDK not available. Install the official SDK before enabling Gemini.")

    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    if not GOOGLE_API_KEY:
        raise ValueError("GOOGLE_API_KEY not found in environment variables; required for Gemini mode.")

    client = genai.Client(api_key=GOOGLE_API_KEY)

    feedback_map = {}
    total = len(summaries)
    if total == 0:
        return feedback_map

    batch_size = max(1, math.ceil(total / n_requests))

    for i in range(0, total, batch_size):
        batch = summaries[i:i+batch_size]
        # Build strict JSON-response prompt: one summary per line
        prompt_body = "\n".join(
            f"ID: {s['customer_id']} | credits: {s['total_credits']:.2f} | debits: {s['total_debits']:.2f} | end_balance: {s['ending_balance']:.2f} | top: {s['top_category']}"
            for s in batch
        )
        prompt = (
            "You are given multiple short customer monthly summaries (one per line in the format 'ID: <id> | ...').\n"
            "For each summary, produce a JSON array of objects with fields {\"customer_id\": <id>, \"feedback\": <short advice sentence>} and NOTHING ELSE.\n"
            "Keep each feedback within 10 words and make them varied and realistic. Example: \"I plan to increase my savings next month.\"\n"
            "Respond with valid JSON only (no extra commentary).\n\n"
            f"Summaries:\n{prompt_body}"
        )

        resp_text = None
        last_exception = None
        for attempt in range(1, max_retries + 1):
            try:
                response = client.models.generate_content(
                    model=model,
                    contents=[prompt],
                    config=types.GenerateContentConfig(
                        system_instruction="You are generating short customer feedback notes for a synthetic banking dataset.",
                        thinking_config=types.ThinkingConfig(thinking_budget=-1)
                    )
                )
                # Extract returned text conservatively
                resp_text = _extract_text(response)
                if resp_text:
                    break
                raise RuntimeError('Empty response from Gemini')
            except Exception as e:
                # Retry boundary: any SDK/network error (or empty reply) counts as a failed attempt.
                last_exception = e
                resp_text = None
                if attempt >= max_retries:
                    # No point in backing off when there is no further attempt.
                    print(f"Gemini batch request failed (attempt {attempt}/{max_retries}): {e}.")
                    break
                wait = retry_delay * (2 ** (attempt - 1))  # exponential back-off
                print(f"Gemini batch request failed (attempt {attempt}/{max_retries}): {e}. Retrying in {wait}s...")
                time.sleep(wait)

        if not resp_text:
            # unrecoverable for this batch
            raise RuntimeError(f"Gemini failed for batch starting at index {i} after {max_retries} attempts.") from last_exception

        # Parse JSON exactly; be strict in Gemini-only mode
        feedback_map.update(_parse_feedback_response(resp_text, i))

        # Sleep a short while between batches to avoid quota bursts (not after the last one)
        if i + batch_size < total:
            time.sleep(batch_delay_seconds)

    return feedback_map


# Build one month-end row per customer; feedback text *requires* Gemini and is mapped strictly by customer_id
def generate_customer_dataset(num_customers=500, seed=42, gemini_requests=10, batch_delay_seconds=4):
    """Simulate one month of banking activity per customer and return it as a DataFrame.

    For every customer a single month-end row is produced (demographics, income,
    credits/debits, balances, investment gain, credit scores, dominant transaction
    category). Feedback text is obtained from Gemini via
    ``generate_feedback_gemini_batch``; the function raises if any customer is
    left without feedback. Afterwards data-quality problems are injected on purpose:
      * 2% of ``monthly_income`` set to NaN
      * 3% of ``monthly_credit``/``monthly_debit`` set to NaN
      * 1% of ``final_credit_score`` shifted by -500 or +400
      * 5% of ``customer_feedback`` set to None
      * up to 12 ``monthly_debit`` values multiplied by a factor in [5, 25)

    num_customers: number of customers (rows) to simulate.
    seed: seed for both ``numpy.random`` and ``random``; also drives the row sampling.
    gemini_requests: number of Gemini calls to spread the customers over.
    batch_delay_seconds: pause between consecutive Gemini calls.
    Returns: pandas DataFrame with one row per customer.
    Raises: RuntimeError if Gemini returned no feedback for some customer, plus
        anything raised by ``generate_feedback_gemini_batch``.
    """
    # NOTE: the order of RNG calls below determines the seeded output; do not reorder.
    np.random.seed(seed)
    random.seed(seed)

    rows = []
    month_end_date = datetime(2025, 1, 31).date()

    customer_summaries = []

    for cid in range(1, num_customers + 1):
        customer_id = f"CUST{str(cid).zfill(5)}"
        age = int(np.clip(np.random.normal(40, 12), 18, 80))
        monthly_income = float(np.round(np.clip(np.random.normal(4000, 2000), 500, None), 2))
        occupation = random.choice(["Engineer", "Teacher", "Healthcare", "Retail", "Finance", "Student", "Self-employed", "Unemployed", "Manager", "Technician"])
        starting_balance = float(np.round(monthly_income * np.random.uniform(0.2, 4.0) + np.random.uniform(-500, 500), 2))
        credit_score = int(np.clip(np.random.normal(680, 50), 300, 850))
        # Monthly investment return in percent; may be negative
        invest_pct = float(np.round(np.random.normal(0.8, 1.2), 2))

        # Simulate monthly transactions
        balance = starting_balance
        monthly_credits_total = 0.0
        monthly_debits_total = 0.0
        categories_counter = Counter()

        # Salary credit
        salary = monthly_income
        monthly_credits_total += salary
        balance += salary
        categories_counter.update(["salary"])

        # Random transfers
        n_transfers = np.random.poisson(2)
        for _ in range(n_transfers):
            transfer = float(np.random.exponential(scale=monthly_income / 10.0))
            monthly_credits_total += transfer
            balance += transfer
            categories_counter.update(["transfer"])

        # Investment gains (negative returns are floored at zero, so this never reduces the balance)
        monthly_invest_gain = (balance * max(0.0, invest_pct) / 100.0)
        monthly_credits_total += monthly_invest_gain
        balance += monthly_invest_gain

        # Debits: everyday spending, drawn from all categories except the last three
        n_txns = np.random.poisson(30)  # approx daily
        for _ in range(n_txns):
            cat = random.choice(TRANSACTION_CATEGORIES[:-3])
            amt = float(np.random.exponential(scale=monthly_income / 120.0)) + np.random.uniform(5, 50)
            monthly_debits_total += amt
            balance -= amt
            categories_counter.update([cat])

        # Big spends
        n_big = np.random.poisson(3)
        for _ in range(n_big):
            big_cat = random.choice(["shopping", "healthcare", "education", "rent"])
            big_amt = float(np.random.exponential(scale=monthly_income / 4.0))
            monthly_debits_total += big_amt
            balance -= big_amt
            categories_counter.update([big_cat])

        ending_balance = float(np.round(balance, 2))

        # Credit score change
        credit_score_change = np.random.choice([-10, -5, 5, 10])
        final_credit_score = int(np.clip(credit_score + credit_score_change, 300, 850))

        # Most frequent category by transaction count (not by amount)
        top_category = categories_counter.most_common(1)[0][0] if categories_counter else None

        row = {
            "customer_id": customer_id,
            "date": month_end_date.isoformat(),
            "age": age,
            "occupation": occupation,
            "monthly_income": monthly_income,
            "monthly_credit": float(round(monthly_credits_total, 2)),
            "monthly_debit": float(round(monthly_debits_total, 2)),
            "starting_balance": starting_balance,
            "ending_balance": ending_balance,
            "monthly_invest_gain": float(round(monthly_invest_gain, 2)),
            "monthly_invest_return_pct": invest_pct,
            "initial_credit_score": credit_score,
            "final_credit_score": final_credit_score,
            "transaction_category_major": top_category,
            "customer_feedback": None
        }
        rows.append(row)

        summary = {
            "customer_id": customer_id,
            "total_credits": monthly_credits_total,
            "total_debits": monthly_debits_total,
            "ending_balance": ending_balance,
            "top_category": top_category
        }
        customer_summaries.append(summary)

    # Generate feedbacks using Gemini batches (Gemini-only mode)
    feedback_map = generate_feedback_gemini_batch(customer_summaries, n_requests=gemini_requests, batch_delay_seconds=batch_delay_seconds)

    # Ensure every customer got a Gemini feedback
    missing = [s['customer_id'] for s in customer_summaries if s['customer_id'] not in feedback_map]
    if missing:
        raise RuntimeError(f"Gemini did not return feedback for the following customers: {missing[:10]}{'...' if len(missing)>10 else ''}")

    # Attach feedbacks
    for r in rows:
        cid = r['customer_id']
        fb = feedback_map[cid]
        r['customer_feedback'] = fb

    df = pd.DataFrame(rows)

    # Inject missingness and noise
    inc_idx = df.sample(frac=0.02, random_state=seed).index
    df.loc[inc_idx, "monthly_income"] = np.nan

    txn_idx = df.sample(frac=0.03, random_state=seed+1).index
    df.loc[txn_idx, ["monthly_credit", "monthly_debit"]] = np.nan

    # Push some credit scores outside the valid 300-850 range
    cs_idx = df.sample(frac=0.01, random_state=seed+2).index
    df.loc[cs_idx, "final_credit_score"] = df.loc[cs_idx, "final_credit_score"].apply(lambda x: int(x + np.random.choice([-500, 400])))

    fb_idx = df[df["customer_feedback"].notnull()].sample(frac=0.05, random_state=seed+3).index
    df.loc[fb_idx, "customer_feedback"] = None

    # Debit outliers: cap the sample size so datasets with fewer than 12 rows do not fail
    ol_idx = df.sample(n=min(12, len(df)), random_state=seed+4).index
    df.loc[ol_idx, "monthly_debit"] = df.loc[ol_idx, "monthly_debit"] * np.random.uniform(5, 25, size=len(ol_idx))

    col_order = [
        "customer_id", "date", "age", "occupation", "monthly_income",
        "monthly_credit", "monthly_debit", "starting_balance", "ending_balance",
        "monthly_invest_gain", "monthly_invest_return_pct", "initial_credit_score",
        "final_credit_score", "transaction_category_major", "customer_feedback"
    ]
    df = df[col_order]

    return df


def save_simulation(df: pd.DataFrame, path: str):
    """Write *df* to *path* as CSV (without the index), creating parent directories.

    df: DataFrame to persist.
    path: destination file path; may be a bare filename, in which case the file
        is written to the current working directory.
    """
    parent = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.makedirs("") raises FileNotFoundError
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(path, index=False)


In [27]:
# Configuration and run

NUM_CUSTOMERS = 500
SEED = 42

# Generates the dataset; this calls Gemini and therefore needs GOOGLE_API_KEY and network access
print("Generating synthetic banking customer dataset...")
df = generate_customer_dataset(num_customers=NUM_CUSTOMERS, seed=SEED)

print(f"\n✓ Dataset generated successfully! Rows: {len(df)} Customers: {df['customer_id'].nunique()}")
print(f"Date: {df['date'].iloc[0]}")

# Show missing data summary
print("\nMissing data summary:")
print(df.isnull().sum())

# Show sample of feedbacks
print("\nSample feedbacks (first 10 customers):")
sample_feedbacks = df[['customer_id','customer_feedback']].head(10)
print(sample_feedbacks.to_string(index=False))

# Derive the file name from SEED so it cannot drift from the seed actually used
output_filename = f'data/sim_customers_seed_{SEED}.csv'
save_simulation(df, output_filename)
print(f"\n✓ Dataset saved to: {output_filename}")

print("\nBasic numeric stats:")
print(df[['monthly_credit','monthly_debit','ending_balance','monthly_invest_return_pct']].describe())


Generating synthetic banking customer dataset...

✓ Dataset generated successfully! Rows: 500 Customers: 500
Date: 2025-01-31

Missing data summary:
customer_id                    0
date                           0
age                            0
occupation                     0
monthly_income                10
monthly_credit                15
monthly_debit                 15
starting_balance               0
ending_balance                 0
monthly_invest_gain            0
monthly_invest_return_pct      0
initial_credit_score           0
final_credit_score             0
transaction_category_major     0
customer_feedback             25
dtype: int64

Sample feedbacks (first 10 customers):
customer_id                                            customer_feedback
  CUST00001            Dining was enjoyable, but finances remain strong.
  CUST00002            Healthcare costs were high, good reserves helped.
  CUST00003                  Dining out was excessive; need to cut back.
  CUST00004