# Part 1: Dataset Simulation

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from google import genai
from google.genai import types
import json
import os
from dotenv import load_dotenv
import re

In [None]:
# CONFIGURATION
# Load environment variables before the key is read (presumably from a local
# .env file via python-dotenv -- confirm the file location for your setup).
load_dotenv()
# Read the Gemini API key from the "API_KEY" environment variable.
# os.getenv returns None when the variable is not set.
API_KEY = os.getenv("API_KEY")
# Module-level Gemini client; generate_customer_feedback below calls
# client.models.generate_content on it.
client = genai.Client(api_key=API_KEY)

# DEFINE THE GENAI FUNCTION for TEXT GENERATION
def generate_customer_feedback(persona, sentiment, model="gemini-2.0-flash", num_sentences=200):
    """
    Ask Gemini for customer feedback sentences and return them as a list of strings.

    Args:
        persona: Customer profile label inserted into the prompt (the prompt
            gives guidance for 'Young Adult', 'Professional' and 'Retiree').
        sentiment: Sentiment label inserted into the prompt (the prompt gives
            guidance for 'Positive' and 'Negative').
        model: Name of the Gemini model to query.
        num_sentences: Number of sentences to request and to return.

    Returns:
        A list of exactly `num_sentences` strings (empty when `num_sentences`
        is zero or negative). If the model returns fewer usable sentences they
        are repeated in order to pad the list; surplus sentences are dropped.
        If the request or the parsing fails for any reason, the error is
        printed and every entry is the placeholder "Error generating text.".
    """
    prompt = f"""
    You are simulating a customer database for a bank.
    Generate exactly {num_sentences} unique, realistic customer feedback sentences (10-20 words each).

    Customer Profile: {persona}
    Sentiment:{sentiment}

    - If 'Young Adult': Mention apps, crypto, student loans, splitting bills, fast cash. Use casual language.
    - If 'Professional': Mention interest rates, mortgages, wire transfers, business accounts. Use formal language.
    - If 'Retiree': Mention branches, pensions, passwords, phone support, safety. Use polite but confused language.

    - If Sentiment is 'Positive': Praise speed, features, or service.
    - If Sentiment is 'Negative': Complain about fees, crashes, delays, or rejections.

    Output format: A raw JSON list of strings. Example: ["text1", "text2"]
    Do not add Markdown formatting or extra text.
    """

    # Deliberately broad best-effort boundary: any network, SDK or parsing
    # failure degrades to placeholder text instead of aborting the simulation.
    try:
        response = client.models.generate_content(
            model=model,
            contents=[prompt]
        )

        # Prefer the response's `text` attribute; fall back to its string form.
        raw_text = getattr(response, "text", None)
        if not isinstance(raw_text, str):
            raw_text = str(response)

        payload = _parse_feedback_payload(raw_text)

        # Keep every non-empty string, wherever it sits in the parsed structure.
        texts = [s.strip() for s in _iter_strings(payload) if s.strip()]
        if not texts:
            raise ValueError("Parsed response contained no usable strings")

        # Repeat the sentences in order until the target is covered, then trim.
        target = max(num_sentences, 0)
        repeats = -(-target // len(texts))  # ceiling division
        return (texts * repeats)[:target]

    except Exception as e:
        print(f" Error fetching/parsing from Gemini: {e}")
        # Return a deterministic fallback list of the requested length
        return ["Error generating text."] * num_sentences


def _parse_feedback_payload(raw_text):
    """Return the first JSON value decodable from the model's raw text.

    Raises ValueError when the text contains no decodable JSON.
    """
    # Remove common code block markers
    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()

    # Happy path: the model returned nothing but JSON.
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        pass

    # Otherwise decode starting at each '[' in turn. raw_decode understands
    # JSON string quoting, so a ']' inside a sentence cannot cut the array
    # short (a regex such as r"\[.*?\]" would stop at that character).
    decoder = json.JSONDecoder()
    start = cleaned.find("[")
    while start != -1:
        try:
            payload, _ = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            start = cleaned.find("[", start + 1)
        else:
            return payload
    raise ValueError("No JSON array found in model response")


def _iter_strings(obj):
    """Yield every string nested anywhere inside parsed JSON lists/dicts."""
    if isinstance(obj, str):
        yield obj
    elif isinstance(obj, dict):
        for value in obj.values():
            yield from _iter_strings(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from _iter_strings(item)


# Empty positive/negative buckets keyed by the persona labels used in the
# simulation (nothing in this cell fills them).
data_cache = {
    persona_key: {"positive": [], "negative": []}
    for persona_key in ("young_adult", "professional", "retiree")
}

# One Gemini request per (persona, sentiment) pair; for each persona the
# positive pool is fetched first, then the negative pool.

# --- Young Adult ---
ya_positive, ya_negative = (
    generate_customer_feedback("Young Adult", mood)
    for mood in ("Positive", "Negative")
)

# --- Professional ---
pro_positive, pro_negative = (
    generate_customer_feedback("Professional", mood)
    for mood in ("Positive", "Negative")
)

# --- Retiree ---
ret_positive, ret_negative = (
    generate_customer_feedback("Retiree", mood)
    for mood in ("Positive", "Negative")
)

In [None]:
def get_feedback_text(persona, is_default):
    """
    Pick one feedback sentence for a customer.

    Customers with is_default == 1 draw from their persona's negative pool and
    all others from its positive pool; a 10% random draw flips that choice to
    add noise. An unknown persona or an empty pool yields the generic sentence
    "Service was okay.".
    """
    # Sentiment follows the default flag, inverted when the noise draw hits.
    wants_negative = (is_default == 1) != (random.random() < 0.10)

    # Only the matching persona's pools are looked up.
    if persona == "young_adult":
        pool = ya_negative if wants_negative else ya_positive
    elif persona == "professional":
        pool = pro_negative if wants_negative else pro_positive
    elif persona == "retiree":
        pool = ret_negative if wants_negative else ret_positive
    else:
        pool = []

    return random.choice(pool) if pool else "Service was okay."

In [None]:
# SIMULATE DATASET
RANDOM_SEED = 42

fake = Faker()
# Faker.seed covers only Faker's own generator. generate_data also draws from
# the `random` module and from `np.random`, so seed those as well; otherwise
# the simulated numbers differ on every run.
Faker.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Per-persona simulation parameters. Two-element tuples are (mean, std) for the
# normal draws, except "accumulation", which is (low, high) for a uniform draw.
_PERSONA_PARAMS = {
    "young_adult": {
        "income": (15000, 5000),
        "spend_score": (70, 10),  # High spend relative to income
        "credit_score": (600, 80),
        "savings_rate": (0.05, 0.02),
        "accumulation": (0.1, 1.0),
    },
    "professional": {
        "income": (85000, 25000),
        "spend_score": (50, 15),
        "credit_score": (720, 50),
        "savings_rate": (0.20, 0.05),
        "accumulation": (1.0, 3.0),
    },
    "retiree": {
        "income": (45000, 10000),
        "spend_score": (30, 10),  # Low spend
        "credit_score": (680, 60),
        "savings_rate": (0.08, 0.04),
        "accumulation": (2.0, 5.0),
    },
}

_YOUNG_ADULT_JOBS = ["Student", "Intern", "Part-time Server", "Barista", "None"]

_MIN_LOAN = 2000
_MAX_LOAN_TO_INCOME = 0.6
# Lowest income for which the loan range [_MIN_LOAN, income * 0.6] is valid.
_MIN_INCOME = _MIN_LOAN / _MAX_LOAN_TO_INCOME

_RECORD_COLUMNS = [
    "Customer_ID",
    "Age",
    "Gender",
    "Persona",
    "Job",
    "Annual_Income",
    "Credit_Score",
    "Account_Balance",
    "Spending_Score",
    "Loan_Amount",
    "Loan_Term_Months",
    "Loan_Default",
    "Customer_Feedback",
]


def generate_data(num_records=1000):
    """
    Simulate a table of bank customers with loan-default labels and feedback text.

    Each record gets a persona from its age (under 25: young_adult, 25-59:
    professional, 60 and over: retiree), persona-dependent income, spending
    score, credit score and account balance, a loan request, a default flag
    drawn from a rule-based risk probability, and a feedback sentence chosen
    by get_feedback_text.

    Args:
        num_records: Number of customer rows to generate.

    Returns:
        A pandas DataFrame with one row per customer and the columns listed in
        _RECORD_COLUMNS. The columns are present even when num_records is 0.
    """
    records = []

    for _ in range(num_records):
        # --- Demographics ---
        profile = fake.profile()
        customer_id = f"CUST-{fake.unique.random_int(min=100000, max=999999)}"
        age = random.randint(18, 80)

        # Simulate 3 types of customer persona: Young Adult, Professional, Retiree
        if age < 25:
            persona = "young_adult"
            job = np.random.choice(_YOUNG_ADULT_JOBS)
        elif age < 60:
            persona = "professional"
            job = profile['job']
        else:
            persona = "retiree"
            job = "retired"
        params = _PERSONA_PARAMS[persona]

        # The raw normal draw can be tiny or negative. Floor it so the loan
        # range below is never inverted and income-based ratios stay positive.
        income = max(_MIN_INCOME, np.random.normal(*params["income"]))
        spend_score = np.random.normal(*params["spend_score"])

        # --- Transactional Numerics ---
        # Credit score, clamped to real-world limits (300-850)
        credit_score = int(np.random.normal(*params["credit_score"]))
        credit_score = max(300, min(850, credit_score))

        # Account balance
        savings_rate = np.random.normal(*params["savings_rate"])
        accumulation_factor = random.uniform(*params["accumulation"])
        savings_rate = max(0.01, min(0.50, savings_rate))
        account_balance = (income * savings_rate) * accumulation_factor * random.uniform(0.1, 1.5)

        # Loan Request Details
        max_possible_loan = max(_MIN_LOAN, income * _MAX_LOAN_TO_INCOME)
        loan_amount = round(random.uniform(_MIN_LOAN, max_possible_loan), 2)
        loan_term = random.choice([12, 24, 36, 60])

        risk_probability = 0.1  # Base 10% risk

        # Rule 1: Low Credit Score increases risk significantly
        if credit_score < 580:
            risk_probability += 0.40
        elif credit_score < 650:
            risk_probability += 0.20

        # Rule 2: High Loan-to-Income ratio increases risk
        loan_to_income = loan_amount / income
        if loan_to_income > 0.4:
            risk_probability += 0.25

        # Rule 3: Additional risk for Young Adults
        if persona == "young_adult":
            risk_probability += 0.05

        # Cap probability at 0.95
        risk_probability = min(0.95, risk_probability)

        is_default = 1 if random.random() < risk_probability else 0

        # --- Feedback Text Column ---
        feedback_text = get_feedback_text(persona, is_default)

        records.append({
            "Customer_ID": customer_id,
            "Age": age,
            "Gender": profile['sex'],
            "Persona": persona,
            "Job": job,
            "Annual_Income": round(income, 2),
            "Credit_Score": credit_score,
            "Account_Balance": round(account_balance, 2),
            "Spending_Score": round(spend_score, 2),
            "Loan_Amount": loan_amount,
            "Loan_Term_Months": loan_term,
            "Loan_Default": is_default,
            "Customer_Feedback": feedback_text,
        })

    # Passing the columns explicitly keeps the schema stable for zero records.
    return pd.DataFrame(records, columns=_RECORD_COLUMNS)

# Build the 1,000-row simulated customer table, then print its first rows.
df_customers = generate_data(num_records=1000)
print(df_customers.head())

In [None]:
# Save to CSV
# Name the output path once so the file written and the message cannot drift.
OUTPUT_CSV_PATH = 'financial_data_simulated.csv'
df_customers.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Dataset saved to '{OUTPUT_CSV_PATH}'")

# Part 2: Feature Engineering and Data Cleaning