# Part 1: Dataset Simulation

In [None]:
pip install faker pandas numpy google-generativeai

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import google.generativeai as genai
import json
import os

In [None]:
# CONFIGURATION
# The Gemini API key comes from the GOOGLE_API_KEY environment variable;
# API_KEY is None when that variable is not set.
API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=API_KEY)

# Model handle used by generate_customer_feedback to request text.
model = genai.GenerativeModel('gemini-2.0-flash')


# DEFINE THE GENAI FUNCTION for TEXT GENERATION
def generate_customer_feedback(persona, sentiment, num_sentences=200):
    """
    Ask Gemini for a batch of synthetic customer feedback sentences.

    Args:
        persona: Customer profile label inserted into the prompt. The prompt
            carries style rules for 'Young Adult', 'Professional' and
            'Retiree'.
        sentiment: Sentiment label inserted into the prompt. The prompt
            carries rules for 'Positive' and 'Negative'.
        num_sentences: Number of sentences requested from the model
            (default 200, which reproduces the original prompt exactly).

    Returns:
        A non-empty list of non-blank strings parsed from the model reply.
        If anything goes wrong (API failure, no JSON array in the reply,
        invalid JSON, or no usable strings) the error is printed and a list
        of 40 copies of "Error generating text." is returned instead of
        raising.
    """
    prompt = f"""
    You are simulating a customer database for a bank.
    Generate exactly {num_sentences} unique, realistic customer feedback sentences (10-20 words each).

    Customer Profile: {persona}
    Sentiment:{sentiment}
    
    - If 'Young Adult': Mention apps, crypto, student loans, splitting bills, fast cash. Use casual language.
    - If 'Professional': Mention interest rates, mortgages, wire transfers, business accounts. Use formal language.
    - If 'Retiree': Mention branches, pensions, passwords, phone support, safety. Use polite but confused language.

    - If Sentiment is 'Positive': Praise speed, features, or service.
    - If Sentiment is 'Negative': Complain about fees, crashes, delays, or rejections.

    Output format: A raw JSON list of strings. Example: ["text1", "text2"]
    Do not add Markdown formatting or extra text.
    """

    try:
        response = model.generate_content(prompt)
        raw_text = response.text

        # The reply may still be wrapped in Markdown fences or surrounded by
        # extra text despite the instructions, so parse only the outermost
        # JSON array instead of the whole reply.
        array_start = raw_text.find("[")
        array_end = raw_text.rfind("]")
        if array_start == -1 or array_end < array_start:
            raise ValueError("no JSON array found in the model response")
        parsed = json.loads(raw_text[array_start:array_end + 1])

        # Keep only real, non-blank strings so that nothing else (numbers,
        # nested objects, empty entries) can end up as feedback text.
        sentences = [
            item for item in parsed
            if isinstance(item, str) and item.strip()
        ]
        if not sentences:
            raise ValueError("model response contained no usable sentences")
        return sentences
    except Exception as e:
        # Best-effort boundary: any failure degrades to placeholder text so
        # the rest of the notebook can still run.
        print(f" Error fetching from Gemini: {e}")
        return ["Error generating text."] * 40


# Per-persona, per-sentiment cache skeleton. Nothing in the code visible here
# fills or reads it; the fetched pools are stored in the variables below.
data_cache = {
    persona_key: {"positive": [], "negative": []}
    for persona_key in ("young_adult", "professional", "retiree")
}

# Fetch one positive and one negative sentence pool for each persona.
# get_feedback_text reads these module-level lists.

# --- Young Adult ---
ya_positive = generate_customer_feedback(persona="Young Adult", sentiment="Positive")
ya_negative = generate_customer_feedback(persona="Young Adult", sentiment="Negative")

# --- Professional ---
pro_positive = generate_customer_feedback(persona="Professional", sentiment="Positive")
pro_negative = generate_customer_feedback(persona="Professional", sentiment="Negative")

# --- Retiree ---
ret_positive = generate_customer_feedback(persona="Retiree", sentiment="Positive")
ret_negative = generate_customer_feedback(persona="Retiree", sentiment="Negative")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def get_feedback_text(persona, is_default):
    """
    Pick one feedback sentence for a customer.

    Customers flagged as defaulting (is_default == 1) draw from the negative
    pool of their persona and everyone else from the positive pool, except
    that the sentiment is flipped whenever a random draw falls below 0.10.
    An unrecognised persona, or an empty pool, yields "Service was okay.".
    """
    # Sentiment implied by the default flag.
    wants_negative = is_default == 1

    # Noise: occasionally give a customer the opposite sentiment.
    noise_roll = random.random()
    if noise_roll < 0.10:
        wants_negative = not wants_negative

    # Resolve the module-level sentence pool for this persona and sentiment.
    if persona == "young_adult":
        pool = ya_negative if wants_negative else ya_positive
    elif persona == "professional":
        pool = pro_negative if wants_negative else pro_positive
    elif persona == "retiree":
        pool = ret_negative if wants_negative else ret_positive
    else:
        pool = []

    # Draw from the pool when there is something to draw from.
    if pool:
        return random.choice(pool)
    return "Service was okay."

In [None]:
# SIMULATE DATASET
# Shared Faker instance; generate_data uses it for customer profiles and
# unique customer ID numbers.
fake = Faker()
# Seed Faker with a fixed value, presumably so Faker-generated fields repeat
# between runs.
# NOTE(review): this call does not seed `random` or `np.random`, which
# generate_data also draws from - confirm whether full reproducibility is
# expected.
Faker.seed(42)

def generate_data(num_records=1000, seed=None):
    """
    Simulate bank customers, each with a loan request and a default flag.

    Every record gets a persona derived from a random age (under 25:
    "young_adult", 25-59: "professional", 60 and over: "retiree"). The
    persona drives the distributions used for income, spending score, credit
    score and account balance. A default probability is built from the credit
    score, the loan-to-income ratio and the persona, and the feedback sentence
    is chosen by get_feedback_text from the persona and the default flag.

    Args:
        num_records: Number of customer rows to generate.
        seed: Optional seed applied to `random` and `np.random` before
            generating. The default (None) leaves both generators untouched.
            Faker-derived fields (customer ID, gender, professional job) are
            not affected by this argument; they follow the module-level
            Faker instance.

    Returns:
        A pandas DataFrame with the columns Customer_ID, Age, Gender, Persona,
        Job, Annual_Income, Credit_Score, Account_Balance, Spending_Score,
        Loan_Amount, Loan_Term_Months, Loan_Default and Customer_Feedback.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    # Smallest loan that can be requested, and the largest share of annual
    # income a loan may represent.
    min_loan = 2000
    max_loan_to_income = 0.6
    # Income is drawn from an unbounded normal distribution. Without a floor,
    # a low or negative draw pushes the loan cap below the loan minimum
    # (random.uniform then silently swaps its bounds) and makes the
    # loan-to-income ratio meaningless.
    min_income = min_loan / max_loan_to_income

    # Per-persona parameters: (mean, std) for the normal draws and
    # (low, high) for the uniform accumulation factor.
    persona_params = {
        "young_adult": {
            "income": (15000, 5000),
            "spend": (70, 10),  # High spend relative to income
            "credit": (600, 80),
            "savings": (0.05, 0.02),
            "accumulation": (0.1, 1.0),
        },
        "professional": {
            "income": (85000, 25000),
            "spend": (50, 15),
            "credit": (720, 50),
            "savings": (0.20, 0.05),
            "accumulation": (1.0, 3.0),
        },
        "retiree": {
            "income": (45000, 10000),
            "spend": (30, 10),  # Low spend
            "credit": (680, 60),
            "savings": (0.08, 0.04),
            "accumulation": (2.0, 5.0),
        },
    }

    data = []

    for _ in range(num_records):
        # --- Demographics ---
        profile = fake.profile()
        customer_id = f"CUST-{fake.unique.random_int(min=100000, max=999999)}"
        gender = profile['sex']
        age = random.randint(18, 80)

        # Simulate 3 customer personas: Young Adult, Professional, Retiree
        if age < 25:
            persona = "young_adult"
            # NOTE(review): the literal string "None" may be read back as a
            # missing value by CSV readers with default NA handling - confirm.
            job = np.random.choice(["Student", "Intern", "Part-time Server", "Barista", "None"])
        elif age < 60:
            persona = "professional"
            job = profile['job']
        else:
            persona = "retiree"
            job = "retired"

        params = persona_params[persona]
        income = max(min_income, np.random.normal(*params["income"]))
        spend_score = np.random.normal(*params["spend"])

        # --- Transactional Numerics ---
        # Credit score, clamped to the 300-850 range
        credit_score = int(np.random.normal(*params["credit"]))
        credit_score = max(300, min(850, credit_score))

        # Account balance: a clamped share of income scaled by how long the
        # persona has had to accumulate savings, plus a random multiplier
        savings_rate = np.random.normal(*params["savings"])
        accumulation_factor = random.uniform(*params["accumulation"])
        savings_rate = max(0.01, min(0.50, savings_rate))
        account_balance = (income * savings_rate) * accumulation_factor * random.uniform(0.1, 1.5)

        # --- Loan Request Details ---
        # max() guards against float rounding leaving the cap just below
        # the minimum when income sits exactly on the floor.
        max_possible_loan = max(min_loan, income * max_loan_to_income)
        loan_amount = round(random.uniform(min_loan, max_possible_loan), 2)
        loan_term = random.choice([12, 24, 36, 60])

        # --- Default Risk ---
        risk_probability = 0.1  # Base 10% risk

        # Rule 1: Low credit score increases risk significantly
        if credit_score < 580:
            risk_probability += 0.40
        elif credit_score < 650:
            risk_probability += 0.20

        # Rule 2: High loan-to-income ratio increases risk
        loan_to_income = loan_amount / income
        if loan_to_income > 0.4:
            risk_probability += 0.25

        # Rule 3: Additional risk for young adults
        if persona == "young_adult":
            risk_probability += 0.05

        # Cap probability at 0.95
        risk_probability = min(0.95, risk_probability)

        is_default = 1 if random.random() < risk_probability else 0

        # --- Feedback Text Column ---
        feedback_text = get_feedback_text(persona, is_default)

        data.append({
            "Customer_ID": customer_id,
            "Age": age,
            "Gender": gender,
            "Persona": persona,
            "Job": job,
            "Annual_Income": round(income, 2),
            "Credit_Score": credit_score,
            "Account_Balance": round(account_balance, 2),
            "Spending_Score": round(spend_score, 2),
            "Loan_Amount": loan_amount,
            "Loan_Term_Months": loan_term,
            "Loan_Default": is_default,
            "Customer_Feedback": feedback_text,
        })

    return pd.DataFrame(data)

# Generate and View
# Build the 1000-row simulated dataset, then print its first rows as a
# quick visual check.
df_customers = generate_data(num_records=1000)
print(df_customers.head())

   Customer_ID  Age Gender       Persona                                  Job  \
0  CUST-279451   43      F  professional          Clinical research associate   
1  CUST-160738   32      M  professional  Social research officer, government   
2  CUST-292401   20      F   young_adult                                 None   
3  CUST-681542   51      M  professional          Health promotion specialist   
4  CUST-266931   55      F  professional                Hydrographic surveyor   

   Annual_Income  Credit_Score  Account_Balance  Spending_Score  Loan_Amount  \
0      119316.26           820         31514.82           64.18      8965.07   
1       86244.23           693         23638.00           53.99     18447.09   
2       17903.26           650           104.06           67.95      3681.92   
3       67501.25           735         11488.98           43.02     35908.24   
4      117625.97           694         35947.21           54.91     31072.81   

   Loan_Term_Months  Loan_Defaul

In [12]:
# Save to CSV
# The output path is bound once so the file written and the confirmation
# message cannot drift apart.
output_path = 'financial_data_simulated.csv'
# index=False keeps the DataFrame's row index out of the file.
df_customers.to_csv(output_path, index=False)
print(f"Dataset saved to '{output_path}'")

Dataset saved to 'financial_data_simulated.csv'
