<a href="https://colab.research.google.com/github/ShuHuiK/WIE3007_Group_Assignment/blob/ShuHui/STEP_1_Dataset_Simulation_%26_Feature_Engineering_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **1 - Dataset Simulation & Feature Engineering**

In [2]:
!pip install faker
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import openai  # or your preferred LLM API
from transformers import pipeline

# Initialise the fake-data generator and pin every random source used by the
# simulation so that re-running the cell reproduces the same dataset.
fake = Faker()
Faker.seed(42)      # seeds Faker's shared RNG (used by fake.city / fake.date_between)
np.random.seed(42)  # seeds NumPy's global RNG (used by every np.random.* draw)
# NOTE: dates drawn relative to 'today' still depend on the day the cell is run.

# Number of synthetic financial records to generate
n_records = 1000

def simulate_financial_dataset(n=1000):
    """Simulate ``n`` synthetic customer/loan records.

    Numeric and categorical fields are drawn from NumPy's global random
    generator, free-text fields come from ``generate_feedback`` and
    ``generate_transaction_desc``, and ``location`` /
    ``last_interaction_date`` come from the module-level ``fake`` Faker
    instance.

    Args:
        n: Number of records (rows) to generate.

    Returns:
        A ``pandas.DataFrame`` with one row per record and 20 columns: the 19
        keys of the record literal below plus ``interest_rate``.
        ``risk_level`` and ``sentiment_score`` are left as ``None``.
    """
    data = []

    for i in range(n):
        record = {
            'customer_id': f"CUST{10000 + i}",
            # randint's upper bound is exclusive: ages are 18..69
            'age': np.random.randint(18, 70),
            # A normal draw (mean 75k, sd 25k) occasionally falls below zero;
            # floor it at 0 because a negative income is not meaningful.
            'income': max(0.0, np.random.normal(75000, 25000)),
            # randint's upper bound is exclusive: scores are 300..849
            'credit_score': np.random.randint(300, 850),
            'account_balance': np.random.exponential(5000),
            'loan_amount': np.random.uniform(1000, 500000),
            'loan_duration': np.random.choice([12, 24, 36, 60, 84]),
            'employment_years': np.random.exponential(5),
            # Beta(2, 5) lies in [0, 1]; scaling keeps the ratio within [0, 0.8]
            'debt_to_income': np.random.beta(2, 5) * 0.8,
            'transaction_frequency': np.random.poisson(30),
            'business_sector': np.random.choice(['Technology', 'Healthcare', 'Finance', 'Retail', 'Manufacturing', 'Services']),
            'company_size': np.random.choice(['Small', 'Medium', 'Large']),
            # Sampled from a fixed list of canned sentences (no LLM call here)
            'customer_feedback': generate_feedback(),
            'transaction_description': generate_transaction_desc(),
            'location': fake.city(),
            'last_interaction_date': fake.date_between(start_date='-2y', end_date='today'),
            # 1 = has defaulted before, drawn with probability 0.15
            'default_history': np.random.choice([0, 1], p=[0.85, 0.15]),
            'risk_level': None,  # Placeholder: to be filled by LLM
            'sentiment_score': None  # Placeholder: to be filled by LLM
        }

        # Add realistic correlations: better credit scores get lower rates
        if record['credit_score'] > 700:
            record['interest_rate'] = np.random.uniform(3.5, 5.5)
        else:
            record['interest_rate'] = np.random.uniform(6.0, 12.5)

        data.append(record)

    return pd.DataFrame(data)

def generate_feedback():
    """Return one canned customer-feedback sentence picked at random.

    No LLM is called: the sentence is sampled (uniformly, via
    ``np.random.choice``) from the fixed pool below, so the result follows
    NumPy's global random state.
    """
    canned_feedback = (
        "Excellent service, very satisfied with the loan process.",
        "Faced some delays in approval, but overall okay.",
        "High interest rates compared to market standards.",
        "Customer support needs improvement.",
        "Smooth transaction, would recommend to others.",
        "Fees are too high for the services provided.",
        "Very professional and efficient handling of my account.",
        "Had issues with online banking platform.",
        "Quick loan disbursement, appreciated the efficiency.",
        "Not satisfied with the hidden charges.",
    )
    picked = np.random.choice(canned_feedback)
    return picked

def generate_transaction_desc():
    """Return one transaction description picked at random.

    The description is sampled (uniformly, via ``np.random.choice``) from the
    fixed pool below, so the result follows NumPy's global random state.
    """
    description_pool = (
        "Monthly salary credit",
        "Online shopping - Amazon",
        "Grocery purchase at Walmart",
        "Loan EMI payment",
        "Credit card bill payment",
        "Utility bill payment",
        "Stock market investment",
        "Insurance premium",
        "Restaurant dining",
        "Fuel payment at gas station",
    )
    picked = np.random.choice(description_pool)
    return picked

# Generate the dataset. The size comes from n_records (set during
# initialisation) so the record count is configured in exactly one place.
df = simulate_financial_dataset(n_records)
print(f"Dataset shape: {df.shape}")
print(df.head())

Collecting faker
  Downloading faker-39.0.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-39.0.0-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 23.5 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-39.0.0




Dataset shape: (1000, 20)
  customer_id  age        income  credit_score  account_balance  \
0   CUST10000   56  61244.137771           371      4564.712769   
1   CUST10001   61  64695.287988           774      4731.854369   
2   CUST10002   26  94479.815863           727      3417.736141   
3   CUST10003   64  26008.246903           352      4418.526815   
4   CUST10004   22  80289.675308           327      9942.647858   

     loan_amount  loan_duration  employment_years  debt_to_income  \
0   78853.301581             36          0.526663        0.357327   
1   70607.436465             60         18.201498        0.191831   
2   18159.872036             12          1.497289        0.085103   
3  482662.398325             60          3.912035        0.282604   
4  312025.765287             24          0.501368        0.197715   

   transaction_frequency business_sector company_size  \
0                     19        Services       Medium   
1                     37      Technology  