<a href="https://colab.research.google.com/github/ShuHuiK/WIE3007_Group_Assignment/blob/ShuHui/STEP_1_Dataset_Simulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **STEP 1: DATASET SIMULATION**

In [8]:
!pip install faker
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime
from google.colab import files

# Initialize
fake = Faker('en_US')
# Seed BOTH random sources so the whole dataset is reproducible:
# NumPy drives every numeric draw, while Faker draws `fake.state()` from its
# own shared generator, which `np.random.seed` does not touch.
Faker.seed(42)
np.random.seed(42)

def _clamp(value, low, high):
    """Return *value* limited to the closed interval [low, high]."""
    return max(low, min(high, value))


def generate_comprehensive_dataset(
    n_records: int = 1500,
    file_name: str = "financial_data_2025_assignment.csv",
) -> "tuple[pd.DataFrame, str]":
    """Simulate a synthetic 2025 customer-credit dataset and save it as CSV.

    Each record is drawn from NumPy's global random state (plus the
    module-level ``fake`` Faker instance for the US state name), so results
    are reproducible only if the caller seeds those generators beforehand.

    Args:
        n_records: Number of customer records to generate. ``0`` yields an
            empty frame that still carries the full column schema.
        file_name: Path of the CSV file to write (without the index).

    Returns:
        A ``(df, file_name)`` tuple: the generated DataFrame and the path the
        CSV was written to.
    """
    print(f"🚀 Generating {n_records} records for the 2025 Sterling Financial Dataset")

    # Per-sector multipliers: 'inc' scales income, 'risk' scales the risk
    # metrics (and lowers the mean credit score), 'save' is the base savings
    # ratio.
    sectors = {
        'Technology': {'inc': 1.4, 'risk': 1.1, 'save': 0.08},
        'Healthcare': {'inc': 1.3, 'risk': 0.8, 'save': 0.15},
        'Finance': {'inc': 1.5, 'risk': 0.9, 'save': 0.20},
        'Retail': {'inc': 0.8, 'risk': 1.3, 'save': 0.05},
        'Manufacturing': {'inc': 1.1, 'risk': 1.0, 'save': 0.12},
        'Services': {'inc': 0.9, 'risk': 1.1, 'save': 0.10},
        'Real Estate': {'inc': 1.2, 'risk': 1.2, 'save': 0.07},
        'Education': {'inc': 0.8, 'risk': 0.7, 'save': 0.18},
    }
    sector_names = list(sectors.keys())

    # Loop-invariant: the pool of canned feedback messages.
    templates = ["Excellent rates", "Process was slow", "High fees", "Standard banking", "Quick approval"]

    # Explicit schema so that an empty run (n_records == 0) still produces a
    # frame with every column instead of a column-less DataFrame.
    columns = [
        'date', 'customer_id', 'location', 'business_sector', 'age', 'income',
        'credit_score', 'savings_ratio', 'loan_amount', 'debt_to_income',
        'credit_utilization', 'payment_punctuality', 'customer_feedback',
        'default_history',
    ]

    # NOTE: the order of the random draws below is deliberately unchanged so
    # that a given seed keeps producing the same dataset.
    data = []
    for i in range(n_records):
        # 1. Sector Selection
        sector = np.random.choice(sector_names)
        w = sectors[sector]

        # 2. Temporal & Geography
        # NOTE(review): randint's upper bound is exclusive, so days are 1-27
        # (day 28 never occurs). Kept as-is to preserve the seeded output.
        record_date = datetime(2025, np.random.randint(1, 13), np.random.randint(1, 28))
        location = f"{fake.state()}, US"

        # 3. Sector-Linked Financials
        income = _clamp(np.random.lognormal(11, 0.4) * w['inc'], 25000, 300000)
        credit_score = int(_clamp(np.random.normal(700 - (w['risk'] * 20), 70), 300, 850))
        age = _clamp(int(np.random.normal(42, 12)), 18, 75)
        # Loan is capped at 3.5x income.
        loan_amount = round(min(income * 3.5, np.random.exponential(45000)), 2)

        # Savings ratio = sector base + Gaussian noise, limited to [0.01, 0.40].
        savings_ratio = _clamp(w['save'] + np.random.normal(0, 0.05), 0.01, 0.40)

        # Performance Metrics (all scaled by the sector risk weight)
        debt_to_income = round(np.random.beta(2, 5) * 0.5 * w['risk'], 3)
        credit_utilization = round(min(0.99, np.random.uniform(0.1, 0.8) * w['risk']), 2)
        payment_punctuality = round(_clamp(np.random.uniform(85, 100) - (w['risk'] * 5), 0, 100), 2)

        # 4. Feedback: about 5% of records get no feedback (filled in later).
        feedback = np.random.choice(templates) if np.random.random() > 0.05 else None

        # 5. Target Variable (Default History)
        # Logistic model: lower credit score and higher debt-to-income raise
        # the default probability; a higher savings ratio lowers it.
        logit = -4.0 + (800 - credit_score) * 0.01 + debt_to_income * 5 - savings_ratio * 10
        default_prob = 1 / (1 + np.exp(-logit))
        default = 1 if np.random.random() < default_prob else 0

        data.append({
            'date': record_date.strftime('%Y-%m-%d'),
            'customer_id': f"CUST-{2025000 + i}",
            'location': location,
            'business_sector': sector,
            'age': age,
            'income': round(income, 2),
            'credit_score': credit_score,
            'savings_ratio': round(savings_ratio, 3),
            'loan_amount': loan_amount,
            'debt_to_income': debt_to_income,
            'credit_utilization': credit_utilization,
            'payment_punctuality': payment_punctuality,
            'customer_feedback': feedback,
            'default_history': default,
        })

    df = pd.DataFrame(data, columns=columns)
    df['customer_feedback'] = df['customer_feedback'].fillna("Neutral: No feedback provided.")

    df.to_csv(file_name, index=False)

    return df, file_name

# Execute: build the 1500-record dataset (also writes the CSV).
df, filename = generate_comprehensive_dataset(n_records=1500)

# --- DISPLAY ---
# Pandas display options: no column limit, 1200-char width, 40-char cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1200)
pd.set_option('display.max_colwidth', 40)

# Preview the first ten records on a copy, so `df` keeps its numeric values.
display_df = df.head(10).copy()

# Formatting for display: column name -> number-to-text formatter.
_display_formats = {
    'income': lambda amount: f"${amount:,.0f}",
    'loan_amount': lambda amount: f"${amount:,.0f}",
    'savings_ratio': lambda ratio: f"{ratio*100:.1f}%",
    'debt_to_income': lambda ratio: f"{ratio*100:.1f}%",
    'payment_punctuality': lambda percent: f"{percent:.1f}%",
}
for _column, _formatter in _display_formats.items():
    display_df[_column] = display_df[_column].apply(_formatter)

# Column order used for the printed preview.
cols_to_show = [
    'date',
    'customer_id',
    'location',
    'business_sector',
    'age',
    'income',
    'credit_score',
    'savings_ratio',
    'loan_amount',
    'debt_to_income',
    'credit_utilization',
    'payment_punctuality',
    'customer_feedback',
    'default_history',
]

# Banner: a 230-character rule above and below a centred title.
_rule = "=" * 230
print("\n" + _rule)
print(f"{'2025 STERLING FINANCIAL DATASET ':^230}")
print(_rule)
print(display_df[cols_to_show].to_string(index=False))
print(_rule)

🚀 Generating 1500 records or 2025 Sterling Financial Dataset

                                                                                                   2025 STERLING FINANCIAL DATASET                                                                                                    
      date  customer_id           location business_sector  age  income  credit_score savings_ratio loan_amount debt_to_income  credit_utilization payment_punctuality              customer_feedback  default_history
2025-04-15 CUST-2025000        Indiana, US     Real Estate   39 $93,097           782          5.8%      $2,693          13.3%                0.93               91.5%               Standard banking                0
2025-10-22 CUST-2025001  West Virginia, US          Retail   38 $25,000           716          4.2%     $43,351          18.7%                0.75               85.3% Neutral: No feedback provided.                0
2025-02-26 CUST-2025002      Louisiana, US        Services 