In [0]:
# Imports
import pandas as pd
import numpy as np 
import random 
from datetime import datetime, timedelta

In [0]:
''' Laying out the database schema and generating sample data to use for the models

TABLE 1: Customers
COLUMNS:
- Id
- First Name
- Last Name
- Age
- Location
- Annual Income
- Debt-To-Income Ratio (DTI)
- Loan-to-Value Ratio (LTV) 
- Credit Score

TABLE 2: Transactions 
COLUMNS: 
- Customer ID 
- Transaction Date
- Amount 
- Recipient
- Device Type
'''

In [0]:
# Create database and tables

# Names of the database and of the tables that the cells in this notebook read and write.
database_name = "banking_database"
customers_table = "customers"
transaction_table = "transactions"
bank_table = "bank"  # NOTE(review): not referenced by any cell visible here - confirm it is still needed

# IF NOT EXISTS makes this statement safe to re-run.
# `spark` is not defined in this notebook; presumably the session object supplied by the runtime - TODO confirm.
spark.sql(f"CREATE DATABASE IF NOT EXISTS `{database_name}`")

In [0]:
# sample data for customers table
# Column-oriented records: each key is one column, and position i across all
# lists describes the same customer.
customers = {
    'id': [1, 2, 3, 4, 5],
    'first_name': ['Mitch', 'Auston', 'William', 'Joseph', 'John'],
    'last_name' : ['Marner', 'Matthews', 'Nylander', 'Woll', 'Tavares'],
    'age': [28, 27, 29, 26, 34],
    # NOTE(review): both 'CA' and 'Canada' appear below - if they denote the same
    # place, one spelling should be used; confirm intent.
    'location': ['CA', 'USA', 'CA', 'USA', 'Canada'],
    'annual_income': [200000, 350000, 150000, 90000, 180000],
    'dti': [0.5, 0.4, 0.7, 0.6, 0.8],  # debt-to-income ratio
    'ltv': [0.58, 0.33, 0.66, 0.8, 0.75],  # loan-to-value ratio
    'credit_score': [700, 900, 600, 450, 540]
}
# Materialise the sample records as a pandas DataFrame.
customers_df = pd.DataFrame(customers)

# sample data for transactions table

# transactions = {
# 'customer_id': [1, 2, 3, 4, 5],
# 'date' : ['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04', '2025-01-05'],
# 'amount': [100, 200, 300, 9999, 500],   
# 'recipient': ['Walmart', 'Tim Hortons', 'Nike', 'Unknown', 'Walmart'],
# 'device_type': ['Credit Card', 'Credit Card', 'Online', 'Unknown', 'Credit Card'],
# }
#transactions_df = pd.DataFrame(transactions)

In [0]:
def create_sql_table(database_name, table_name, df):
    """Write a pandas DataFrame to the Delta table ``database_name.table_name``.

    The frame is converted with ``spark.createDataFrame`` and saved in
    ``overwrite`` mode, after which a confirmation line is printed.

    Args:
        database_name: Name of the database that holds the table.
        table_name: Name of the table to write.
        df: pandas DataFrame with the rows to store.

    Relies on a global ``spark`` object that is not defined in this cell.
    """
    delta_writer = spark.createDataFrame(df).write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(f"{database_name}.{table_name}")
    print(f"\nManaged Delta table '{database_name}.{table_name}' created successfully!")


In [0]:
# Persist the sample customers as the customers table.
create_sql_table(
    database_name=database_name,
    table_name=customers_table,
    df=customers_df,
)

In [0]:
# Utility Function: Generate Synthetic Transaction Data 
def generate_transaction_data(
    num_data_points: int,
    num_days: int,
    mean_amount: float,
    std_dev_amount: float,
    recipients_list: list,
    device_types_list: list,
    customer_id: int,
    transactions: list,
):
    """Append synthetic transactions for one customer to ``transactions``.

    Args:
        num_data_points: Number of transactions to generate.
        num_days: Length of the look-back window, in days, ending at the
            current time.
        mean_amount: Mean of the normal distribution amounts are drawn from.
        std_dev_amount: Standard deviation of that distribution.
        recipients_list: Candidate recipients; one is picked per transaction.
        device_types_list: Candidate device types; one is picked per transaction.
        customer_id: Customer the generated transactions belong to.
        transactions: List that receives the new records. It is extended in
            place and also returned.

    Returns:
        The same ``transactions`` list, with ``num_data_points`` dicts appended.
        Each dict has the keys 'Customer ID', 'Transaction Date', 'Amount',
        'Recipient' and 'Device Type'.
    """
    window_end = datetime.now()
    window_start = window_end - timedelta(days=num_days)
    window_seconds = num_days * 24 * 3600

    for _ in range(num_data_points):
        # Uniformly random timestamp inside the look-back window.
        offset = timedelta(seconds=random.uniform(0, window_seconds))

        # Normally distributed amount, floored at 5.0.
        drawn_amount = np.random.normal(loc=mean_amount, scale=std_dev_amount)
        amount = drawn_amount if drawn_amount > 5.0 else 5.0

        record = {
            'Customer ID': customer_id,
            'Transaction Date': window_start + offset,
            'Amount': round(amount, 2),
            'Recipient': random.choice(recipients_list),
            'Device Type': random.choice(device_types_list),
        }
        transactions.append(record)

    return transactions

In [0]:
def introduce_fraud(
    transactions_df: pd.DataFrame,
    fraud_percentage: float = 0.05, # Percentage of transactions to mark as fraudulent
    fraud_amount_min: float = 2000.0,
    fraud_amount_max: float = 5000.0
) -> pd.DataFrame:
    """Return a copy of ``transactions_df`` with a random subset turned into fraud.

    ``int(len(transactions_df) * fraud_percentage)`` rows are chosen without
    replacement. For each chosen row 'Amount' is replaced by a value drawn
    uniformly from [fraud_amount_min, fraud_amount_max] and rounded to two
    decimals, and 'Recipient' and 'Device Type' are set to "Unknown". A boolean
    'is_fraudulent' column marks the chosen rows. The input frame is not modified.

    Rows are addressed by position, so the frame may carry any index
    (filtered, re-ordered, non-integer or with duplicate labels).

    Args:
        transactions_df: Frame with 'Amount', 'Recipient' and 'Device Type' columns.
        fraud_percentage: Fraction of rows to convert, e.g. 0.05 for 5%.
        fraud_amount_min: Lower bound of the fraudulent amount.
        fraud_amount_max: Upper bound of the fraudulent amount.

    Returns:
        The modified copy, including the 'is_fraudulent' column.

    Raises:
        ValueError: If the resulting number of fraudulent rows is negative or
            larger than the number of rows.
    """
    df_copy = transactions_df.copy()
    num_transactions = len(df_copy)
    num_fraudulent = int(num_transactions * fraud_percentage)

    if not 0 <= num_fraudulent <= num_transactions:
        raise ValueError(
            f"fraud_percentage={fraud_percentage} selects {num_fraudulent} of "
            f"{num_transactions} transactions; expected a fraction between 0 and 1"
        )

    df_copy['is_fraudulent'] = False # Initialize all as False

    if num_fraudulent == 0:
        # Nothing to convert. Only warn when the caller asked for some fraud
        # but the frame is too small for the requested fraction.
        if fraud_percentage > 0:
            print("Warning")
        return df_copy

    # Randomly select row POSITIONS (0..n-1) to mark as fraudulent
    fraud_positions = np.random.choice(num_transactions, num_fraudulent, replace=False)
    fraud_amounts = np.random.uniform(fraud_amount_min, fraud_amount_max, num_fraudulent).round(2)

    # .iloc is positional; .loc would treat the positions as index labels and
    # fail (or touch the wrong rows) whenever the index is not 0..n-1.
    column_position = df_copy.columns.get_loc
    df_copy.iloc[fraud_positions, column_position('Amount')] = fraud_amounts
    df_copy.iloc[fraud_positions, column_position('Recipient')] = "Unknown"
    df_copy.iloc[fraud_positions, column_position('Device Type')] = "Unknown"
    df_copy.iloc[fraud_positions, column_position('is_fraudulent')] = True

    return df_copy

In [0]:
# Smoke-test the two generators on a small throwaway sample. `test_df` was not
# defined by any earlier cell, so this cell raised NameError on a fresh kernel.
test_df = introduce_fraud(
    pd.DataFrame(
        generate_transaction_data(
            20, 30, 300, 50, ['Walmart', 'Tim Hortons'], ['Credit Card', 'Online'], 0, []
        )
    ),
    fraud_percentage=0.1,
)
test_df.head()

In [0]:
# Seed both random sources used by the generators so the synthetic amounts,
# recipients and device types are the same on every run. Transaction dates
# still depend on the current time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

TRANSACTIONS_PER_CUSTOMER = 90
HISTORY_DAYS = 30
DEVICE_TYPES = ['Credit Card', 'Online', 'Apple Pay']

# One spending profile per customer: (customer_id, mean_amount, std_dev_amount, recipients).
CUSTOMER_PROFILES = [
    (1, 300, 50, ['Walmart', 'Tim Hortons', 'Nike', 'Esso', 'Presto']),
    (2, 500, 100, ['Whole Foods', 'Tim Hortons', 'Gucci', 'Esso', 'Presto']),
    (3, 250, 75, ['Sobeys', 'Starbucks', 'Prada', 'Esso', 'Presto']),
    (4, 100, 25, ['NoFrills', 'Tim Hortons', 'Joe Fresh', 'Esso', 'Presto']),
    (5, 150, 25, ['NoFrills', 'Tim Hortons', 'Joe Fresh', 'Esso', 'Presto']),
]

# Start from an empty list so re-running this cell never accumulates duplicates.
transactions_list = []
for profile_id, profile_mean, profile_std, profile_recipients in CUSTOMER_PROFILES:
    transactions_list = generate_transaction_data(
        TRANSACTIONS_PER_CUSTOMER,
        HISTORY_DAYS,
        profile_mean,
        profile_std,
        profile_recipients,
        DEVICE_TYPES,
        profile_id,
        transactions_list,
    )

In [0]:
# Collect every generated record into one frame, one row per transaction.
transactions_df = pd.DataFrame(data=transactions_list)

In [0]:
# Inject fraud using the function's default fraction and amount range.
transactions_with_fraud_df = introduce_fraud(transactions_df=transactions_df)

In [0]:
# Remove the ground-truth flag from the frame. errors='ignore' keeps this cell
# re-runnable: without it a second execution raises KeyError because the column
# is already gone.
transactions_with_fraud_df = transactions_with_fraud_df.drop(columns=['is_fraudulent'], errors='ignore')

In [0]:
# Persist the transactions with snake_case column names.
# 'customer_iD' was a typo; the column is now 'customer_id'.
create_sql_table(
    database_name,
    transaction_table,
    transactions_with_fraud_df.rename(
        columns={
            'Customer ID': 'customer_id',
            'Transaction Date': 'transaction_date',
            'Amount': 'amount',
            'Recipient': 'recipient',
            'Device Type': 'device_type',
        }
    ),
)