In [None]:
# Environment setup: upgrade pip itself, then install the notebook's dependencies.
!pip install --upgrade pip
!pip install fosforml numpy pandas matplotlib scikit-learn seaborn python-dateutil
# Replace whatever urllib3 is currently installed with the pinned 1.26.15 release.
!pip uninstall urllib3 -y
!pip install urllib3==1.26.15
# NOTE(review): fosforml is already requested by the multi-package install above;
# this line repeats it.
!pip install fosforml 
!pip install fosforio
!pip install refractio
!pip install refractml

In [None]:
# Additional packages for the notebook.
# NOTE(review): seaborn and scipy each appear twice in this list, and none of
# the packages are version-pinned.
!pip install seaborn scipy xgboost pandas Faker dice-ml tabulate numpy scikit-learn pandas-profiling plotly matplotlib scipy statsmodels seaborn pydantic-settings

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
from scipy.stats.mstats import winsorize
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import joblib
from fosforml import *
from fosforml.constants import MLModelFlavours
import warnings
# Ignore any warning whose message matches "numpy.dtype size changed" or
# "numpy.ufunc size changed"; all other warnings are left untouched.
# NOTE(review): presumably raised when importing packages built against a
# different NumPy version - confirm these are still needed.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
# Initialize Faker
fake = Faker()

# Seed every random source the generation cells draw from (Faker's shared
# generator, NumPy's global RNG and the stdlib `random` module) so that
# repeated runs produce the same pseudo-random sequence.
SEED = 42
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Number of records to generate
num_records = 300000

# Pre-defined values: the categorical vocabularies sampled for each record
transaction_types = ['Online', 'In-Store Purchase', 'ATM Withdrawal', 'Bill Payment', 'Direct Debit', 'Wire Transfer']
merchant_categories = ['Electronics', 'Apparel', 'Restaurants', 'Supermarkets', 'Pharmacies', 'Entertainment', 'Travel', 'Other']
payment_methods = ['Credit Card', 'Debit Card', 'E-Wallet', 'Cash']
customer_segments = ['Individual', 'Business', 'Student', 'Senior']
account_types = ['Checking Account', 'Savings Account', 'Business Account', 'Investment Account']
devices = ['Mobile', 'Desktop', 'Tablet', 'Laptop', 'Other']
transaction_statuses = ['Completed', 'Pending', 'Failed', 'On Hold', 'In Process']
times_of_day = ['Morning', 'Afternoon', 'Evening', 'Night', 'Midnight']
 


In [3]:
# Column-wise accumulator for the generated records: every output column
# starts with its own empty list, in the order the columns appear in the table.
data = {
    column: []
    for column in (
        'CustomerID',
        'TransactionID',
        'AnomalyScore',
        'Time',
        'Transaction Amount',
        'Credit Limit',
        'Age of Account',
        'Frequency of Transactions',
        'Merchant Risk Score',
        'Customer Loyalty Score',
        'Distance from Home Address',
        'Distance from Last Transaction',
        'Time Since Last Transaction',
        'Average Transaction Value',
        'Number of Refunds',
        'Number of Chargebacks',
        'Credit Utilization Ratio',
        'Change in Spending Behavior',
        'Transaction Velocity',
        'Transaction Date',
        'Account Opening Date',
        'Lastlogin',
        'FraudIndicator',
        'SuspiciousFlag',
        'Transaction Type',
        'Merchant Category',
        'Payment Method',
        'Customer Segment',
        'Account Type',
        'Device Used',
        'Time of Day',
        'Day of Week',
        'Transaction Status',
        'Previous Fraud Reports',
        'Cross-border Transaction Indicator',
        'New Device Indicator',
    )
}
 

In [5]:
# Generating Data
# The whole loop is kept in a single cell: a `for` body cannot continue in a
# later cell, so every step below must stay inside this one loop.
for _ in range(num_records):
    # Generate IDs (unique across all records produced by `fake`)
    customer_id = fake.unique.random_int(min=1000, max=999999)
    transaction_id = fake.unique.random_int(min=100000, max=99999999)

    # Generate random data
    anomaly_score = round(np.random.uniform(0, 1), 2)
    time = fake.time(pattern='%H:%M:%S')
    transaction_amount = round(np.random.uniform(10, 10000), 2)
    credit_limit = random.randint(2000, 35000)
    age_of_account = random.randint(1, 30)
    frequency_of_transactions = random.randint(1, 100)
    merchant_risk_score = round(np.random.uniform(0, 100), 2)
    customer_loyalty_score = round(np.random.uniform(0, 100), 2)
    distance_from_home = round(np.random.uniform(0, 100), 2)
    distance_from_last = round(np.random.uniform(0, 100), 2)
    time_since_last = round(np.random.uniform(0, 24), 2)  # Hours since last transaction
    average_transaction_value = round(np.random.uniform(10, 5000), 2)
    number_of_refunds = random.randint(0, 5)
    number_of_chargebacks = random.randint(0, 3)
    credit_utilization_ratio = round(np.random.uniform(0, 1), 2)
    change_in_spending_behavior = round(np.random.uniform(0, 100), 2)
    transaction_velocity = round(np.random.uniform(0, 10), 2)

    # Transaction Date (within the last 7 years)
    transaction_date = fake.date_between(start_date='-7y', end_date='today')
    # The account is opened on or before the transaction date.
    account_opening_date = fake.date_between(start_date='-30y', end_date=transaction_date)
    # Last login falls in the year leading up to the transaction, but never
    # before the account was opened. Anchoring the window to the transaction
    # date keeps start <= end; a fixed '-1y' start lies after any transaction
    # older than one year and makes Faker raise
    # "ValueError: empty range for _rand_seconds".
    lastlogin_start = max(account_opening_date, transaction_date - timedelta(days=365))
    lastlogin = fake.date_between(start_date=lastlogin_start, end_date=transaction_date)

    # Calculate derived fields
    # Hours 06-11 keep the 'Morning' default; every other hour is re-bucketed.
    time_of_day = 'Morning'
    hour = int(time.split(":")[0])
    if 12 <= hour < 17:
        time_of_day = 'Afternoon'
    elif 17 <= hour < 21:
        time_of_day = 'Evening'
    elif 21 <= hour <= 23:
        time_of_day = 'Night'
    elif 0 <= hour < 6:
        time_of_day = 'Midnight'

    day_of_week = transaction_date.strftime('%A')

    # Random selection of other fields
    transaction_type = np.random.choice(transaction_types)
    merchant_category = np.random.choice(merchant_categories)
    payment_method = np.random.choice(payment_methods)
    customer_segment = np.random.choice(customer_segments)
    account_type = np.random.choice(account_types)
    # Cash payments are always recorded with the 'Other' device.
    device_used = 'Other' if payment_method == 'Cash' else np.random.choice(devices)
    transaction_status = np.random.choice(transaction_statuses)

    # Binary flags drawn with fixed class probabilities (e.g. 5% fraud).
    fraud_indicator = np.random.choice([0, 1], p=[0.95, 0.05])
    suspicious_flag = np.random.choice([0, 1], p=[0.98, 0.02])
    previous_fraud_reports = random.randint(0, 2)
    cross_border_indicator = np.random.choice([0, 1], p=[0.9, 0.1])
    new_device_indicator = np.random.choice([0, 1], p=[0.85, 0.15])

    # Append to data lists
    data['CustomerID'].append(customer_id)
    data['TransactionID'].append(transaction_id)
    data['AnomalyScore'].append(anomaly_score)
    data['Time'].append(time)
    data['Transaction Amount'].append(transaction_amount)
    data['Credit Limit'].append(credit_limit)
    data['Age of Account'].append(age_of_account)
    data['Frequency of Transactions'].append(frequency_of_transactions)
    data['Merchant Risk Score'].append(merchant_risk_score)
    data['Customer Loyalty Score'].append(customer_loyalty_score)
    data['Distance from Home Address'].append(distance_from_home)
    data['Distance from Last Transaction'].append(distance_from_last)
    data['Time Since Last Transaction'].append(time_since_last)
    data['Average Transaction Value'].append(average_transaction_value)
    data['Number of Refunds'].append(number_of_refunds)
    data['Number of Chargebacks'].append(number_of_chargebacks)
    data['Credit Utilization Ratio'].append(credit_utilization_ratio)
    data['Change in Spending Behavior'].append(change_in_spending_behavior)
    data['Transaction Velocity'].append(transaction_velocity)
    data['Transaction Date'].append(transaction_date)
    data['Account Opening Date'].append(account_opening_date)
    data['Lastlogin'].append(lastlogin)
    data['FraudIndicator'].append(fraud_indicator)
    data['SuspiciousFlag'].append(suspicious_flag)
    data['Transaction Type'].append(transaction_type)
    data['Merchant Category'].append(merchant_category)
    data['Payment Method'].append(payment_method)
    data['Customer Segment'].append(customer_segment)
    data['Account Type'].append(account_type)
    data['Device Used'].append(device_used)
    data['Time of Day'].append(time_of_day)
    data['Day of Week'].append(day_of_week)
    data['Transaction Status'].append(transaction_status)
    data['Previous Fraud Reports'].append(previous_fraud_reports)
    data['Cross-border Transaction Indicator'].append(cross_border_indicator)
    data['New Device Indicator'].append(new_device_indicator)
 

In [None]:
 
# Destination file for the generated dataset
output_csv = 'generated_fraud_detection_data.csv'

# Turn the column lists into a table
df = pd.DataFrame(data)

# Show the first 10 rows as a quick sanity check
preview = df.head(10)
print(preview)

# Persist the full table, leaving out the positional index column
df.to_csv(output_csv, index=False)
 

In [None]:
# Output columns, in the order they appear in the generated table.
COLUMNS = [
    'CustomerID',
    'TransactionID',
    'AnomalyScore',
    'Time',
    'Transaction Amount',
    'Credit Limit',
    'Age of Account',
    'Frequency of Transactions',
    'Merchant Risk Score',
    'Customer Loyalty Score',
    'Distance from Home Address',
    'Distance from Last Transaction',
    'Time Since Last Transaction',
    'Average Transaction Value',
    'Number of Refunds',
    'Number of Chargebacks',
    'Credit Utilization Ratio',
    'Change in Spending Behavior',
    'Transaction Velocity',
    'Transaction Date',
    'Account Opening Date',
    'Lastlogin',
    'FraudIndicator',
    'SuspiciousFlag',
    'Transaction Type',
    'Merchant Category',
    'Payment Method',
    'Customer Segment',
    'Account Type',
    'Device Used',
    'Time of Day',
    'Day of Week',
    'Transaction Status',
    'Previous Fraud Reports',
    'Cross-border Transaction Indicator',
    'New Device Indicator',
]


def _time_of_day(hour):
    """Map an hour of the day (0-23) to its time-of-day label.

    00-05 -> 'Midnight', 06-11 -> 'Morning', 12-16 -> 'Afternoon',
    17-20 -> 'Evening', 21-23 -> 'Night'.
    """
    if hour < 6:
        return 'Midnight'
    if hour < 12:
        return 'Morning'
    if hour < 17:
        return 'Afternoon'
    if hour < 21:
        return 'Evening'
    return 'Night'


def _generate_record(customer_id, transaction_id):
    """Build one synthetic transaction as a ``{column: value}`` dict.

    Args:
        customer_id: ID stored in the 'CustomerID' column.
        transaction_id: ID stored in the 'TransactionID' column.

    Returns:
        dict with one entry per name in ``COLUMNS``. All values other than
        the two IDs are drawn at random from ``fake``, ``np.random`` and
        ``random``; the categorical values come from the pre-defined lists.
    """
    time = fake.time(pattern='%H:%M:%S')

    # Transaction Date (within the last 7 years)
    transaction_date = fake.date_between(start_date='-7y', end_date='today')
    # The account is opened on or before the transaction date.
    account_opening_date = fake.date_between(start_date='-30y', end_date=transaction_date)
    # Last login falls in the year leading up to the transaction, but never
    # before the account was opened. Anchoring the window to the transaction
    # date keeps start <= end; a fixed '-1y' start lies after any transaction
    # older than one year and makes Faker raise
    # "ValueError: empty range for _rand_seconds".
    lastlogin_start = max(account_opening_date, transaction_date - timedelta(days=365))
    lastlogin = fake.date_between(start_date=lastlogin_start, end_date=transaction_date)

    payment_method = np.random.choice(payment_methods)
    # Cash payments are always recorded with the 'Other' device.
    device_used = 'Other' if payment_method == 'Cash' else np.random.choice(devices)

    return {
        'CustomerID': customer_id,
        'TransactionID': transaction_id,
        'AnomalyScore': round(np.random.uniform(0, 1), 2),
        'Time': time,
        'Transaction Amount': round(np.random.uniform(10, 10000), 2),
        'Credit Limit': random.randint(2000, 35000),
        'Age of Account': random.randint(1, 30),
        'Frequency of Transactions': random.randint(1, 100),
        'Merchant Risk Score': round(np.random.uniform(0, 100), 2),
        'Customer Loyalty Score': round(np.random.uniform(0, 100), 2),
        'Distance from Home Address': round(np.random.uniform(0, 100), 2),
        'Distance from Last Transaction': round(np.random.uniform(0, 100), 2),
        # Hours since last transaction
        'Time Since Last Transaction': round(np.random.uniform(0, 24), 2),
        'Average Transaction Value': round(np.random.uniform(10, 5000), 2),
        'Number of Refunds': random.randint(0, 5),
        'Number of Chargebacks': random.randint(0, 3),
        'Credit Utilization Ratio': round(np.random.uniform(0, 1), 2),
        'Change in Spending Behavior': round(np.random.uniform(0, 100), 2),
        'Transaction Velocity': round(np.random.uniform(0, 10), 2),
        'Transaction Date': transaction_date,
        'Account Opening Date': account_opening_date,
        'Lastlogin': lastlogin,
        # Binary flags drawn with fixed class probabilities (e.g. 5% fraud).
        'FraudIndicator': np.random.choice([0, 1], p=[0.95, 0.05]),
        'SuspiciousFlag': np.random.choice([0, 1], p=[0.98, 0.02]),
        'Transaction Type': np.random.choice(transaction_types),
        'Merchant Category': np.random.choice(merchant_categories),
        'Payment Method': payment_method,
        'Customer Segment': np.random.choice(customer_segments),
        'Account Type': np.random.choice(account_types),
        'Device Used': device_used,
        'Time of Day': _time_of_day(int(time.split(":")[0])),
        'Day of Week': transaction_date.strftime('%A'),
        'Transaction Status': np.random.choice(transaction_statuses),
        'Previous Fraud Reports': random.randint(0, 2),
        'Cross-border Transaction Indicator': np.random.choice([0, 1], p=[0.9, 0.1]),
        'New Device Indicator': np.random.choice([0, 1], p=[0.85, 0.15]),
    }


# Generate IDs
# IDs are sampled without replacement, so each one is unique. The customer
# range must hold at least `num_records` values: the earlier 1000-9999 range
# offered only 9,000 IDs for 300,000 records. Sampling up front also avoids
# relying on `fake.unique`, which remembers previously issued values between
# re-runs of the cell.
customer_ids = random.sample(range(1000, 1000000), num_records)
transaction_ids = random.sample(range(100000, 100000000), num_records)

# Lists to hold the generated data
data = {column: [] for column in COLUMNS}

# Generating Data
for customer_id, transaction_id in zip(customer_ids, transaction_ids):
    record = _generate_record(customer_id, transaction_id)
    # Append to data lists
    for column in COLUMNS:
        data[column].append(record[column])

# Create DataFrame
df = pd.DataFrame(data)

# Display first 10 rows
print(df.head(10))

# Save to CSV
df.to_csv('generated_fraud_detection_data.csv', index=False)
 