# Data Preprocessing and Migration

This notebook preprocesses data from CSV and JSON files and loads it into a MySQL database (`finance`). It populates the `users`, `mcc_codes`, `cards`, `transactions`, and `fraud_labels` tables.

In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Connection settings are read from the environment so that no credential is
# stored in the notebook. The defaults for user, host and database match the
# values this notebook has always connected with (root@localhost/finance).
# NOTE(review): a password was previously written in plain text in this cell;
# it should be considered exposed and rotated.
DB_USER = os.environ.get('FINANCE_DB_USER', 'root')
DB_HOST = os.environ.get('FINANCE_DB_HOST', 'localhost')
DB_NAME = os.environ.get('FINANCE_DB_NAME', 'finance')

# Fall back to an interactive prompt when FINANCE_DB_PASSWORD is not set.
# A non-interactive "Run All" therefore needs the variable to be exported.
DB_PASSWORD = os.environ.get('FINANCE_DB_PASSWORD') or getpass(f'MySQL password for {DB_USER}: ')

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# be misread as URL delimiters inside the user name or password.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}'
)

In [None]:
# POPULATE USERS TABLE

# Columns holding monetary amounts that are stored as numbers in the database.
MONEY_COLUMNS = ['per_capita_income', 'yearly_income', 'total_debt']

df = pd.read_csv('/home/bnguyen/Desktop/finance_analytics/datasets/users_data.csv')

# Preprocess monetary columns by removing '$' and converting to numeric.
for column in MONEY_COLUMNS:
    # regex=False: '$' is a regex metacharacter (end-of-string anchor), so the
    # literal match is requested explicitly instead of depending on the
    # default of `regex`, which differs between pandas versions.
    without_dollar = df[column].astype(str).str.replace('$', '', regex=False)
    # errors='coerce' turns every value that is still not numeric into NaN.
    df[column] = pd.to_numeric(without_dollar, errors='coerce')

# Rename column 'id' to 'client_id' to match the database.
df = df.rename(columns={'id': 'client_id'})

# if_exists='append' inserts into the existing `users` table; running this
# cell again sends the same rows to the database a second time.
df.to_sql('users', con=engine, if_exists='append', index=False, chunksize=60000)

In [None]:
# POPULATE MCC_CODES TABLE
import json

# Read the JSON object; each key/value pair becomes one (mcc, merchant_type) row.
with open('/home/bnguyen/Desktop/finance_analytics/datasets/mcc_codes.json') as mcc_file:
    mcc_dict = json.load(mcc_file)

mcc_rows = [(code, merchant_type) for code, merchant_type in mcc_dict.items()]

# JSON object keys are always strings, so the code column is cast to int.
df_mcc = pd.DataFrame(mcc_rows, columns=['mcc', 'merchant_type']).astype({'mcc': int})

# Append the rows to the existing `mcc_codes` table.
df_mcc.to_sql('mcc_codes', con=engine, if_exists='append', index=False)

In [None]:
# POPULATE CARDS TABLE
df = pd.read_csv('/home/bnguyen/Desktop/finance_analytics/datasets/cards_data.csv')

# Rename column 'id' to 'card_id' to match the database.
df = df.rename(columns={'id': 'card_id'})

# Remove '$' and convert to numeric. regex=False because '$' is a regex
# metacharacter and a literal match is intended; values that are still not
# numeric become NaN (errors='coerce').
df['credit_limit'] = pd.to_numeric(
    df['credit_limit'].astype(str).str.replace('$', '', regex=False),
    errors='coerce',
)

# Convert 'expires' (MM/YYYY) to a date on the first day of that month.
# Values that do not match the format become NaT.
df['expires'] = (
    pd.to_datetime(df['expires'], format='%m/%Y', errors='coerce')
    .dt.to_period('M')
    .dt.to_timestamp()
    .dt.date
)

# Convert 'acct_open_date': try MM/YYYY first, then DD/MM/YYYY for the rows
# the first format could not parse. Both attempts must read the ORIGINAL
# strings: once the column has been replaced by the first parse result, the
# unparsed rows are already NaT and a second parse of that column cannot
# recover them.
acct_open_raw = df['acct_open_date']
acct_open_month_year = pd.to_datetime(acct_open_raw, format='%m/%Y', errors='coerce')
acct_open_day_month_year = pd.to_datetime(acct_open_raw, format='%d/%m/%Y', errors='coerce')
acct_open = acct_open_month_year.fillna(acct_open_day_month_year)

# Normalise to the first day of the month, as for 'expires'.
df['acct_open_date'] = acct_open.dt.to_period('M').dt.to_timestamp().dt.date

# Append the rows to the existing `cards` table.
df.to_sql('cards', con=engine, if_exists='append', index=False, chunksize=60000)

In [None]:
# POPULATE TRANSACTIONS TABLE
df = pd.read_csv('/home/bnguyen/Desktop/finance_analytics/datasets/transactions_data.csv')

# Rename the CSV columns to the names used by the `transactions` table.
df = df.rename(columns={
    'id': 'transaction_id',
    'date': 'trans_date',
})

# Remove '$' and convert to numeric. regex=False because '$' is a regex
# metacharacter and a literal match is intended; values that are still not
# numeric become NaN (errors='coerce').
df['amount'] = pd.to_numeric(
    df['amount'].astype(str).str.replace('$', '', regex=False),
    errors='coerce',
)

# Convert to datetime; values that cannot be parsed become NaT.
df['trans_date'] = pd.to_datetime(df['trans_date'], errors='coerce')

# Append the rows to the existing `transactions` table in batches of 40000.
df.to_sql('transactions', con=engine, if_exists='append', index=False, chunksize=40000)

In [4]:
# POPULATE FRAUD_LABELS TABLE
import json
from pathlib import Path
import pandas as pd
from sqlalchemy.exc import IntegrityError

labels_path = Path('/home/bnguyen/Desktop/finance_analytics/datasets/train_fraud_labels.json')
with labels_path.open('r') as f:
    raw = json.load(f)

# collect all transaction_id->label pairs where label is 'Yes' or 'No'
labels = {}
def collect_labels(obj):
    """Walk nested dicts/lists and record every key whose value is 'Yes' or 'No'.

    Matching key/value pairs are written into the module-level `labels` dict
    (a later occurrence of the same key overwrites an earlier one). Any other
    value is descended into if it is a dict or list and ignored otherwise.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            if isinstance(v, str) and v in ('Yes', 'No'):
                labels[k] = v
            else:
                collect_labels(v)
    elif isinstance(obj, list):
        for item in obj:
            collect_labels(item)

collect_labels(raw)

df_labels = pd.DataFrame(list(labels.items()), columns=['transaction_id', 'label'])
# Keys come from JSON and are strings; non-numeric keys become NaN and are dropped.
df_labels['transaction_id'] = pd.to_numeric(df_labels['transaction_id'], errors='coerce')
# .copy() so the column assignment below acts on an independent frame.
df_labels = df_labels.dropna(subset=['transaction_id']).copy()
df_labels['transaction_id'] = df_labels['transaction_id'].astype(int)
df_labels = df_labels[df_labels['label'].isin(['Yes', 'No'])].drop_duplicates(['transaction_id'])

# avoid inserting duplicates and enforce FK to transactions
# Both lookups are best-effort: a failed read is reported and the cell carries
# on, instead of being swallowed silently.
try:
    existing = pd.read_sql('SELECT transaction_id FROM fraud_labels', con=engine)
    existing_ids = set(existing['transaction_id'].astype(int).tolist()) if not existing.empty else set()
except Exception as e:
    print("Could not read existing fraud_labels; assuming none are stored yet:", e)
    existing_ids = set()

try:
    tx = pd.read_sql('SELECT transaction_id FROM transactions', con=engine)
    valid_tx_ids = set(tx['transaction_id'].astype(int).tolist()) if not tx.empty else set()
    tx_ids_known = True
except Exception as e:
    print("Could not read transactions; skipping the foreign-key pre-filter:", e)
    valid_tx_ids = set()
    tx_ids_known = False

df_new = df_labels[~df_labels['transaction_id'].isin(existing_ids)]
# Filter whenever the lookup succeeded, even if it returned no rows: an empty
# `transactions` table means no label has a valid parent row, so nothing may
# be inserted. Only a failed lookup leaves the labels unfiltered.
if tx_ids_known:
    df_new = df_new[df_new['transaction_id'].isin(valid_tx_ids)]

if df_new.empty:
    print("No new fraud labels to insert.")
else:
    try:
        df_new.to_sql('fraud_labels', con=engine, if_exists='append', index=False, chunksize=10000)
        print(f"Inserted {len(df_new)} new fraud label(s).")
    except IntegrityError as e:
        # Reported rather than re-raised so the notebook run can continue.
        print("IntegrityError on insert:", e)


Inserted 8914963 new fraud label(s).
