In [2]:
import os
import json
import numpy as np
import pandas as pd
import kagglehub

In [3]:
# Download the Kaggle dataset (kagglehub returns the local directory holding the files).
path = kagglehub.dataset_download("computingvictor/transactions-fraud-datasets")
print("Path to dataset files:", path)

transactions_data = pd.read_csv(os.path.join(path, "transactions_data.csv"))
users_data        = pd.read_csv(os.path.join(path, "users_data.csv"))
# Card number and CVV are identifiers, not quantities: read them as text so that
# leading zeros survive (an integer-parsed CVV "054" comes back as 54).
cards_data        = pd.read_csv(
    os.path.join(path, "cards_data.csv"),
    dtype={"card_number": "string", "cvv": "string"},
)

# Be explicit about the encoding instead of relying on the platform's locale default.
with open(os.path.join(path, "mcc_codes.json"), "r", encoding="utf-8") as f:
    mcc_dict = json.load(f)

Path to dataset files: /home/cunbidun/.cache/kagglehub/datasets/computingvictor/transactions-fraud-datasets/versions/1


Preprocessing

In [4]:
def parse_money_to_float(s):
    """
    Parse a currency-formatted value into a float.

    Dollar signs and thousands separators are removed. A value wrapped in
    parentheses is treated as negative. Missing input, or text that still is
    not a number after cleaning, yields np.nan.
    Examples: "$1,234.56" -> 1234.56 ; "($77.00)" -> -77.0 ; "-$77.00" -> -77.0
    """
    if pd.isna(s):
        return np.nan

    text = str(s).strip()

    # "(...)" marks a negative amount; unwrap it and remember the sign
    wrapped = text.startswith("(") and text.endswith(")")
    if wrapped:
        text = text[1:-1]

    for symbol in ("$", ","):
        text = text.replace(symbol, "")

    try:
        number = float(text)
    except ValueError:
        return np.nan
    return -number if wrapped else number

def to_datetime(series, fmt=None):
    """Parse `series` as datetimes, turning unparseable entries into NaT.

    `fmt` is passed to pandas as the strptime-style `format`; when it is None
    pandas is left to work the format out itself.
    """
    parsed = pd.to_datetime(series, errors="coerce", format=fmt)
    return parsed

def normalize_zip_to_str(series):
    """Return ZIP codes as a 'string'-dtype Series aligned with `series`.

    All-digit values are left-padded with zeros to at least five characters.
    Values whose part before the first '-' is all digits (ZIP+4 style) are kept
    as they are. Missing values and everything else become 'Unknown'.
    """
    text = series.astype(str)
    # astype(str) spells missing values out as 'nan'/'NaT'/'None'; turn them back into NaN
    spelled_missing = text.str.lower().isin({"nan", "nat", "none"})
    text = text.where(~spelled_missing, other=np.nan)
    # numeric ZIPs that were held as floats look like '58523.0'
    text = text.str.replace(r"\.0$", "", regex=True)

    def _clean_one(value):
        if pd.isna(value):
            return "Unknown"
        candidate = value.strip()
        if candidate.isdigit():
            return candidate.zfill(5)
        # e.g. '10027-1234': keep untouched when the leading part is numeric
        looks_like_zip4 = "-" in candidate and candidate.split("-")[0].isdigit()
        return candidate if looks_like_zip4 else "Unknown"

    cleaned = [_clean_one(value) for value in text]
    return pd.Series(cleaned, index=series.index, dtype="string")

def to_category(df, cols):
    """Cast the listed columns of `df` to the pandas 'category' dtype, in place.

    Names in `cols` that are not columns of `df` are skipped. Returns None.
    """
    present = (name for name in cols if name in df.columns)
    for name in present:
        df[name] = df[name].astype("category")

Transactions data preprocessing

In [5]:
# Clean a copy so the frame loaded from CSV (transactions_data) is left as read.
transactions = transactions_data.copy()

In [6]:
# Give the key columns the names the later joins use: id -> transaction_id, client_id -> user_id.
transactions = transactions.rename(columns={"id": "transaction_id", "client_id": "user_id"})

In [7]:
# date
# Parsed with the to_datetime helper (no explicit format); unparseable values become NaT.
transactions["date"] = to_datetime(transactions["date"])

In [8]:
# amount: currency text -> float, one value at a time through parse_money_to_float
# (values the helper cannot parse come back as NaN)
transactions["amount"] = transactions["amount"].map(parse_money_to_float)

In [9]:
# MCC as text, left-padded with zeros to four characters
mcc_text = transactions["mcc"].astype(str)
mcc_text = mcc_text.str.replace(r"\.0$", "", regex=True)  # drop a trailing '.0' left by float-typed codes
transactions["mcc"] = mcc_text.str.zfill(4)

In [10]:
# text columns: use the pandas string dtype and label missing values "Unknown"
text_cols = [
    name
    for name in ("merchant_city", "merchant_state", "use_chip")
    if name in transactions.columns
]
for col in text_cols:
    transactions[col] = transactions[col].astype("string").fillna("Unknown")

In [11]:
# ZIP normalized to string
# normalize_zip_to_str zero-pads all-digit values to 5+ characters and returns
# "Unknown" for missing or non-ZIP-like values; skipped if the column is absent.
if "zip" in transactions.columns:
    transactions["zip"] = normalize_zip_to_str(transactions["zip"])

In [12]:
# errors column as string
# Only the dtype changes here: missing values are not filled, so they stay missing.
if "errors" in transactions.columns:
    transactions["errors"] = transactions["errors"].astype("string")

Users data preprocessing

In [13]:
# Clean a copy so the frame loaded from CSV (users_data) is left as read.
users = users_data.copy()

In [14]:
# Name the key user_id (the join column used later), unless such a column already exists.
needs_user_id = "id" in users.columns and "user_id" not in users.columns
if needs_user_id:
    users = users.rename(columns={"id": "user_id"})

In [15]:
# money-like columns -> nullable Float64
# Parsed with the notebook's shared parse_money_to_float (the same helper used for
# transactions["amount"]) instead of a second, stricter inline parser: it strips "$"
# and ",", understands parenthesised negatives, and returns NaN for anything it cannot
# parse rather than raising. Missing values end up as <NA> after the Float64 cast.
for col in ["per_capita_income", "yearly_income", "total_debt"]:
    if col in users.columns:
        users[col] = users[col].apply(parse_money_to_float).astype("Float64")

In [16]:
# Free-text address is stored with the pandas string dtype (skipped if the column is absent).
if "address" in users.columns:
    users["address"] = users["address"].astype("string")

# gender is cast to 'category' in place; to_category ignores columns that do not exist.
to_category(users, ["gender"])

Cards data preprocessing

In [17]:
# Clean a copy so the frame loaded from CSV (cards_data) is left as read.
cards = cards_data.copy()

In [18]:
# Name the key card_id (the join column used later), unless such a column already exists.
needs_card_id = "id" in cards.columns and "card_id" not in cards.columns
if needs_card_id:
    cards = cards.rename(columns={"id": "card_id"})


In [19]:
# card identifiers are kept as text, not numbers
# NOTE(review): if these columns were loaded as integers, leading zeros (e.g. a CVV of
# "054") are already gone before this cast - confirm, and consider dtype= in read_csv.
id_cols = [name for name in ("card_number", "cvv") if name in cards.columns]
for col in id_cols:
    cards[col] = cards[col].astype("string")

In [20]:
# categorical fields
# Cast in place to the 'category' dtype; to_category skips any name that is not a column.
to_category(cards, ["card_brand", "card_type", "has_chip", "card_on_dark_web"])

In [21]:
# Dates
def _parse_card_dates(raw, formats):
    """Parse `raw` with each explicit strptime format in turn; infer only the leftovers.

    Explicit formats avoid pandas' per-element format inference on the whole column
    (slow, and the source of the "Could not infer format" warning). Values that no
    explicit format matches are still given one inferred-format attempt, so nothing
    that the format-less parse handles is lost. Unparseable values stay NaT.
    """
    parsed = to_datetime(raw, fmt=formats[0])
    for fmt in formats[1:]:
        parsed = parsed.fillna(to_datetime(raw, fmt=fmt))
    # rows that had a value but matched none of the explicit formats
    leftover = parsed.isna() & raw.notna()
    if leftover.any():
        parsed = parsed.fillna(to_datetime(raw[leftover]))
    return parsed

if "expires" in cards.columns:
    # Two-digit year first (like "03/25"), then four-digit year (like "03/2025")
    cards["expires"] = _parse_card_dates(cards["expires"], ("%m/%y", "%m/%Y"))
if "acct_open_date" in cards.columns:
    # NOTE(review): assumed to be MM/YYYY text (merged output shows first-of-month dates) -
    # confirm against the raw file; other layouts still go through the inferred fallback.
    cards["acct_open_date"] = _parse_card_dates(cards["acct_open_date"], ("%m/%Y",))

  return pd.to_datetime(series, format=fmt, errors="coerce")
  return pd.to_datetime(series, format=fmt, errors="coerce")


In [22]:
# credit limit to float
# Parsed with the notebook's shared parse_money_to_float (the same helper used for
# transactions["amount"]) instead of a second inline parser: it strips "$" and ",",
# understands parenthesised negatives, and yields NaN for unparseable values
# rather than raising.
if "credit_limit" in cards.columns:
    cards["credit_limit"] = cards["credit_limit"].apply(parse_money_to_float)

MCC codes

In [23]:
# Lookup table: one row per MCC code (4-character, zero-padded text) with its description.
mcc = (
    pd.DataFrame.from_dict(mcc_dict, orient="index", columns=["description"])
    .rename_axis("mcc")
    .reset_index()
    .assign(mcc=lambda frame: frame["mcc"].astype(str).str.zfill(4))
    # one row per code, so the later left join cannot multiply transactions
    .drop_duplicates(subset=["mcc"])
)
display(mcc.head())

Unnamed: 0,mcc,description
0,5812,Eating Places and Restaurants
1,5541,Service Stations
2,7996,"Amusement Parks, Carnivals, Circuses"
3,5411,"Grocery Stores, Supermarkets"
4,4784,Tolls and Bridge Fees


Transactions + MCC descriptions (left join on mcc)

In [24]:
# Left join: every transaction is kept; `description` is missing where the code has no entry.
tx = pd.merge(transactions, mcc, how="left", on="mcc")
tx

Unnamed: 0,transaction_id,date,user_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,description
0,7475327,2010-01-01 00:01:00,1556,2972,-77.00,Swipe Transaction,59935,Beulah,ND,58523,5499,,Miscellaneous Food Stores
1,7475328,2010-01-01 00:02:00,561,4575,14.57,Swipe Transaction,67570,Bettendorf,IA,52722,5311,,Department Stores
2,7475329,2010-01-01 00:02:00,1129,102,80.00,Swipe Transaction,27092,Vista,CA,92084,4829,,Money Transfer
3,7475331,2010-01-01 00:05:00,430,2860,200.00,Swipe Transaction,27092,Crown Point,IN,46307,4829,,Money Transfer
4,7475332,2010-01-01 00:06:00,848,3915,46.41,Swipe Transaction,13051,Harwood,MD,20776,5813,,Drinking Places (Alcoholic Beverages)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,1.11,Chip Transaction,86438,West Covina,CA,91792,5499,,Miscellaneous Food Stores
13305911,23761869,2019-10-31 23:56:00,1766,2066,12.80,Online Transaction,39261,ONLINE,Unknown,Unknown,5815,,"Digital Goods - Media, Books, Apps"
13305912,23761870,2019-10-31 23:57:00,199,1031,40.44,Swipe Transaction,2925,Allen,TX,75002,4900,,"Utilities - Electric, Gas, Water, Sanitary"
13305913,23761873,2019-10-31 23:58:00,1986,5443,4.00,Chip Transaction,46284,Daly City,CA,94014,5411,,"Grocery Stores, Supermarkets"


Transactions + MCC + big categories (left join on mcc)

In [25]:
# Load big categories JSON file (path is relative to the current working directory).
# The encoding is given explicitly instead of relying on the platform's locale default.
with open("big_categories.json", "r", encoding="utf-8") as f:
    big_categories = json.load(f)

# Create a mapping from MCC code to big category.
# Codes are normalised to 4-character, zero-padded strings so they line up with the
# "mcc" merge key. If a code is listed under several categories, the last one wins.
mcc_to_category = {
    str(mcc_code).zfill(4): big_cat
    for big_cat, mcc_codes in big_categories.items()
    for mcc_code in mcc_codes
}

# Convert to DataFrame for merging
big_cat_df = pd.DataFrame.from_dict(mcc_to_category, orient="index", columns=["category"])
big_cat_df.index.name = "mcc"
big_cat_df.reset_index(inplace=True)

print(f"Found {len(big_cat_df)} MCC codes mapped to big categories")
print("\nSample mappings:")
display(big_cat_df.head(10))

# Merge with transactions data (left join: unmapped codes get a missing category)
tx_with_big_cat = tx.merge(big_cat_df, how="left", on="mcc")

print(f"\nOriginal tx shape: {tx.shape}")
print(f"tx_with_big_cat shape: {tx_with_big_cat.shape}")
print(f"Transactions with big category: {tx_with_big_cat['category'].notna().sum():,}")
print(f"Transactions without big category: {tx_with_big_cat['category'].isna().sum():,}")

tx_with_big_cat

Found 109 MCC codes mapped to big categories

Sample mappings:


Unnamed: 0,mcc,category
0,5812,Food & Dining
1,5814,Food & Dining
2,5813,Food & Dining
3,5411,Food & Dining
4,5499,Food & Dining
5,5921,Food & Dining
6,5942,Food & Dining
7,7996,Entertainment & Recreation
8,7832,Entertainment & Recreation
9,7922,Entertainment & Recreation



Original tx shape: (13305915, 13)
tx_with_big_cat shape: (13305915, 14)
Transactions with big category: 13,305,915
Transactions without big category: 0
Transactions without big category: 0


Unnamed: 0,transaction_id,date,user_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,mcc,errors,description,category
0,7475327,2010-01-01 00:01:00,1556,2972,-77.00,Swipe Transaction,59935,Beulah,ND,58523,5499,,Miscellaneous Food Stores,Food & Dining
1,7475328,2010-01-01 00:02:00,561,4575,14.57,Swipe Transaction,67570,Bettendorf,IA,52722,5311,,Department Stores,Shopping & Retail
2,7475329,2010-01-01 00:02:00,1129,102,80.00,Swipe Transaction,27092,Vista,CA,92084,4829,,Money Transfer,Financial & Insurance
3,7475331,2010-01-01 00:05:00,430,2860,200.00,Swipe Transaction,27092,Crown Point,IN,46307,4829,,Money Transfer,Financial & Insurance
4,7475332,2010-01-01 00:06:00,848,3915,46.41,Swipe Transaction,13051,Harwood,MD,20776,5813,,Drinking Places (Alcoholic Beverages),Food & Dining
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,1.11,Chip Transaction,86438,West Covina,CA,91792,5499,,Miscellaneous Food Stores,Food & Dining
13305911,23761869,2019-10-31 23:56:00,1766,2066,12.80,Online Transaction,39261,ONLINE,Unknown,Unknown,5815,,"Digital Goods - Media, Books, Apps",Entertainment & Recreation
13305912,23761870,2019-10-31 23:57:00,199,1031,40.44,Swipe Transaction,2925,Allen,TX,75002,4900,,"Utilities - Electric, Gas, Water, Sanitary",Home & Utilities
13305913,23761873,2019-10-31 23:58:00,1986,5443,4.00,Chip Transaction,46284,Daly City,CA,94014,5411,,"Grocery Stores, Supermarkets",Food & Dining


Add users on user_id (left join)

In [26]:
# Prefix every user column with "user_", then restore the join key's name.
users_prefixed = users.add_prefix("user_").rename(columns={"user_user_id": "user_id"})

# Left join keeps every transaction. validate="many_to_one" makes pandas raise a
# MergeError if user_id is duplicated in the users table, instead of silently
# duplicating transactions.
tx_users = tx_with_big_cat.merge(
    users_prefixed,
    how="left",
    on="user_id",
    validate="many_to_one",
)
tx_users

Unnamed: 0,transaction_id,date,user_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,...,user_birth_month,user_gender,user_address,user_latitude,user_longitude,user_per_capita_income,user_yearly_income,user_total_debt,user_credit_score,user_num_credit_cards
0,7475327,2010-01-01 00:01:00,1556,2972,-77.00,Swipe Transaction,59935,Beulah,ND,58523,...,7,Female,594 Mountain View Street,46.80,-100.76,23679.0,48277.0,110153.0,740,4
1,7475328,2010-01-01 00:02:00,561,4575,14.57,Swipe Transaction,67570,Bettendorf,IA,52722,...,6,Male,604 Pine Street,40.80,-91.12,18076.0,36853.0,112139.0,834,5
2,7475329,2010-01-01 00:02:00,1129,102,80.00,Swipe Transaction,27092,Vista,CA,92084,...,4,Male,2379 Forest Lane,33.18,-117.29,16894.0,34449.0,36540.0,686,3
3,7475331,2010-01-01 00:05:00,430,2860,200.00,Swipe Transaction,27092,Crown Point,IN,46307,...,5,Female,903 Hill Boulevard,41.42,-87.35,26168.0,53350.0,128676.0,685,5
4,7475332,2010-01-01 00:06:00,848,3915,46.41,Swipe Transaction,13051,Harwood,MD,20776,...,5,Male,166 River Drive,38.86,-76.60,33529.0,68362.0,96182.0,711,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,1.11,Chip Transaction,86438,West Covina,CA,91792,...,11,Female,766 Third Drive,34.02,-117.89,22681.0,33483.0,196.0,698,5
13305911,23761869,2019-10-31 23:56:00,1766,2066,12.80,Online Transaction,39261,ONLINE,Unknown,Unknown,...,9,Male,6076 Bayview Boulevard,43.06,-87.96,9995.0,20377.0,12092.0,789,4
13305912,23761870,2019-10-31 23:57:00,199,1031,40.44,Swipe Transaction,2925,Allen,TX,75002,...,4,Female,7927 Plum Lane,33.10,-96.66,32580.0,78329.0,40161.0,720,3
13305913,23761873,2019-10-31 23:58:00,1986,5443,4.00,Chip Transaction,46284,Daly City,CA,94014,...,12,Female,5887 Seventh Lane,37.68,-122.43,23752.0,48430.0,62384.0,716,2


Add cards on card_id (left join)

In [27]:
# Separate frame for the join; the card's owner column is renamed so that, after the
# "card_" prefix is added, it cannot be confused with the transaction's own user_id.
cards_for_merge = cards.rename(columns={"client_id": "card_client_user_id"})

In [28]:
# Prefix every card column with "card_", then restore the join key's name.
cards_prefixed = cards_for_merge.add_prefix("card_").rename(columns={"card_card_id": "card_id"})

# Left join keeps every transaction. validate="many_to_one" makes pandas raise a
# MergeError if card_id is duplicated in the cards table, instead of silently
# duplicating transactions. Any clashing right-hand column gets the "_carddup" suffix.
tx_full = tx_users.merge(
    cards_prefixed,
    how="left",
    on="card_id",
    suffixes=("", "_carddup"),
    validate="many_to_one",
)
tx_full

Unnamed: 0,transaction_id,date,user_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,...,card_card_type,card_card_number,card_expires,card_cvv,card_has_chip,card_num_cards_issued,card_credit_limit,card_acct_open_date,card_year_pin_last_changed,card_card_on_dark_web
0,7475327,2010-01-01 00:01:00,1556,2972,-77.00,Swipe Transaction,59935,Beulah,ND,58523,...,Debit (Prepaid),5497590243197280,2022-07-01,306,YES,2,55.0,2008-05-01,2008,No
1,7475328,2010-01-01 00:02:00,561,4575,14.57,Swipe Transaction,67570,Bettendorf,IA,52722,...,Credit,5175842699412235,2024-12-01,438,YES,1,9100.0,2005-09-01,2015,No
2,7475329,2010-01-01 00:02:00,1129,102,80.00,Swipe Transaction,27092,Vista,CA,92084,...,Debit,5874992802287595,2020-05-01,256,YES,1,14802.0,2006-01-01,2008,No
3,7475331,2010-01-01 00:05:00,430,2860,200.00,Swipe Transaction,27092,Crown Point,IN,46307,...,Debit,5346827663529174,2024-10-01,54,NO,2,37634.0,2004-05-01,2006,No
4,7475332,2010-01-01 00:06:00,848,3915,46.41,Swipe Transaction,13051,Harwood,MD,20776,...,Debit,4354185735186651,2020-01-01,120,YES,1,19113.0,2009-07-01,2014,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,1.11,Chip Transaction,86438,West Covina,CA,91792,...,Debit,5766352389579834,2020-02-01,992,YES,1,26743.0,2019-03-01,2019,No
13305911,23761869,2019-10-31 23:56:00,1766,2066,12.80,Online Transaction,39261,ONLINE,Unknown,Unknown,...,Debit,5960254965868032,2024-06-01,91,YES,1,5141.0,2004-03-01,2012,No
13305912,23761870,2019-10-31 23:57:00,199,1031,40.44,Swipe Transaction,2925,Allen,TX,75002,...,Debit,5930100401040067,2021-06-01,908,YES,1,17686.0,2004-02-01,2007,No
13305913,23761873,2019-10-31 23:58:00,1986,5443,4.00,Chip Transaction,46284,Daly City,CA,94014,...,Debit,4696464603505748,2021-11-01,762,YES,2,14036.0,2005-11-01,2010,No


In [29]:
# Columns kept in the master table, grouped by the frame they come from.
transaction_cols = [
    "transaction_id", "date", "user_id", "card_id", "amount",
    "use_chip", "merchant_id", "merchant_city", "merchant_state", "zip",
    "mcc", "description", "category", "errors",
]
# user features carry the "user_" prefix added before the join
user_cols = [
    "user_current_age", "user_retirement_age", "user_birth_year", "user_birth_month",
    "user_gender", "user_address", "user_latitude", "user_longitude",
    "user_per_capita_income", "user_yearly_income", "user_total_debt",
    "user_credit_score", "user_num_credit_cards",
]
# card features carry the "card_" prefix added before the join
card_cols = [
    "card_card_brand", "card_card_type", "card_has_chip", "card_cvv", "card_expires",
    "card_num_cards_issued", "card_credit_limit", "card_acct_open_date",
    "card_year_pin_last_changed", "card_card_on_dark_web",
]
core_cols = [*transaction_cols, *user_cols, *card_cols]

# Independent copy, so later edits to master_df do not touch tx_full
master_df = tx_full.loc[:, core_cols].copy()
master_df

Unnamed: 0,transaction_id,date,user_id,card_id,amount,use_chip,merchant_id,merchant_city,merchant_state,zip,...,card_card_brand,card_card_type,card_has_chip,card_cvv,card_expires,card_num_cards_issued,card_credit_limit,card_acct_open_date,card_year_pin_last_changed,card_card_on_dark_web
0,7475327,2010-01-01 00:01:00,1556,2972,-77.00,Swipe Transaction,59935,Beulah,ND,58523,...,Mastercard,Debit (Prepaid),YES,306,2022-07-01,2,55.0,2008-05-01,2008,No
1,7475328,2010-01-01 00:02:00,561,4575,14.57,Swipe Transaction,67570,Bettendorf,IA,52722,...,Mastercard,Credit,YES,438,2024-12-01,1,9100.0,2005-09-01,2015,No
2,7475329,2010-01-01 00:02:00,1129,102,80.00,Swipe Transaction,27092,Vista,CA,92084,...,Mastercard,Debit,YES,256,2020-05-01,1,14802.0,2006-01-01,2008,No
3,7475331,2010-01-01 00:05:00,430,2860,200.00,Swipe Transaction,27092,Crown Point,IN,46307,...,Mastercard,Debit,NO,54,2024-10-01,2,37634.0,2004-05-01,2006,No
4,7475332,2010-01-01 00:06:00,848,3915,46.41,Swipe Transaction,13051,Harwood,MD,20776,...,Visa,Debit,YES,120,2020-01-01,1,19113.0,2009-07-01,2014,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13305910,23761868,2019-10-31 23:56:00,1718,2379,1.11,Chip Transaction,86438,West Covina,CA,91792,...,Mastercard,Debit,YES,992,2020-02-01,1,26743.0,2019-03-01,2019,No
13305911,23761869,2019-10-31 23:56:00,1766,2066,12.80,Online Transaction,39261,ONLINE,Unknown,Unknown,...,Mastercard,Debit,YES,91,2024-06-01,1,5141.0,2004-03-01,2012,No
13305912,23761870,2019-10-31 23:57:00,199,1031,40.44,Swipe Transaction,2925,Allen,TX,75002,...,Mastercard,Debit,YES,908,2021-06-01,1,17686.0,2004-02-01,2007,No
13305913,23761873,2019-10-31 23:58:00,1986,5443,4.00,Chip Transaction,46284,Daly City,CA,94014,...,Visa,Debit,YES,762,2021-11-01,2,14036.0,2005-11-01,2010,No


In [30]:
# Show the column names (and their order) of the master table.
master_df.columns

Index(['transaction_id', 'date', 'user_id', 'card_id', 'amount', 'use_chip',
       'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc',
       'description', 'category', 'errors', 'user_current_age',
       'user_retirement_age', 'user_birth_year', 'user_birth_month',
       'user_gender', 'user_address', 'user_latitude', 'user_longitude',
       'user_per_capita_income', 'user_yearly_income', 'user_total_debt',
       'user_credit_score', 'user_num_credit_cards', 'card_card_brand',
       'card_card_type', 'card_has_chip', 'card_cvv', 'card_expires',
       'card_num_cards_issued', 'card_credit_limit', 'card_acct_open_date',
       'card_year_pin_last_changed', 'card_card_on_dark_web'],
      dtype='object')

Missing Data

In [31]:
def missing_report(df, sort_by="pct_missing", descending=True):
    """Summarise missing values per column of `df`.

    Returns one row per column with: `column` (name), `n_missing` (count of
    missing values), `n_rows` (len(df)), `pct_missing` (n_missing / n_rows,
    rounded to 4 decimals) and `dtype`. Rows are ordered by `sort_by`,
    largest first unless `descending` is False, with a fresh 0..n-1 index.
    """
    total_rows = len(df)
    missing_counts = df.isna().sum()

    summary = pd.DataFrame(
        {
            "column": missing_counts.index,
            "n_missing": missing_counts.to_numpy(),
            "n_rows": total_rows,
            "pct_missing": (missing_counts / total_rows).round(4).to_numpy(),
            "dtype": df.dtypes.to_numpy(),
        }
    )
    return summary.sort_values(sort_by, ascending=not descending, ignore_index=True)

# Build the report for the master table (missing_report's defaults sort by pct_missing,
# highest first), print its first 25 rows, then the table's row and column counts.
report = missing_report(master_df)
print(report.head(25))   # top 25 by % missing
print(f"Total rows: {len(master_df):,}  |  Columns: {master_df.shape[1]}")

                    column  n_missing    n_rows  pct_missing           dtype
0                   errors   13094522  13305915       0.9841  string[python]
1                     date          0  13305915       0.0000  datetime64[ns]
2                  user_id          0  13305915       0.0000           int64
3                  card_id          0  13305915       0.0000           int64
4           transaction_id          0  13305915       0.0000           int64
5                   amount          0  13305915       0.0000         float64
6                 use_chip          0  13305915       0.0000  string[python]
7            merchant_city          0  13305915       0.0000  string[python]
8              merchant_id          0  13305915       0.0000           int64
9                      zip          0  13305915       0.0000  string[python]
10                     mcc          0  13305915       0.0000          object
11             description          0  13305915       0.0000          object

In [32]:
# `errors` is missing in ~98% of rows (see the missing-data report), so it is dropped.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError.
master_df = master_df.drop(columns=["errors"], errors="ignore")

In [33]:
# save to CSV
# Written to the current working directory; index=False leaves the row index out of the file.
master_df.to_csv("merged-df.csv", index=False)