In [1]:
import pandas as pd
# Location of the raw synthetic fraud dataset.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
RAW_DATA_PATH = r'F:\fraud_transaction_detection\synthetic_fraud_data_original.csv'

df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)
df.columns

Index(['transaction_id', 'customer_id', 'card_number', 'timestamp',
       'merchant_category', 'merchant_type', 'merchant', 'amount', 'currency',
       'country', 'city', 'city_size', 'card_type', 'card_present', 'device',
       'channel', 'device_fingerprint', 'ip_address', 'distance_from_home',
       'high_risk_merchant', 'transaction_hour', 'weekend_transaction',
       'velocity_last_hour', 'is_fraud'],
      dtype='object')

transaction_id: Unique identifier for each transaction.

customer_id: Unique identifier for each customer in the dataset.

card_number: Masked card number associated with the transaction.

timestamp: Date and time of the transaction.

merchant_category: General category of the merchant (e.g., Retail, Grocery, Travel).

merchant_type: Specific type within the merchant category (e.g., "online" for Retail).

merchant: Name of the merchant where the transaction took place.

amount: Transaction amount (currency based on the country).

currency: Currency used for the transaction (e.g., USD, EUR, JPY).

country: Country where the transaction occurred.

city: City where the transaction took place.

city_size: Size of the city (e.g., medium, large).

card_type: Type of card used (e.g., Basic Credit, Gold Credit).

card_present: Indicates if the card was physically present during the transaction (used in POS transactions).

device: Device used for the transaction (e.g., Chrome, iOS App, NFC Payment).

channel: Type of channel used for the transaction (web, mobile, POS).

device_fingerprint: Unique fingerprint for the device used in the transaction.

ip_address: IP address associated with the transaction.

distance_from_home: Binary indicator showing if the transaction occurred outside the customer's home country.

high_risk_merchant: Indicates if the merchant category is known for higher fraud risk (e.g., Travel, Entertainment).

transaction_hour: Hour of the day when the transaction was made.

weekend_transaction: Boolean indicating if the transaction took place on a weekend.

velocity_last_hour: Dictionary containing metrics on the transaction velocity, including:

- num_transactions: Number of transactions in the last hour for this customer.
- total_amount: Total amount spent in the last hour.
- unique_merchants: Count of unique merchants in the last hour.
- unique_countries: Count of unique countries in the last hour.
- max_single_amount: Maximum single transaction amount in the last hour.

is_fraud: Binary indicator showing if the transaction is fraudulent (True for fraudulent transactions, False for legitimate ones).

In [2]:
# Count distinct values per column to gauge cardinality before cleaning.
df.nunique()

transaction_id         7477306
customer_id               4869
card_number               5000
timestamp              7483754
merchant_category            8
merchant_type               17
merchant                   105
amount                 2831167
currency                    11
country                     12
city                        11
city_size                    2
card_type                    5
card_present                 2
device                       9
channel                      3
device_fingerprint      785462
ip_address             7477187
distance_from_home           2
high_risk_merchant           2
transaction_hour            24
weekend_transaction          2
velocity_last_hour     7483740
is_fraud                     2
dtype: int64

In [3]:
# transaction_id is meant to be unique: when an id repeats, retain only its
# first row (keep='first' is the pandas default).
df = df.drop_duplicates(subset=['transaction_id'])

In [4]:
# Number of missing values in each column.
df.isna().sum()

transaction_id         0
customer_id            0
card_number            0
timestamp              0
merchant_category      0
merchant_type          0
merchant               0
amount                 0
currency               0
country                0
city                   0
city_size              0
card_type              0
card_present           0
device                 0
channel                0
device_fingerprint     0
ip_address             0
distance_from_home     0
high_risk_merchant     0
transaction_hour       0
weekend_transaction    0
velocity_last_hour     0
is_fraud               0
dtype: int64

In [5]:
# How many transactions carry a zero or negative amount.
df['amount'].le(0).sum()

0

In [6]:
# Sanity check: a card should only be physically present on the POS channel.
#
# card_present shows up as True/False in the previewed rows rather than as
# 'yes'/'no', so comparing it with the string 'yes' can never match and the
# check would always report 0. Both columns are normalised to lowercase text
# so that boolean, numeric and string encodings of the flag are all handled.
card_present_flag = (
    df['card_present'].astype(str).str.strip().str.lower().isin(['true', 'yes', '1'])
)
non_pos_channel = df['channel'].astype(str).str.strip().str.lower() != 'pos'

contradictions = df[card_present_flag & non_pos_channel]

len(contradictions)

0

In [7]:
# Low-cardinality columns whose category frequencies are printed below.
columns_to_check = [
    'city', 'merchant_category', 'merchant_type', 'currency', 'country',
    'city_size', 'card_type', 'card_present', 'device', 'channel',
    'distance_from_home', 'high_risk_merchant', 'transaction_hour',
    'weekend_transaction', 'is_fraud',
]  # specify your columns

for col in columns_to_check:
    frequencies = df[col].value_counts()
    print(f"\nValue counts for column: {col}")
    print(frequencies)


Value counts for column: city
city
Unknown City    6977703
San Diego         50380
Phoenix           50283
Dallas            50090
San Antonio       50031
San Jose          49974
Houston           49915
Philadelphia      49870
Chicago           49853
New York          49754
Los Angeles       49453
Name: count, dtype: int64

Value counts for column: merchant_category
merchant_category
Healthcare       935976
Entertainment    935398
Restaurant       935354
Retail           935061
Travel           935010
Gas              934539
Grocery          933191
Education        932777
Name: count, dtype: int64

Value counts for column: merchant_type
merchant_type
online       1400440
physical      934199
pharmacy      467998
medical       467978
local         467484
major         467055
supplies      466390
fast_food     312504
events        312351
streaming     311828
premium       311444
casual        311406
gaming        311219
hotels        234112
booking       233840
transport     233769
airl

In [8]:
# Remove identifier-like fields (ids, card number, IP, device fingerprint)
# together with the timestamp and city-level columns.
columns_to_drop = [
    'city', 'timestamp', 'city_size', 'transaction_id',
    'ip_address', 'customer_id', 'card_number', 'device_fingerprint',
]
df = df.drop(columns=columns_to_drop)

In [9]:
import numpy as np
from ast import literal_eval

def safe_extract_velocity(df):
    """Expand the ``velocity_last_hour`` dictionaries into numeric columns.

    Each entry of ``velocity_last_hour`` is expected to be a dict (or the
    string representation of one) holding some of the keys
    ``num_transactions``, ``total_amount``, ``unique_merchants``,
    ``unique_countries`` and ``max_single_amount``. Every key becomes its own
    column; keys that are missing, and entries that are not dictionaries,
    contribute ``0``.

    The input frame is left untouched: parsing happens on a local copy of the
    column values and the new columns are attached to the frame returned by
    ``drop``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``velocity_last_hour`` column. May be empty.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``velocity_last_hour`` and with the five extracted
        columns appended (int32 for counts, float64 for amounts).

    Raises
    ------
    KeyError
        If ``velocity_last_hour`` is not a column of ``df``.
    ValueError
        If a string entry cannot be parsed as a Python literal.
    """
    # Step 1: Verify the dictionary column exists
    if 'velocity_last_hour' not in df.columns:
        raise KeyError("velocity_last_hour column not found")

    # Parse string representations element by element, so a column that mixes
    # strings and dicts is handled and an empty column needs no special case.
    def _to_literal(value):
        return literal_eval(value) if isinstance(value, str) else value

    try:
        parsed = [_to_literal(value) for value in df['velocity_last_hour']]
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as exc:
        # These are the errors literal_eval documents for malformed input;
        # chain the cause so the offending value stays visible in the traceback.
        raise ValueError("Could not parse string values to dictionaries") from exc

    # Step 2: Pre-allocate arrays for performance
    n_rows = len(parsed)
    results = {
        'num_transactions': np.zeros(n_rows, dtype=np.int32),
        'total_amount': np.zeros(n_rows, dtype=np.float64),
        'unique_merchants': np.zeros(n_rows, dtype=np.int32),
        'unique_countries': np.zeros(n_rows, dtype=np.int32),
        'max_single_amount': np.zeros(n_rows, dtype=np.float64)
    }

    # Step 3: Extract values; non-dict entries keep the pre-filled zeros
    for i, value in enumerate(parsed):
        if not isinstance(value, dict):
            continue

        for key, column in results.items():
            if key in value:
                column[i] = value[key]

    # Step 4: Drop the original column first (this yields a new frame), then
    # attach the extracted columns to that new frame rather than to the input.
    extracted = df.drop('velocity_last_hour', axis=1)
    for col_name, values in results.items():
        # Plain numpy arrays are assigned positionally, so a non-contiguous
        # index (e.g. after drop_duplicates) is not a problem.
        extracted[col_name] = values

    return extracted

# Usage: df2 is df with velocity_last_hour replaced by one column per metric
df2 = safe_extract_velocity(df)

In [10]:
# Persist the cleaned, un-encoded frame; index=False keeps the row index out of the file.
df2.to_csv('fraud_data_catboost.csv', index=False)

In [11]:
# Preview the first five rows (head() defaults to n=5).
df2.head()

Unnamed: 0,merchant_category,merchant_type,merchant,amount,currency,country,card_type,card_present,device,channel,distance_from_home,high_risk_merchant,transaction_hour,weekend_transaction,is_fraud,num_transactions,total_amount,unique_merchants,unique_countries,max_single_amount
0,Restaurant,fast_food,Taco Bell,294.87,GBP,UK,Platinum Credit,False,iOS App,mobile,0,False,0,False,False,1197,33498560.0,105,12,1925481.0
1,Entertainment,gaming,Steam,3368.97,BRL,Brazil,Platinum Credit,False,Edge,web,1,True,0,False,True,509,20114760.0,100,12,5149117.0
2,Grocery,physical,Whole Foods,102582.38,JPY,Japan,Platinum Credit,False,Firefox,web,0,False,0,False,False,332,39163850.0,97,12,1852242.0
3,Gas,major,Exxon,630.6,AUD,Australia,Premium Debit,False,iOS App,mobile,0,False,0,False,False,764,22012600.0,105,12,2055798.0
4,Healthcare,medical,Medical Center,724949.27,NGN,Nigeria,Basic Debit,False,Chrome,web,1,False,0,False,True,218,4827636.0,88,12,1157231.0


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
import pandas as pd

# -----------------------------
# Train-test split
# -----------------------------
X = df2.drop(columns=['is_fraud'])
y = df2['is_fraud']

# Stratify on the label so both folds keep the same fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Take explicit copies: the column assignments below would otherwise be made
# on frames derived from X, which can trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# -----------------------------
# Target Encoding
# -----------------------------
target_encode_cols = ['merchant', 'merchant_type']

te = TargetEncoder(
    cols=target_encode_cols,
    smoothing=10,
    min_samples_leaf=100
)

# Fit on the training fold only; the test fold is transformed with the
# statistics learned from training data.
X_train[target_encode_cols] = te.fit_transform(
    X_train[target_encode_cols], y_train
)

X_test[target_encode_cols] = te.transform(
    X_test[target_encode_cols]
)

# -----------------------------
# Label Encoding (remaining categoricals)
# -----------------------------
categorical_cols = X_train.select_dtypes(include='object').columns
label_encode_cols = [col for col in categorical_cols if col not in target_encode_cols]

# Fitted encoders are kept per column so the same mapping can be reused later.
encoders = {}

for col in label_encode_cols:
    le = LabelEncoder()

    X_train[col] = le.fit_transform(X_train[col])

    # Build the class -> code lookup once per column instead of calling
    # le.transform() for every row; categories unseen during fitting are
    # encoded as -1, exactly as before.
    class_to_code = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(class_to_code).fillna(-1).astype('int64')

    encoders[col] = le

# -----------------------------
# Save final datasets
# -----------------------------
# .values attaches the labels positionally; assign() returns a new frame, so
# X_train / X_test stay label-free.
train_encoded = X_train.assign(is_fraud=y_train.values)
test_encoded = X_test.assign(is_fraud=y_test.values)

train_encoded.to_csv('fraud_data_lgbm_xgb_train.csv', index=False)
test_encoded.to_csv('fraud_data_lgbm_xgb_test.csv', index=False)