In [1]:
import pandas as pd
# Read the raw synthetic fraud transactions from the local CSV export, then
# show the column index to confirm which fields were loaded.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine without editing it.
raw_csv_path = r'F:\fraud_transaction_detection\synthetic_fraud_data_original.csv'
df = pd.read_csv(raw_csv_path)
df.columns

Index(['transaction_id', 'customer_id', 'card_number', 'timestamp',
       'merchant_category', 'merchant_type', 'merchant', 'amount', 'currency',
       'country', 'city', 'city_size', 'card_type', 'card_present', 'device',
       'channel', 'device_fingerprint', 'ip_address', 'distance_from_home',
       'high_risk_merchant', 'transaction_hour', 'weekend_transaction',
       'velocity_last_hour', 'is_fraud'],
      dtype='object')

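transaction_id: Identifier for each transaction. It is intended to be unique, but the raw file contains repeated ids (compare the `transaction_id` and `timestamp` counts in the `nunique` output below), so duplicates are dropped in a later cell.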

customer_id: Unique identifier for each customer in the dataset.

card_number: Masked card number associated with the transaction.

timestamp: Date and time of the transaction.

merchant_category: General category of the merchant (e.g., Retail, Grocery, Travel).

merchant_type: Specific type within the merchant category (e.g., "online" for Retail).

merchant: Name of the merchant where the transaction took place.

amount: Transaction amount (currency based on the country).

currency: Currency used for the transaction (e.g., USD, EUR, JPY).

country: Country where the transaction occurred.

city: City where the transaction took place.

city_size: Size of the city (e.g., medium, large).

card_type: Type of card used (e.g., Basic Credit, Gold Credit).

card_present: Indicates if the card was physically present during the transaction (used in POS transactions).

device: Device used for the transaction (e.g., Chrome, iOS App, NFC Payment).

channel: Type of channel used for the transaction (web, mobile, POS).

device_fingerprint: Unique fingerprint for the device used in the transaction.

ip_address: IP address associated with the transaction.

distance_from_home: Binary indicator showing if the transaction occurred outside the customer's home country.

high_risk_merchant: Indicates if the merchant category is known for higher fraud risk (e.g., Travel, Entertainment).

transaction_hour: Hour of the day when the transaction was made.

weekend_transaction: Boolean indicating if the transaction took place on a weekend.

velocity_last_hour: Dictionary containing metrics on the transaction velocity, including:

- num_transactions: Number of transactions in the last hour for this customer.
- total_amount: Total amount spent in the last hour.
- unique_merchants: Count of unique merchants in the last hour.
- unique_countries: Count of unique countries in the last hour.
- max_single_amount: Maximum single transaction amount in the last hour.

is_fraud: Binary indicator showing if the transaction is fraudulent (True for fraudulent transactions, False for legitimate ones).

In [2]:
# Count distinct values per column to gauge cardinality; a column meant to act
# as a row key should have as many distinct values as there are rows.
df.nunique()

transaction_id         7477306
customer_id               4869
card_number               5000
timestamp              7483754
merchant_category            8
merchant_type               17
merchant                   105
amount                 2831167
currency                    11
country                     12
city                        11
city_size                    2
card_type                    5
card_present                 2
device                       9
channel                      3
device_fingerprint      785462
ip_address             7477187
distance_from_home           2
high_risk_merchant           2
transaction_hour            24
weekend_transaction          2
velocity_last_hour     7483740
is_fraud                     2
dtype: int64

In [3]:
# Keep only the first row seen for each transaction_id; later repeats of the
# same id are discarded.
df = df.drop_duplicates(subset='transaction_id', keep='first')

In [4]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

transaction_id         0
customer_id            0
card_number            0
timestamp              0
merchant_category      0
merchant_type          0
merchant               0
amount                 0
currency               0
country                0
city                   0
city_size              0
card_type              0
card_present           0
device                 0
channel                0
device_fingerprint     0
ip_address             0
distance_from_home     0
high_risk_merchant     0
transaction_hour       0
weekend_transaction    0
velocity_last_hour     0
is_fraud               0
dtype: int64

In [8]:
# Low-cardinality columns whose value distributions are printed below.
columns_to_check = [
    'city', 'merchant_category', 'merchant_type', 'currency', 'country',
    'city_size', 'card_type', 'card_present', 'device', 'channel',
    'distance_from_home', 'high_risk_merchant', 'transaction_hour',
    'weekend_transaction', 'is_fraud',
]

# One frequency table per column, each preceded by a header line.
for column_name in columns_to_check:
    frequency_table = df[column_name].value_counts()
    print(f"\nValue counts for column: {column_name}")
    print(frequency_table)



Value counts for column: city
city
Unknown City    6977703
San Diego         50380
Phoenix           50283
Dallas            50090
San Antonio       50031
San Jose          49974
Houston           49915
Philadelphia      49870
Chicago           49853
New York          49754
Los Angeles       49453
Name: count, dtype: int64

Value counts for column: merchant_category
merchant_category
Healthcare       935976
Entertainment    935398
Restaurant       935354
Retail           935061
Travel           935010
Gas              934539
Grocery          933191
Education        932777
Name: count, dtype: int64

Value counts for column: merchant_type
merchant_type
online       1400440
physical      934199
pharmacy      467998
medical       467978
local         467484
major         467055
supplies      466390
fast_food     312504
events        312351
streaming     311828
premium       311444
casual        311406
gaming        311219
hotels        234112
booking       233840
transport     233769
airl

In [6]:
# Preview the first five rows to see what the values in each column look like.
df.head(5)

Unnamed: 0,transaction_id,customer_id,card_number,timestamp,merchant_category,merchant_type,merchant,amount,currency,country,...,device,channel,device_fingerprint,ip_address,distance_from_home,high_risk_merchant,transaction_hour,weekend_transaction,velocity_last_hour,is_fraud
0,TX_a0ad2a2a,CUST_72886,6646734767813109,2024-09-30 00:00:01.034820+00:00,Restaurant,fast_food,Taco Bell,294.87,GBP,UK,...,iOS App,mobile,e8e6160445c935fd0001501e4cbac8bc,197.153.60.199,0,False,0,False,"{'num_transactions': 1197, 'total_amount': 334...",False
1,TX_3599c101,CUST_70474,376800864692727,2024-09-30 00:00:01.764464+00:00,Entertainment,gaming,Steam,3368.97,BRL,Brazil,...,Edge,web,a73043a57091e775af37f252b3a32af9,208.123.221.203,1,True,0,False,"{'num_transactions': 509, 'total_amount': 2011...",True
2,TX_a9461c6d,CUST_10715,5251909460951913,2024-09-30 00:00:02.273762+00:00,Grocery,physical,Whole Foods,102582.38,JPY,Japan,...,Firefox,web,218864e94ceaa41577d216b149722261,10.194.159.204,0,False,0,False,"{'num_transactions': 332, 'total_amount': 3916...",False
3,TX_7be21fc4,CUST_16193,376079286931183,2024-09-30 00:00:02.297466+00:00,Gas,major,Exxon,630.6,AUD,Australia,...,iOS App,mobile,70423fa3a1e74d01203cf93b51b9631d,17.230.177.225,0,False,0,False,"{'num_transactions': 764, 'total_amount': 2201...",False
4,TX_150f490b,CUST_87572,6172948052178810,2024-09-30 00:00:02.544063+00:00,Healthcare,medical,Medical Center,724949.27,NGN,Nigeria,...,Chrome,web,9880776c7b6038f2af86bd4e18a1b1a4,136.241.219.151,1,False,0,False,"{'num_transactions': 218, 'total_amount': 4827...",True


In [9]:
# Discard the columns that are not used as features in the rest of the notebook.
# NOTE: re-running this cell raises KeyError because the columns are already
# gone from df after the first run.
columns_to_drop = [
    'city', 'timestamp', 'city_size', 'transaction_id',
    'ip_address', 'customer_id', 'card_number', 'device_fingerprint',
]
df = df.drop(columns=columns_to_drop)

In [10]:
import numpy as np
from ast import literal_eval

def safe_extract_velocity(df):
    """Expand the ``velocity_last_hour`` dict column into five numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``velocity_last_hour`` column whose values are dicts or
        string representations of dicts. Whether parsing is needed is decided
        from the type of the first value; parsing is then applied to every row.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``velocity_last_hour`` and with the columns
        ``num_transactions``, ``total_amount``, ``unique_merchants``,
        ``unique_countries`` and ``max_single_amount`` added. Rows whose value
        is not a dict, and keys missing from a dict, are left at 0.
        The input frame is not modified.

    Raises
    ------
    KeyError
        If ``df`` has no ``velocity_last_hour`` column.
    ValueError
        If the column holds strings that cannot be parsed as Python literals.
    """
    # Step 1: Verify the dictionary column exists
    if 'velocity_last_hour' not in df.columns:
        raise KeyError("velocity_last_hour column not found")

    # Work on a local reference so the caller's frame is never written to
    velocity = df['velocity_last_hour']

    # Convert string representations to actual dictionaries if needed.
    # The length guard keeps an empty frame from failing on .iloc[0].
    if len(velocity) > 0 and isinstance(velocity.iloc[0], str):
        try:
            velocity = velocity.apply(literal_eval)
        except Exception as exc:
            # Chain the original error so the offending value stays visible;
            # unlike a bare except, this lets KeyboardInterrupt through.
            raise ValueError("Could not parse string values to dictionaries") from exc

    # Step 2: Pre-allocate arrays for performance
    n_rows = len(velocity)
    results = {
        'num_transactions': np.zeros(n_rows, dtype=np.int32),
        'total_amount': np.zeros(n_rows, dtype=np.float64),
        'unique_merchants': np.zeros(n_rows, dtype=np.int32),
        'unique_countries': np.zeros(n_rows, dtype=np.int32),
        'max_single_amount': np.zeros(n_rows, dtype=np.float64)
    }

    # Step 3: Copy each known key into its array, row by row
    for i, value in enumerate(velocity):
        if not isinstance(value, dict):
            continue  # Keeps zeros for invalid entries

        for key in results:
            if key in value:
                results[key][i] = value[key]

    # Step 4: Drop the original column and attach the new ones on a fresh frame
    return df.drop(columns='velocity_last_hour').assign(**results)

# Expand velocity_last_hour into separate numeric columns; the result, df2, is
# the frame that every later cell works from.
df2 = safe_extract_velocity(df)

In [11]:
pip install category_encoders




In [12]:
pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Downloading scikit_learn-1.7.0-cp312-cp312-win_amd64.whl (10.7 MB)
   ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
    --------------------------------------- 0.3/10.7 MB ? eta -:--:--
   -- ------------------------------------- 0.8/10.7 MB 2.8 MB/s eta 0:00:04
   ----- ---------------------------------- 1.6/10.7 MB 2.9 MB/s eta 0:00:04
   ------- -------------------------------- 2.1/10.7 MB 2.8 MB/s eta 0:00:04
   --------- ------------------------------ 2.6/10.7 MB 2.7 MB/s eta 0:00:04
   ---------- ----------------------------- 2.9/10.7 MB 2.5 MB/s eta 0:00:04
   ----------- ---------------------------- 3.1/10.7 MB 2.4 MB/s eta 0:00:04
   ------------- -------------------------- 3.7/10.7 MB 2.3 MB/s eta 0:00:04
   --------------- ------------------------ 4.2/10.7 MB 2.2 MB/s eta 0:00:03
   ----------------- ---------------------- 4.7/10.7 MB 2.3 MB/s eta 0:00:03


In [13]:
from sklearn.feature_selection import VarianceThreshold
from category_encoders import TargetEncoder

# Feature matrix and fraud label
y = df2['is_fraud']
X = df2.drop(columns=['is_fraud'])

# Text/categorical columns need encoding; every other dtype is used as-is
dtype_filter = ['object', 'category']
categorical_cols = X.select_dtypes(include=dtype_filter).columns
numerical_cols = X.select_dtypes(exclude=dtype_filter).columns

# Temporary target encoding so the screening steps see numeric inputs only
encoder = TargetEncoder()
X_encoded_cats = encoder.fit_transform(X[categorical_cols], y)

# Untouched columns and encoded columns side by side in one frame
X_encoded = pd.concat([X[numerical_cols], X_encoded_cats], axis=1)

# Screen for zero-variance (constant) features.
# get_support() is True for the columns that are KEPT, so the removed ones are
# selected with the inverted mask.
vt = VarianceThreshold(threshold=0.0).fit(X_encoded)
low_variance_mask = vt.get_support()
low_variance_features = X_encoded.columns[~low_variance_mask]

print("Low variance features removed:")
print(low_variance_features.tolist())

Low variance features removed:
[]


In [14]:
# Absolute pairwise correlations between all encoded features
corr_matrix = X_encoded.corr().abs()

# Keep only the strict upper triangle so each unordered pair appears once and
# the self-correlations on the diagonal are excluded (all other cells are NaN)
upper_triangle_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(upper_triangle_mask)

# Pairs whose absolute correlation exceeds this value are reported
CORRELATION_THRESHOLD = 0.90

# np.where scans row by row, so pairs come out in column order; NaN cells
# compare False and are skipped without any extra filtering
pair_rows, pair_cols = np.where(upper_triangle.to_numpy() > CORRELATION_THRESHOLD)
feature_names = corr_matrix.columns
correlated_pairs = [
    (feature_names[r], feature_names[c], corr_matrix.iat[r, c])
    for r, c in zip(pair_rows, pair_cols)
]

# Sort pairs by correlation (descending); ties keep their column order
correlated_pairs.sort(key=lambda x: x[2], reverse=True)

# Display every pair that exceeded the threshold
for col1, col2, corr in correlated_pairs:
    print(f"{col1} <--> {col2} : Correlation = {corr:.4f}")

card_present <--> channel : Correlation = 1.0000
currency <--> country : Correlation = 1.0000
device <--> channel : Correlation = 0.9999
card_present <--> device : Correlation = 0.9999


In [15]:
# Remove three columns from the frame used for modelling; each appears in a
# highly correlated pair printed by the correlation check.
redundant_columns = ['channel', 'currency', 'device']
df2 = df2.drop(columns=redundant_columns)

In [16]:
# Show the first two rows to confirm which columns remain in df2.
df2.head(2)

Unnamed: 0,merchant_category,merchant_type,merchant,amount,country,card_type,card_present,distance_from_home,high_risk_merchant,transaction_hour,weekend_transaction,is_fraud,num_transactions,total_amount,unique_merchants,unique_countries,max_single_amount
0,Restaurant,fast_food,Taco Bell,294.87,UK,Platinum Credit,False,0,False,0,False,False,1197,33498560.0,105,12,1925481.0
1,Entertainment,gaming,Steam,3368.97,Brazil,Platinum Credit,False,1,True,0,False,True,509,20114760.0,100,12,5149117.0


In [17]:
# Write the current df2 (categorical columns still in their original form) to
# a CSV in the working directory, without the index column.
df2.to_csv('fraud_data_notencoded.csv', index=False)

In [18]:
from sklearn.model_selection import train_test_split
import category_encoders as ce

# Requires df2 from the earlier cells.
# Features and label
X = df2.drop(columns='is_fraud')
y = df2['is_fraud']

# Stratified 80/20 hold-out with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Re-attach the label as the last column so each saved file carries it
train_data = X_train.assign(is_fraud=y_train)
test_data = X_test.assign(is_fraud=y_test)

# Columns to be replaced by their target-encoded values
cat_columns = [
    'merchant', 'merchant_category', 'merchant_type', 'country', 'card_type',
    'card_present', 'distance_from_home', 'high_risk_merchant',
    'weekend_transaction',
]

# Fit the encoder on the training rows only, then apply the fitted mapping to
# the test rows
encoder = ce.TargetEncoder(cols=cat_columns)
train_data_encoded = encoder.fit_transform(train_data, train_data['is_fraud'])
test_data_encoded = encoder.transform(test_data)

# Persist both encoded splits
train_data_encoded.to_csv('encoded_train_data1.csv', index=False)
test_data_encoded.to_csv('encoded_test_data1.csv', index=False)