# Feature Engineering for Fraud Detection

In this notebook, we will focus on feature engineering techniques that can enhance the performance of our fraud detection models. We will create new features based on existing data, including time-based features and transaction frequency metrics. Additionally, we will address class imbalance in the dataset.

In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
import os


In [2]:
# -------------------------------
# 1. Load Raw Data
# -------------------------------
raw_file = "../data/raw/Fraud_Data.csv"
df = pd.read_csv(raw_file)
print("Raw data shape:", df.shape)
df.head()


Raw data shape: (151112, 11)


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [3]:
# -------------------------------
# 2. Basic Cleaning
# -------------------------------

# Remove duplicates
df = df.drop_duplicates()
print("After dropping duplicates:", df.shape)

# Convert date columns to datetime
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])

# Check missing values
df.isnull().sum()


After dropping duplicates: (151112, 11)


user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [4]:
# -------------------------------
# 3. Impute Missing Values
# -------------------------------

# Identify numeric and categorical columns
num_cols = df.select_dtypes(include=['int64','float64']).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

# Impute numeric with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Impute categorical with mode
for col in cat_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify no missing
df.isnull().sum().sum()


np.int64(0)

In [5]:
# -------------------------------
# 4. Feature Extraction
# -------------------------------

# Signup features
df['signup_hour'] = df['signup_time'].dt.hour
df['signup_day'] = df['signup_time'].dt.day
df['signup_weekday'] = df['signup_time'].dt.weekday

# Purchase features
df['purchase_hour'] = df['purchase_time'].dt.hour
df['purchase_day'] = df['purchase_time'].dt.day
df['purchase_weekday'] = df['purchase_time'].dt.weekday

df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,signup_hour,signup_day,signup_weekday,purchase_hour,purchase_day,purchase_weekday
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,22,24,1,2,18,5
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,20,7,6,1,8,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,18,1,3,18,1,3
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,21,28,1,13,4,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,7,21,1,18,9,2


In [6]:
# -------------------------------
# 5. Drop unnecessary columns
# -------------------------------
drop_cols = ['user_id', 'signup_time', 'purchase_time']
df = df.drop(columns=drop_cols)
df.head()


Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,signup_hour,signup_day,signup_weekday,purchase_hour,purchase_day,purchase_weekday
0,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,22,24,1,2,18,5
1,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,20,7,6,1,8,0
2,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,18,1,3,18,1,3
3,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,21,28,1,13,4,0
4,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,7,21,1,18,9,2


In [10]:
# -------------------------------
# 6. Encode Categorical Features
# -------------------------------
cat_features = ['device_id', 'source', 'browser', 'sex']

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False,min_frequency=10)
encoded = pd.DataFrame(ohe.fit_transform(df[cat_features]),
                       columns=ohe.get_feature_names_out(cat_features))

df = pd.concat([df.drop(columns=cat_features), encoded], axis=1)
df.head()


Unnamed: 0,purchase_value,age,ip_address,class,signup_hour,signup_day,signup_weekday,purchase_hour,purchase_day,purchase_weekday,...,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M
0,34,39,732758400.0,0,22,24,1,2,18,5,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,16,53,350311400.0,0,20,7,6,1,8,0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,15,53,2621474000.0,1,18,1,3,18,1,3,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,44,41,3840542000.0,0,21,28,1,13,4,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,39,45,415583100.0,0,7,21,1,18,9,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [11]:
# -------------------------------
# 7. Scale Numeric Features
# -------------------------------
num_features = ['purchase_value', 'age', 'signup_hour', 'signup_day',
                'signup_weekday', 'purchase_hour', 'purchase_day', 'purchase_weekday']

scaler = StandardScaler()
df[num_features] = scaler.fit_transform(df[num_features])
df.head()


Unnamed: 0,purchase_value,age,ip_address,class,signup_hour,signup_day,signup_weekday,purchase_hour,purchase_day,purchase_weekday,...,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M
0,-0.160204,0.679914,732758400.0,0,1.519557,1.06163,-1.010114,-1.377455,0.308768,0.99102,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.142592,2.304476,350311400.0,0,1.230382,-0.885487,1.481209,-1.522122,-0.82578,-1.501259,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-1.197169,2.304476,2621474000.0,1,0.941206,-1.572705,-0.013585,0.937208,-1.619963,-0.005891,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.385567,0.911994,3840542000.0,0,1.374969,1.519775,-1.010114,0.213876,-1.279599,-1.501259,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.112681,1.376155,415583100.0,0,-0.649257,0.718021,-1.010114,0.937208,-0.712325,-0.504347,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [12]:
# -------------------------------
# 8. Handle Class Imbalance
# -------------------------------
X = df.drop(columns='class')
y = df['class']

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Combine back to a DataFrame
df_ready = pd.concat([X_res, y_res], axis=1)
print("Ready-for-training data shape:", df_ready.shape)


Ready-for-training data shape: (273922, 536)


In [13]:
# -------------------------------
# 9. Save Ready-for-Training Data
# -------------------------------
ready_folder = "../data/readyfortraining"
os.makedirs(ready_folder, exist_ok=True)

output_file = os.path.join(ready_folder, "Fraud_Data_ready.csv")
df_ready.to_csv(output_file, index=False)

print(f"Ready-for-training data saved to: {output_file}")


Ready-for-training data saved to: ../data/readyfortraining\Fraud_Data_ready.csv


## Summary

In this notebook, we successfully engineered new features that can help improve the performance of our fraud detection models. We also addressed the class imbalance issue, which is crucial for building robust models. The next steps will involve modeling these features to evaluate their effectiveness.