# 🧹 Data Preprocessing & EDA
This notebook checks for missing values, cleans the data, maps IP addresses to countries, engineers features, and prepares the data for modeling (train/test split and SMOTE resampling).

### 📦 Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import ipaddress


## 📂 Load Data

In [2]:
# Read the three raw CSV inputs in one pass: the fraud transactions, the
# IP-range -> country lookup table, and the credit-card transactions.
fraud_df, ip_df, cc_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/raw/Fraud_Data.csv',
        '../../data/raw/IpAddress_to_Country.csv',
        '../../data/raw/creditcard.csv',
    )
)

In [3]:
# Peek at five randomly chosen fraud-transaction rows
fraud_df.sample(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
85832,371436,2015-01-07 17:58:27,2015-01-21 20:08:06,37,ANTQMUTAYPTFN,Ads,Chrome,M,24,3757157000.0,1
98726,305631,2015-02-12 20:19:56,2015-05-08 09:34:51,31,CSZSSLDHHUXBI,Ads,Chrome,M,24,3407133000.0,0
136073,15354,2015-06-06 01:31:09,2015-09-05 02:57:40,13,QJUVWBWPRFPCQ,SEO,IE,M,22,2022262000.0,0
100244,357241,2015-02-12 15:20:02,2015-03-04 19:28:43,22,OLLGEQPNYTTYZ,Ads,Safari,M,49,33582630.0,0
23513,102373,2015-02-23 10:05:41,2015-03-26 16:18:41,41,RBHKFQBSVKXQD,Direct,IE,M,24,3159428000.0,0


In [4]:
# Peek at five randomly chosen IP-range rows
ip_df.sample(n=5)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
81975,3254887000.0,3254886911,Russian Federation
43983,2622620000.0,2622685183,United States
133630,3561710000.0,3561717759,Poland
7739,1075618000.0,1075621887,United States
112755,3395076000.0,3395080191,Indonesia


In [5]:
# Peek at five randomly chosen credit-card rows
cc_df.sample(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
55457,46948.0,-0.977097,0.52432,0.489315,-0.058848,-0.328836,0.701093,0.479254,0.699701,-0.040778,...,0.056808,0.029476,0.048463,-0.956106,0.044656,0.427833,0.169352,0.122904,150.0,0
18113,29198.0,-2.074129,-1.519325,1.77079,-2.751204,-0.495873,-0.644513,1.137406,-0.309236,1.733862,...,0.065297,0.468279,0.221572,-0.080215,0.83538,-0.131732,-0.164362,-0.138222,293.94,0
24055,33048.0,1.311467,-0.69234,0.227316,-0.751277,-0.704398,0.017826,-0.682405,0.12884,-0.7611,...,0.069118,0.157845,-0.05699,-0.292379,0.452364,-0.220789,0.01119,-0.002441,25.95,0
8302,11084.0,1.052861,0.122419,0.658575,0.690618,0.195665,0.890361,-0.397005,0.323601,1.373195,...,-0.273037,-0.419762,0.322584,-0.717545,-0.182686,0.166148,0.012429,1.6e-05,1.98,0
26785,34251.0,1.275667,-0.935442,0.796874,-0.842222,-1.264062,0.222628,-1.295561,0.327107,-0.403311,...,0.432035,1.085385,-0.122598,-0.294331,0.308474,-0.013769,0.036093,0.008792,29.95,0


## 🧼 Data Cleaning

### Missing Values

In [6]:
# Number of missing entries in each column of the fraud data
fraud_df.isnull().sum()

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [7]:
# Number of missing entries in each column of the IP-range table
ip_df.isnull().sum()

lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64

In [8]:
# Total number of missing cells across the whole credit-card frame
# (per-column counts summed into a single figure)
cc_df.isnull().sum().sum()

0

In [9]:
# Parse both timestamp columns into datetime values so that date arithmetic
# and the `.dt` accessor can be used on them later.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

# Drop fully duplicated rows, modifying the existing frame in place.
fraud_df.drop_duplicates(inplace=True)

In [10]:
# Display the fraud frame after timestamp conversion and de-duplication
fraud_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,0
...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,0
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,0


In [12]:
# Show the dtype of each column in the IP-range table
ip_df.dtypes

lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object

## 🌐 IP Geolocation Mapping

In [15]:
# Represent each transaction's IP as a 64-bit integer so it can be compared
# with the integer range bounds. 'int64' is spelled out because the
# platform-default `int` can be 32-bit (e.g. Windows with NumPy < 2), which
# cannot hold IPv4 values above 2**31 - 1.
fraud_df['ip_int'] = fraud_df['ip_address'].astype(float).astype('int64')

# merge_asof requires both join keys to share a dtype, so cast the range
# bounds to the same 64-bit integer type.
ip_df['lower_bound_ip_address'] = ip_df['lower_bound_ip_address'].astype('int64')
ip_df['upper_bound_ip_address'] = ip_df['upper_bound_ip_address'].astype('int64')

# Range join, step 1: for every transaction pick the range with the greatest
# lower bound that is <= ip_int (both inputs must be sorted on their key).
merged_df = pd.merge_asof(
    fraud_df.sort_values('ip_int'),
    ip_df.sort_values('lower_bound_ip_address'),
    left_on='ip_int',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# Transactions with no candidate range come back with NaN bounds; replace
# them with the sentinel -1 so the bound columns can be stored as integers.
merged_df['lower_bound_ip_address'] = merged_df['lower_bound_ip_address'].fillna(-1).astype('int64')
merged_df['upper_bound_ip_address'] = merged_df['upper_bound_ip_address'].fillna(-1).astype('int64')

# Range join, step 2: keep only rows whose IP also lies at or below the upper
# bound of the matched range. Rows carrying the -1 sentinel fail this check
# and are dropped. `.copy()` makes the result an independent frame so later
# column assignments do not act on a slice of the unfiltered merge.
merged_df = merged_df[merged_df['ip_int'] <= merged_df['upper_bound_ip_address']].copy()



In [16]:
# Peek at five randomly chosen rows of the merged frame
merged_df.sample(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country
66894,330023,2015-04-18 06:55:02,2015-07-19 21:11:45,26,JEXDPESBNUGOI,Direct,IE,F,31,1918452000.0,0,1918452270,1917845504,1918894079,China
67004,393514,2015-02-14 13:49:54,2015-05-03 14:06:58,35,MFTROLCPDUVNH,Ads,FireFox,M,30,1921333000.0,0,1921333338,1921318912,1921384447,Malaysia
52376,250981,2015-07-14 18:08:19,2015-09-22 21:53:04,41,ZGRXVAYSRQCOI,SEO,IE,F,28,1481700000.0,0,1481699657,1481695232,1481703423,Slovakia (SLOVAK Republic)
8313,229541,2015-03-16 07:58:06,2015-03-17 12:21:54,56,CMRRVJPYYBERM,Ads,Chrome,M,33,227990100.0,0,227990061,226492416,234881023,United States
124708,169461,2015-05-22 15:19:48,2015-08-11 06:02:38,27,SDIYDQSODDEJR,SEO,Chrome,M,23,3567145000.0,0,3567145094,3567124480,3567157247,European Union


In [None]:
# Merge with country info, keeping every transaction.
# merge_asof only guarantees lower_bound <= ip_int, so an IP that falls in a
# gap between two ranges would otherwise inherit the country of the
# preceding range.
merged_df = pd.merge_asof(
    fraud_df.sort_values('ip_int'),
    ip_df.sort_values('lower_bound_ip_address'),
    left_on='ip_int', right_on='lower_bound_ip_address'
)

# A row is a genuine match only if the IP is also <= the range's upper bound.
# Comparing against a NaN bound yields False, so transactions that found no
# candidate range at all are flagged as unmatched too.
ip_in_range = merged_df['ip_int'] <= merged_df['upper_bound_ip_address']

# Unmatched rows get a missing country, the same marker merge_asof uses for
# transactions without a candidate range.
merged_df['country'] = merged_df['country'].where(ip_in_range)

# Make sure bounds are int; -1 is the sentinel for "no matching range".
merged_df['lower_bound_ip_address'] = merged_df['lower_bound_ip_address'].where(ip_in_range, -1).astype('int64')
merged_df['upper_bound_ip_address'] = merged_df['upper_bound_ip_address'].where(ip_in_range, -1).astype('int64')

In [18]:
# Peek at five randomly chosen rows after attaching country info
merged_df.sample(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country
82860,387369,2015-01-02 11:49:46,2015-01-12 10:52:35,46,DQSTTMOQVYKZW,SEO,Opera,M,42,2357314000.0,0,2357314442,2357264000.0,2357330000.0,Taiwan; Republic of China (ROC)
42472,313150,2015-01-07 08:15:55,2015-01-07 08:15:56,65,VRYLIRAQJIIIE,SEO,Opera,F,30,1213105000.0,1,1213104903,1212940000.0,1213202000.0,United States
48717,38100,2015-03-18 05:13:22,2015-04-13 10:26:39,23,NDXXJCCHZHCUG,SEO,IE,M,24,1383218000.0,0,1383218296,1383211000.0,1383219000.0,Cyprus
107346,45755,2015-02-14 21:03:52,2015-03-06 10:09:25,28,TLYYFADISEHTN,Ads,Chrome,M,31,3064825000.0,0,3064824965,3064824000.0,3064832000.0,Japan
105371,372360,2015-07-24 23:58:28,2015-08-31 06:40:38,37,MBZVVKFZADKNY,Ads,IE,M,23,3008417000.0,0,3008416963,3008365000.0,3008627000.0,Brazil


## 🧠 Feature Engineering

In [None]:
# Hours elapsed between account signup and the purchase
# (timedelta -> seconds -> hours).
merged_df['time_since_signup'] = (
    merged_df['purchase_time']
    .sub(merged_df['signup_time'])
    .dt.total_seconds()
    .div(3600)
)

# Calendar features of the purchase timestamp: hour of the day and
# day of the week (pandas numbers Monday as 0).
purchase_ts = merged_df['purchase_time']
merged_df['hour_of_day'] = purchase_ts.dt.hour
merged_df['day_of_week'] = purchase_ts.dt.dayofweek

In [None]:
# Peek at five randomly chosen rows including the engineered features
merged_df.sample(n=5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country,time_since_signup,hour_of_day,day_of_week
112630,157735,2015-08-15 04:07:32,2015-09-14 20:26:34,34,IMZOMEVVLIVUS,Ads,Safari,F,34,3223526000.0,0,3223525811,3223526000.0,3223526000.0,United States,736.317222,20,0
39818,280723,2015-02-02 20:08:40,2015-03-21 20:11:20,49,NTDNDBLFSHJBU,Ads,IE,M,46,1140383000.0,0,1140383231,1139802000.0,1140851000.0,United States,1128.044444,20,5
28761,1297,2015-03-09 00:35:50,2015-05-12 20:42:58,64,TPMUFGIMXNNMN,Ads,Chrome,F,34,802266900.0,0,802266868,796917800.0,805306400.0,Canada,1556.118889,20,1
14958,193149,2015-07-04 08:06:32,2015-08-12 12:45:20,34,RFJLLUQAADIOR,Ads,Chrome,F,22,412677500.0,0,412677506,412663800.0,412680200.0,United States,940.646667,12,2
50467,387328,2015-07-29 10:57:11,2015-08-05 01:50:19,43,FMVXSXHZSPQFT,SEO,Chrome,M,47,1429562000.0,0,1429561925,1429209000.0,1430258000.0,Spain,158.885556,1,2


## ⚖️ Handle Class Imbalance (SMOTE)

In [None]:
# Fixed seed so the split and the synthetic samples are identical on every
# Restart & Run All.
RANDOM_STATE = 42

# Model inputs and the label column.
features = merged_df[['purchase_value', 'time_since_signup', 'hour_of_day', 'day_of_week']]
target = merged_df['class']

# Stratified 70/30 split: both partitions keep the class ratio of `target`.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, stratify=target, random_state=RANDOM_STATE
)

# Oversample the training partition only; X_test / y_test are left untouched
# so they keep the original class distribution.
smote = SMOTE(random_state=RANDOM_STATE)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Peek at five randomly chosen rows of the model input matrix
features.sample(n=5)

Unnamed: 0,purchase_value,time_since_signup,hour_of_day,day_of_week
105069,50,1648.265833,20,4
73719,54,2832.614167,17,5
100111,23,1527.428056,3,1
23115,32,900.906944,18,2
19160,43,1578.668333,16,2
