In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [11]:
# Load the raw fraudTest.csv transactions (path is relative to the notebook's
# working directory).
df = pd.read_csv("./datasets/fraudTest.csv")

# Only the last bare expression of a cell is rendered, so the intermediate
# summaries are handed to display() explicitly instead of being discarded.
display(df.describe())
df.info()
display(df.head())
df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
555714,555714,2020-12-31 23:59:07,30560609640617,fraud_Reilly and Sons,health_fitness,43.77,Michael,Olson,M,558 Michael Estates,...,40.4931,-91.8912,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
555715,555715,2020-12-31 23:59:09,3556613125071656,fraud_Hoppe-Parisian,kids_pets,111.84,Jose,Vasquez,M,572 Davis Mountains,...,29.0393,-95.4401,28739,Futures trader,1999-12-27,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
555716,555716,2020-12-31 23:59:15,6011724471098086,fraud_Rau-Robel,kids_pets,86.88,Ann,Lawson,F,144 Evans Islands Apt. 683,...,46.1966,-118.9017,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.65834,-119.715054,0
555717,555717,2020-12-31 23:59:24,4079773899158,fraud_Breitenberg LLC,travel,7.99,Eric,Preston,M,7020 Doyle Stream Apt. 951,...,44.6255,-116.4493,129,Cartographer,1965-12-15,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0
555718,555718,2020-12-31 23:59:34,4170689372027579,fraud_Dare-Marvin,entertainment,38.13,Samuel,Frey,M,830 Myers Plaza Apt. 384,...,35.6665,-97.4798,116001,Media buyer,1993-05-10,1765bb45b3aa3224b4cdcb6e7a96cee3,1388534374,36.210097,-97.036372,0


In [12]:
# List the column labels of the frame currently bound to `df`.
df.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [34]:
# Remove the cardholder name columns and the `Unnamed: 0` column (presumably a
# saved row index). Reassigning instead of inplace=True, together with
# errors="ignore", keeps this cell safe to re-run once the columns are gone.
df = df.drop(columns=["first", "last", "Unnamed: 0"], errors="ignore")

In [36]:
# Print dtypes and non-null counts of `df` after the column drop.
# NOTE(review): the recorded output lists 1,296,675 rows, while the load cell
# above reports 555,719 — `df` was presumably re-loaded from another file in a
# cell that ran out of order; confirm which dataset this is.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   gender                 1296675 non-null  object 
 6   street                 1296675 non-null  object 
 7   city                   1296675 non-null  object 
 8   state                  1296675 non-null  object 
 9   zip                    1296675 non-null  int64  
 10  lat                    1296675 non-null  float64
 11  long                   1296675 non-null  float64
 12  city_pop               1296675 non-null  int64  
 13  job                    1296675 non-null  object 
 14  dob               

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_and_save_fraud_dataset(
    df: pd.DataFrame, output_path: str, encoders: "dict | None" = None
) -> dict:
    """
    Convert the raw fraud transactions into a numeric table and write it to CSV.

    The input frame is copied first, so the caller's ``df`` is left untouched.

    Adds:
        - ``trans_datetime`` (parsed from ``trans_date_trans_time``) plus
          ``trans_year``, ``trans_month``, ``trans_day``, ``trans_hour`` and
          ``trans_dow`` (0=Mon..6=Sun)
    Renames:
        - ``trans_num`` -> ``transaction_id``
    Encodes as integer category codes:
        - ``merchant``, ``job``, ``city``, ``street``,
          ``gender``, ``category``, ``state``
    Standardizes (StandardScaler):
        - ``lat``, ``long``, ``merch_lat``, ``merch_long``
    Drops:
        - ``dob``, ``unix_time``, ``trans_date_trans_time``

    Args:
        df: Raw transactions containing the columns listed above.
        output_path: Destination of the CSV (written without the index).
        encoders: Optional dict returned by an earlier call of this function.
            For every column that has a mapping in it, the stored
            ``{code: value}`` mapping is reused instead of being re-derived,
            and a stored ``"coords_scaler"`` is applied with ``transform``
            instead of being refitted. This keeps codes and scaling identical
            across files (e.g. train and test). Values missing from a stored
            mapping are encoded as ``-1``. Entries that are absent are fitted
            on ``df``. The passed dict itself is not modified.

    Returns:
        dict with one ``{code: original value}`` mapping per encoded column and
        the coordinate scaler under ``"coords_scaler"``.
    """
    df = df.copy()
    # Shallow copy so the caller's dict is never mutated.
    fitted = dict(encoders) if encoders else {}

    # --- Parse transaction datetime and derive calendar components ---
    df["trans_datetime"] = pd.to_datetime(df["trans_date_trans_time"])
    df["trans_year"] = df["trans_datetime"].dt.year
    df["trans_month"] = df["trans_datetime"].dt.month
    df["trans_day"] = df["trans_datetime"].dt.day
    df["trans_hour"] = df["trans_datetime"].dt.hour
    df["trans_dow"] = df["trans_datetime"].dt.weekday  # 0=Mon..6=Sun

    # Keep the original transaction ID under a clearer name.
    df = df.rename(columns={"trans_num": "transaction_id"})

    # --- Integer-encode every categorical column and remember the mapping ---
    categorical_cols = [
        "merchant", "job", "city", "street",  # high-cardinality IDs
        "gender", "category", "state",        # low-cardinality categories
    ]
    for col in categorical_cols:
        known = fitted.get(col)
        if known is None:
            # Fresh fit: pandas derives the categories from the data.
            cat_dtype = "category"
        else:
            # Reuse: rebuild the category list in code order so that every
            # value receives the same integer as in the earlier call.
            cat_dtype = pd.CategoricalDtype(
                categories=[known[code] for code in sorted(known)]
            )
        as_category = df[col].astype(cat_dtype)
        fitted[col] = dict(enumerate(as_category.cat.categories))
        df[col] = as_category.cat.codes  # -1 marks values outside the categories

    # --- Standardize coordinates ---
    coord_cols = ["lat", "long", "merch_lat", "merch_long"]
    scaler_coords = fitted.get("coords_scaler")
    if scaler_coords is None:
        scaler_coords = StandardScaler()
        df[coord_cols] = scaler_coords.fit_transform(df[coord_cols])
    else:
        # Apply the previously fitted statistics; do not refit on this frame.
        df[coord_cols] = scaler_coords.transform(df[coord_cols])
    fitted["coords_scaler"] = scaler_coords

    # --- Drop columns that are not part of the final dataset ---
    df = df.drop(columns=["dob", "unix_time", "trans_date_trans_time"])

    # --- Save to CSV ---
    df.to_csv(output_path, index=False)
    print(f"✅ Preprocessed dataset saved to: {output_path}")

    return fitted


In [15]:
# Preprocess the frame currently bound to `df`, write the result to
# preprocessed_fraud_test.csv, and keep the returned encoders for decoding.
encoders = preprocess_and_save_fraud_dataset(
    df=df,
    output_path="preprocessed_fraud_test.csv",
)

✅ Preprocessed dataset saved to: preprocessed_fraud_test.csv


In [3]:
# Reload preprocessed data from disk. This rebinds `df`, replacing whatever
# frame earlier cells left in that name.
df = pd.read_csv("./preprocessed_fraud.csv")

In [4]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,street,city,state,zip,lat,...,transaction_id,merch_lat,merch_long,is_fraud,trans_datetime,trans_year,trans_month,trans_day,trans_hour,trans_dow
0,2703186189652095,514,8,4.97,0,568,526,27,28654,-0.48442,...,0b242abb623afc578575680df30655b9,-0.494354,0.593864,0,2019-01-01 00:00:18,2019,1,1,0,1
1,630423337322,241,4,107.23,0,435,612,47,99160,2.03912,...,1f76529f8574734946361c461b024d99,2.078699,-2.030341,0,2019-01-01 00:00:44,2019,1,1,0,1
2,38859492057661,390,0,220.11,1,602,468,13,83252,0.717754,...,a1a22d70485983eac12b5b88dad1cf95,0.902849,-1.592323,0,2019-01-01 00:00:51,2019,1,1,0,1
3,3534093764340240,360,2,45.0,1,930,84,26,59632,1.515617,...,6b849c168bdad6f867558c3793159a81,1.662886,-1.621848,0,2019-01-01 00:01:16,2019,1,1,0,1
4,375534208663984,297,9,41.96,1,418,216,45,24433,-0.023035,...,a41d7549acf90789359a9aa5346dcb46,0.026941,0.841909,0,2019-01-01 00:03:06,2019,1,1,0,1


In [5]:
# List the column labels of the preprocessed frame.
df.columns

Index(['cc_num', 'merchant', 'category', 'amt', 'gender', 'street', 'city',
       'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'transaction_id',
       'merch_lat', 'merch_long', 'is_fraud', 'trans_datetime', 'trans_year',
       'trans_month', 'trans_day', 'trans_hour', 'trans_dow'],
      dtype='object')

In [7]:
# Print dtypes and non-null counts of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   cc_num          1296675 non-null  int64  
 1   merchant        1296675 non-null  int64  
 2   category        1296675 non-null  int64  
 3   amt             1296675 non-null  float64
 4   gender          1296675 non-null  int64  
 5   street          1296675 non-null  int64  
 6   city            1296675 non-null  int64  
 7   state           1296675 non-null  int64  
 8   zip             1296675 non-null  int64  
 9   lat             1296675 non-null  float64
 10  long            1296675 non-null  float64
 11  city_pop        1296675 non-null  int64  
 12  job             1296675 non-null  int64  
 13  transaction_id  1296675 non-null  object 
 14  merch_lat       1296675 non-null  float64
 15  merch_long      1296675 non-null  float64
 16  is_fraud        1296675 non-null  in

In [8]:
# Count the rows whose is_fraud label equals 1 and report the total.
num_fraud = df["is_fraud"].eq(1).sum()
print("Number of fraudulent transactions:", num_fraud)

Number of fraudulent transactions: 7506


In [9]:

def create_balanced_subset(
    input_csv: str,
    output_csv: str = "./preprocessed_fraud.csv",
    n_nonfraud: int = 150_000,
    random_state: int = 42,
):
    """
    Write a reduced dataset holding every fraud row plus a random sample of
    non-fraud rows.

    Args:
        input_csv: CSV to read; must contain an ``is_fraud`` column with 0/1 labels.
        output_csv: Where the shuffled subset is written (without the index).
            If this is the same path as ``input_csv`` the source file is overwritten.
        n_nonfraud: Number of non-fraud rows to sample. Capped at the number of
            non-fraud rows available, so small inputs do not raise.
        random_state: Seed used for both the sampling and the final shuffle.

    Returns:
        None. The subset is saved to ``output_csv`` and a summary is printed.
    """
    # Load full dataset
    df = pd.read_csv(input_csv)

    # Separate fraud and non-fraud
    df_fraud = df[df["is_fraud"] == 1]
    df_nonfraud = df[df["is_fraud"] == 0]

    # Sampling without replacement cannot exceed the population size.
    n_sampled = min(n_nonfraud, len(df_nonfraud))
    df_nonfraud_sample = df_nonfraud.sample(n=n_sampled, random_state=random_state)

    # Combine and shuffle so the fraud rows are not grouped at the top.
    df_subset = (
        pd.concat([df_fraud, df_nonfraud_sample])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # Save to CSV
    df_subset.to_csv(output_csv, index=False)
    print(f"✅ Balanced subset saved to: {output_csv}")
    # Report the number actually sampled, not the requested number.
    print(f"➡️ Fraud cases: {len(df_fraud)}, Non-fraud sampled: {n_sampled}, Total: {len(df_subset)}")

# Usage
# NOTE(review): output_csv defaults to "./preprocessed_fraud.csv", the same
# path that is passed as the input here, so this call replaces the full file
# with the subset. A re-run then samples from the already reduced file —
# confirm this is intended.
create_balanced_subset("./preprocessed_fraud.csv")

✅ Balanced subset saved to: ./preprocessed_fraud.csv
➡️ Fraud cases: 7506, Non-fraud sampled: 150000, Total: 157506


In [9]:
import xgboost as xgb
# Build the Booster and load the saved model in one step via `model_file`.
# NOTE(review): the file carries a .pkl extension; Booster loading expects a
# model written by XGBoost's own save_model, not a Python pickle — confirm how
# this file was produced.
xgb_model = xgb.Booster(model_file="./models/xgb_models/xgb_model.pkl")

In [2]:
import pandas as pd

In [3]:
# Load the preprocessed test CSV; this rebinds `df`.
df = pd.read_csv("./preprocessed_fraud_test.csv")

In [4]:
# Preview the first rows of the preprocessed test frame.
# NOTE(review): the recorded output still shows `Unnamed: 0`, `first` and `last`,
# so the name columns were apparently not dropped before this file was
# written — confirm, since the output exposes cardholder names.
df.head()

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,...,transaction_id,merch_lat,merch_long,is_fraud,trans_datetime,trans_year,trans_month,trans_day,trans_hour,trans_dow
0,0,2291163933867244,319,10,2.86,Jeff,Elliott,1,341,157,...,2da90c7d74bd46a0caf3777415b3ebd3,-0.894145,0.657586,0,2020-06-21 12:14:25,2020,6,21,12,6
1,1,3573030041201292,591,10,29.84,Joanne,Williams,0,354,16,...,324cc204407e99f51b0d6ca0055005e7,0.178126,-1.43661,0,2020-06-21 12:14:33,2020,6,21,12,6
2,2,3598215285024754,611,5,41.28,Ashley,Lopez,0,865,61,...,c81755dbbbea9d5c77f094348a7579be,0.383257,1.16764,0,2020-06-21 12:14:53,2020,6,21,12,6
3,3,3591919803438423,222,9,60.05,Brian,Williams,1,320,764,...,2159175b9efe66dc301f149d3d5abf8c,-1.909485,0.680717,0,2020-06-21 12:15:15,2020,6,21,12,6
4,4,3526826139003047,292,13,3.19,Nathan,Massey,1,548,247,...,57ff021bd3f328f8738bb535c302a31b,1.259139,0.31651,0,2020-06-21 12:15:17,2020,6,21,12,6


In [5]:
# Show only the rows labelled as fraud (is_fraud == 1).
df.loc[df["is_fraud"].eq(1)]

Unnamed: 0.1,Unnamed: 0,cc_num,merchant,category,amt,first,last,gender,street,city,...,transaction_id,merch_lat,merch_long,is_fraud,trans_datetime,trans_year,trans_month,trans_day,trans_hour,trans_dow
1685,1685,3560725013359375,226,5,24.84,Brooke,Smith,0,606,557,...,16bf2e46c54369a8eab2214649506425,-1.170944,-0.900958,1,2020-06-21 22:06:39,2020,6,21,22,6
1767,1767,6564459919350820,523,8,780.52,Douglas,Willis,1,587,64,...,ab4b379d2c0c9c667d46508d4e126d72,0.768929,-0.066683,1,2020-06-21 22:32:22,2020,6,21,22,6
1781,1781,6564459919350820,451,0,620.33,Douglas,Willis,1,587,64,...,47a9987ae81d99f7832a54b29a77bf4b,0.829902,0.005317,1,2020-06-21 22:37:27,2020,6,21,22,6
1784,1784,4005676619255478,238,11,1077.69,William,Perry,1,440,197,...,fe956c7e4a253c437c18918bf96f7b62,-1.439968,-0.002200,1,2020-06-21 22:38:55,2020,6,21,22,6
1857,1857,3560725013359375,246,12,842.65,Brooke,Smith,0,606,557,...,f6838c01f5d2262006e6b71d33ba7c6d,-1.418223,-0.910577,1,2020-06-21 23:02:16,2020,6,21,23,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517197,517197,2242176657877538,670,12,1041.51,Travis,Daniel,1,115,701,...,35b0297dd026d2e9a75d024a5dec7955,-0.778937,0.023328,1,2020-12-22 22:05:48,2020,12,22,22,1
517274,517274,2242176657877538,349,12,868.09,Travis,Daniel,1,115,701,...,da7f67d7375f10a054a3d919448c45dd,-0.873572,-0.011595,1,2020-12-22 22:18:07,2020,12,22,22,1
517341,517341,2242176657877538,443,11,1039.42,Travis,Daniel,1,115,701,...,25b076c7bcd70f272c1c5326bb234f4b,-0.768151,-0.003888,1,2020-12-22 22:31:48,2020,12,22,22,1
517529,517529,2242176657877538,29,4,289.27,Travis,Daniel,1,115,701,...,2df7d894868fbc99ec1d8b055585fc9d,-0.745068,-0.012358,1,2020-12-22 23:06:03,2020,12,22,23,1


In [None]:
Primary Recommendation:
Edge 62 - This is the first fraud example found in the graph data
Additional Fraud Examples:
Edge 721
Edge 729
Edge 909
Edge 1672
Edge 2754
Edge 2811
Edge 2960
Edge 3059
Edge 4635