In [1]:
import pandas as pd
import numpy as np
import os
from geopy.distance import geodesic
from datetime import datetime , date
import pickle
import joblib
import matplotlib.pyplot as plt

In [2]:
# Relative locations of the train / test CSV files; both are resolved against
# the notebook's current working directory.
TRAIN_DATASET_PATH = "Dataset/fraudTrain.csv"
TEST_DATASET_PATH = "Dataset/fraudTest.csv"

In [3]:
def loadData(path):
    """Read the CSV file at ``path`` into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed file contents.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``. Failing here, with the offending path
        in the message, replaces the previous silent ``None`` return that only
        surfaced later as an unrelated ``AttributeError`` in the caller.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset file not found: {path!r}")
    return pd.read_csv(path)

In [32]:
class customTransformer:
    """Feature-engineering helper for a transaction DataFrame with a fraud label.

    ``transform()`` derives time- and amount-based numeric features and then
    drops every column that is not listed in ``RELEVANT_COLUMNS``.

    Attributes
    ----------
    data : pd.DataFrame
        The frame being transformed. Until ``removeNonRelevantFeatures`` runs
        this is the very object passed to ``__init__`` (no copy is made), so
        the derived columns are also added to the caller's frame.
    Numerical_Columns, Categorical_Columns
        Column-name collections; supplied by the caller and replaced with the
        result of ``getColumns`` after irrelevant columns are dropped.
    target : str
        Name of the label column (``"is_fraud"``).
    X, y
        Initialised to ``None``; no method of this class assigns them.
    """

    # Columns retained by removeNonRelevantFeatures(); everything else is dropped.
    RELEVANT_COLUMNS = frozenset([
        "is_fraud", "amt_600_1200", "amt", "amt_200_400", 'is_night_transaction',
        "hour", "age", 'merchant', 'city', 'job', 'state', 'category',
    ])

    def __init__(self , data:pd.DataFrame , numeric:list , categorical:list):
        """Store the frame and the caller-provided column groupings.

        Parameters
        ----------
        data : pd.DataFrame
            Transaction frame; must contain ``unix_time``, ``dob`` and ``amt``
            for ``createNumericalFeatures`` to work.
        numeric, categorical
            Names of the numeric / categorical columns of ``data``.
        """
        self.data = data
        self.Numerical_Columns = numeric
        self.Categorical_Columns = categorical
        self.target = "is_fraud"
        self.X = None
        self.y = None

    def transform(self):
        """Run the pipeline: derive features, then drop irrelevant columns."""
        self.createNumericalFeatures()
        self.removeNonRelevantFeatures()

    def createNumericalFeatures(self):
        """Add derived numeric columns to ``self.data`` in place.

        Overwrites ``trans_date_trans_time`` (rebuilt from ``unix_time``) and
        ``dob`` (parsed to datetime), and adds ``hour``, ``age``,
        ``is_night_transaction``, ``amt_200_400`` and ``amt_600_1200``.
        """
        frame = self.data
        # unix_time is interpreted as seconds since the epoch; no timezone
        # conversion is applied to the resulting timestamps.
        frame['trans_date_trans_time'] = pd.to_datetime(frame['unix_time'], unit='s')
        frame['dob'] = pd.to_datetime(frame['dob'])

        # Time-based features
        timestamps = frame['trans_date_trans_time']
        frame['hour'] = timestamps.dt.hour
        # NOTE(review): plain difference of calendar years -- it does not check
        # whether the birthday has already occurred in the transaction year.
        frame['age'] = timestamps.dt.year - frame['dob'].dt.year

        # Night = 00:00-05:59 or 20:00-23:59. Vectorized masks replace the
        # per-row Python lambdas; missing hours compare False and map to 0.
        hour = frame['hour']
        frame['is_night_transaction'] = (hour.between(0, 5) | hour.ge(20)).astype('int64')

        # Transaction amount range flags (inclusive bounds): 200-400 and 600-1200.
        amount = frame['amt']
        frame['amt_200_400'] = amount.between(200, 400).astype('int64')
        frame['amt_600_1200'] = amount.between(600, 1200).astype('int64')

    def removeNonRelevantFeatures(self):
        """Drop every column not in ``RELEVANT_COLUMNS`` and refresh column groups.

        Prints the full column set and the set being dropped, rebinds
        ``self.data`` to the reduced frame, and recomputes
        ``Numerical_Columns`` / ``Categorical_Columns`` with the notebook-level
        ``getColumns`` helper.
        """
        columns = set(self.data.columns)
        print(columns)
        cols_to_drop = columns.difference(self.RELEVANT_COLUMNS)
        print(cols_to_drop)
        self.data = self.data.drop(columns=list(cols_to_drop))
        self.Numerical_Columns , self.Categorical_Columns = getColumns(self.data)

In [28]:
def calculate_distance(row):
    """Return the geodesic distance, in kilometres, between the two points of ``row``.

    ``row`` must support item access for ``lat``/``long`` (first point) and
    ``merch_lat``/``merch_long`` (second point).
    """
    origin, destination = (
        (row[lat_key], row[lon_key])
        for lat_key, lon_key in (('lat', 'long'), ('merch_lat', 'merch_long'))
    )
    return geodesic(origin, destination).kilometers
    
def toAge(row):
    """Return the age in completed years, as of today, for ``row["dob"]``.

    ``row["dob"]`` must be a ``YYYY-MM-DD`` string. One year is subtracted when
    this year's birthday has not been reached yet.
    """
    birthday = datetime.strptime(row["dob"] , "%Y-%m-%d").date()
    now = date.today()
    years = now.year - birthday.year
    if (now.month, now.day) < (birthday.month, birthday.day):
        years -= 1
    return years

def getColumns(dataframe:pd.DataFrame):
    """Split the feature columns of ``dataframe`` into numeric and categorical names.

    The label column ``is_fraud`` is excluded from both groups. A column counts
    as categorical when its dtype is ``object``, ``category`` or a pandas string
    dtype; checking only ``object`` would put ``category`` / ``string`` columns
    into the numeric group.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[set, set]
        ``(numeric_columns, categorical_columns)`` as sets of column names.
    """
    numeric_columns = set()
    categorical_columns = set()
    # Iterating dtypes avoids a per-column frame lookup.
    for col, dtype in dataframe.dtypes.items():
        if col == "is_fraud":
            continue
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        )
        (categorical_columns if is_categorical else numeric_columns).add(col)
    return (numeric_columns , categorical_columns)

In [12]:
# Load the training split and discard the "Unnamed: 0" column
# (presumably a row index saved with the CSV -- confirm against the file).
training_data = loadData(TRAIN_DATASET_PATH).drop(columns=["Unnamed: 0"])

In [33]:
# Classify the raw columns into numeric / categorical name sets (as decided by
# getColumns) and hand them, together with the frame itself, to the transformer.
n , c = getColumns(training_data)
preprocessing = customTransformer(training_data , n, c)

In [34]:
# Run feature engineering: adds the derived columns, then drops every column
# outside the transformer's relevant-column list (both column sets are printed).
preprocessing.transform()

{'hour', 'state', 'unix_time', 'lat', 'trans_date_trans_time', 'long', 'job', 'is_night_transaction', 'amt', 'merchant', 'street', 'merch_lat', 'city_pop', 'first', 'trans_num', 'merch_long', 'dob', 'category', 'is_fraud', 'city', 'amt_600_1200', 'gender', 'age', 'last', 'cc_num', 'amt_200_400', 'zip'}
{'trans_num', 'lat', 'gender', 'trans_date_trans_time', 'long', 'merch_long', 'street', 'merch_lat', 'dob', 'last', 'cc_num', 'city_pop', 'zip', 'unix_time', 'first'}


In [35]:
# Display the transformed frame held by the transformer.
preprocessing.data

Unnamed: 0,merchant,category,amt,city,state,job,is_fraud,hour,age,is_night_transaction,amt_200_400,amt_600_1200
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Moravian Falls,NC,"Psychologist, counselling",0,0,24,1,0,0
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,Special educational needs teacher,0,0,34,1,0,0
2,fraud_Lind-Buckridge,entertainment,220.11,Malad City,ID,Nature conservation officer,0,0,50,1,1,0
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Boulder,MT,Patent attorney,0,0,45,1,0,0
4,fraud_Keeling-Crist,misc_pos,41.96,Doe Hill,VA,Dance movement psychotherapist,0,0,26,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,fraud_Reichel Inc,entertainment,15.56,Hatch,UT,Geoscientist,0,12,52,0,0,0
1296671,fraud_Abernathy and Sons,food_dining,51.70,Tuscarora,MD,"Production assistant, television",0,12,34,0,0,0
1296672,fraud_Stiedemann Ltd,food_dining,105.93,High Rolls Mountain Park,NM,Naval architect,0,12,46,0,0,0
1296673,"fraud_Reinger, Weissnat and Strosin",food_dining,74.90,Manderson,SD,Volunteer coordinator,0,12,33,0,0,0


In [36]:
# Pairwise correlation matrix of the numeric columns. The frame still holds
# string columns (merchant, city, job, state, category); numeric_only=True
# excludes them explicitly, because relying on corr() to drop them implicitly
# is deprecated and raises in pandas >= 2.0.
mat = preprocessing.data.corr(numeric_only=True)

  mat = preprocessing.data.corr()


In [38]:
# Rank the columns by their correlation with the fraud label, largest value first.
mat["is_fraud"].sort_values(ascending=False)

is_fraud                1.000000
amt_600_1200            0.424065
amt                     0.219404
amt_200_400             0.100966
is_night_transaction    0.074610
hour                    0.013799
age                     0.012453
Name: is_fraud, dtype: float64