In [33]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve, auc
import category_encoders as ce

In [34]:
file_path = './Dataset/preprocessed_dataset_v2.csv'

In [35]:
chunk_size = 10**6
chunks = []
for chunk in pd.read_csv(file_path, chunksize=chunk_size, index_col=0):
    chunks.append(chunk)

In [41]:
data = pd.concat(chunks, axis=0)

In [42]:
data=data.drop(columns=["Payment Method"])

### Convert the date columns to a datetime object

In [38]:
data['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day', 'Hour', 'Minute']])

### Group by month and category

In [39]:
monthly_data = data.groupby([data['Date'].dt.to_period('M'), 'Category']).agg({
    'Amount': ['sum', 'count']
}).reset_index()

In [40]:
monthly_data

Unnamed: 0_level_0,Date,Category,Amount,Amount
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count
0,1991-01,Beauty Products,2.08,1
1,1991-01,Clothing,739.04,4
2,1991-01,Food/Groceries,411.42,14
3,1991-01,Healthcare,177.11,1
4,1991-01,Home Goods,27.00,1
...,...,...,...,...
4759,2020-02,Services,147092.29,4111
4760,2020-02,Subscriptions,111887.98,1544
4761,2020-02,Transportation,1325563.01,41229
4762,2020-02,Travel & Entertainment,517617.48,5480


### Flatten the column names

In [30]:
monthly_data.columns = ['Month', 'Category', 'Total_Amount', 'Frequency']

### Create features for the original data

In [31]:
data['Month'] = data['Date'].dt.to_period('M')

In [32]:
data

Unnamed: 0,Year,Month,Day,Hour,Minute,Amount,Category,Is Fraud?,Date
0,2002,2002-09,1,6,21,134.09,Personal Spending,No,2002-09-01 06:21:00
1,2002,2002-09,1,6,42,38.48,Food/Groceries,No,2002-09-01 06:42:00
2,2002,2002-09,2,6,22,120.34,Food/Groceries,No,2002-09-02 06:22:00
3,2002,2002-09,2,17,45,128.95,Clothing,No,2002-09-02 17:45:00
4,2002,2002-09,3,6,23,104.71,Healthcare,No,2002-09-03 06:23:00
...,...,...,...,...,...,...,...,...,...
23998444,2020,2020-02,27,22,23,-54.00,Transportation,No,2020-02-27 22:23:00
23998445,2020,2020-02,27,22,24,54.00,Transportation,No,2020-02-27 22:24:00
23998446,2020,2020-02,28,7,43,59.15,Transportation,No,2020-02-28 07:43:00
23998447,2020,2020-02,28,20,10,43.12,Transportation,No,2020-02-28 20:10:00


### Merge the aggregated data back to the original data

In [9]:
df = data.merge(monthly_data, on=['Month', 'Category'], how='left')

### Drop the Date column as it's no longer needed

In [10]:
df = data.drop(columns=['Date'])

In [11]:
data

Unnamed: 0,Year,Month,Day,Hour,Minute,Payment Method,Amount,Category,Is Fraud?,Total_Amount,Frequency
0,2002,2002-09,1,6,21,on-site,134.09,Personal Spending,No,266554.53,3983
1,2002,2002-09,1,6,42,on-site,38.48,Food/Groceries,No,134050.22,6997
2,2002,2002-09,2,6,22,on-site,120.34,Food/Groceries,No,134050.22,6997
3,2002,2002-09,2,17,45,on-site,128.95,Clothing,No,103212.70,1850
4,2002,2002-09,3,6,23,on-site,104.71,Healthcare,No,130554.45,1976
...,...,...,...,...,...,...,...,...,...,...,...
23998444,2020,2020-02,27,22,23,Chip Transaction,-54.00,Transportation,No,1325563.01,41229
23998445,2020,2020-02,27,22,24,Chip Transaction,54.00,Transportation,No,1325563.01,41229
23998446,2020,2020-02,28,7,43,Chip Transaction,59.15,Transportation,No,1325563.01,41229
23998447,2020,2020-02,28,20,10,Chip Transaction,43.12,Transportation,No,1325563.01,41229


### Define high and low priority based on total amount and frequency

In [12]:
def assign_priority(row):
    if row['Total_Amount'] > df['Total_Amount'].median() and row['Frequency'] > df['Frequency'].median():
        return 1  # High priority
    else:
        return 0  # Low priority

df['Priority'] = df.apply(assign_priority, axis=1)

KeyboardInterrupt: 