In [141]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from IndustryMap2 import industry_categories

In [142]:
df = pd.read_csv('../SavedData/dataset2_cleaned.csv')
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Remove capitalization and spaces from 'Third Party Name' column
df['Third Party Name'] = df['Third Party Name'].str.lower().str.replace(' ', '')

In [143]:
training_transactions, testing_transactions = train_test_split(df, test_size=0.2, random_state=2)

## Remove certain industries

In [144]:
training_transactions

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name,Datetime
98763,01/06/2023,11:43:00,660561861.0,1069.557925,-12.99,,sportsdirect,2023-06-01 11:43:00
202021,31/10/2023,23:59:00,229205098.0,8430.205394,-7.00,,xbox,2023-10-31 23:59:00
9195,08/01/2023,10:59:00,191161970.0,1174.555000,-52.39,,tesco,2023-01-08 10:59:00
187042,11/10/2023,08:32:00,519933553.0,-354.791496,-18.22,,tesco,2023-10-11 08:32:00
138671,01/08/2023,09:57:00,762700245.0,1756.224700,-38.97,,sportsdirect,2023-08-01 09:57:00
...,...,...,...,...,...,...,...,...
175638,30/09/2023,08:26:00,973086490.0,8501.682677,-39.28,,cooplocal,2023-09-30 08:26:00
95816,31/05/2023,23:59:00,737645298.0,-1415.637550,-18.99,,puregym,2023-05-31 23:59:00
203245,01/11/2023,03:41:00,304455868.0,3349.715880,-14.48,,topshop,2023-11-01 03:41:00
100879,05/06/2023,08:23:00,198630866.0,-222.556324,-61.23,,tesco,2023-06-05 08:23:00


In [145]:
# Find all keys in the dictionary where the value is 'subscription' or 'finance'
companies_to_remove = [company for company, category in industry_categories.items()
                       if category in ['subscription', 'finance']]
companies_to_remove.append('grandunionbjj')
training_transactions = training_transactions[~((training_transactions['Third Party Name'] == 'amazon') & (training_transactions['Amount'] == -7.99))]
testing_transactions = testing_transactions[~((testing_transactions['Third Party Name'] == 'amazon') & (testing_transactions['Amount'] == -7.99))]

training_transactions = training_transactions[~training_transactions['Third Party Name'].isin(companies_to_remove)]
testing_transactions = testing_transactions[~testing_transactions['Third Party Name'].isin(companies_to_remove)]



In [146]:
testing_transactions[testing_transactions['Third Party Name'] == 'amazon']

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name,Datetime
55126,28/03/2023,08:55:00,905860285.0,6720.051838,-9.60,,amazon,2023-03-28 08:55:00
153121,29/08/2023,19:52:00,414805878.0,11699.926168,-5.89,,amazon,2023-08-29 19:52:00
208194,08/11/2023,19:19:00,859603181.0,3724.759408,-7.58,,amazon,2023-11-08 19:19:00
142872,08/08/2023,17:33:00,815993333.0,1462.477842,-37.00,,amazon,2023-08-08 17:33:00
47910,08/03/2023,17:14:00,522188082.0,5236.474130,-33.84,,amazon,2023-03-08 17:14:00
...,...,...,...,...,...,...,...,...
183012,04/10/2023,02:52:00,873693307.0,10069.391632,-5.83,,amazon,2023-10-04 02:52:00
142051,07/08/2023,02:07:00,815385232.0,8250.566744,-42.17,,amazon,2023-08-07 02:07:00
191146,19/10/2023,04:41:00,305778936.0,9324.831923,-3.50,,amazon,2023-10-19 04:41:00
209064,10/11/2023,14:52:00,970298834.0,2858.103792,-11.78,,amazon,2023-11-10 14:52:00


In [147]:
training_payments_df = training_transactions[training_transactions['Amount'] > 0].copy()
training_expenditures_df = training_transactions[training_transactions['Amount'] < 0].copy()
training_expenditures_df["Amount"] = training_expenditures_df["Amount"].abs()

testing_payments_df = testing_transactions[testing_transactions['Amount'] > 0].copy()
testing_expenditures_df = testing_transactions[testing_transactions['Amount'] < 0].copy()
testing_expenditures_df["Amount"] = testing_expenditures_df["Amount"].abs()

In [148]:
training_expenditures_df = training_expenditures_df.copy()
testing_expenditures_df = testing_expenditures_df.copy()

training_expenditures_df['DayOfWeek'] = training_expenditures_df['Datetime'].dt.dayofweek
training_expenditures_df['DayOfMonth'] = training_expenditures_df['Datetime'].dt.day
training_expenditures_df['MonthOfYear'] = training_expenditures_df['Datetime'].dt.month

testing_expenditures_df['DayOfWeek'] = testing_expenditures_df['Datetime'].dt.dayofweek
testing_expenditures_df['DayOfMonth'] = testing_expenditures_df['Datetime'].dt.day
testing_expenditures_df['MonthOfYear'] = testing_expenditures_df['Datetime'].dt.month

In [149]:
training_expenditures_df = training_expenditures_df.copy()
testing_expenditures_df = testing_expenditures_df.copy()

training_expenditures_df['Hour'] = training_expenditures_df['Datetime'].dt.hour
testing_expenditures_df['Hour'] = testing_expenditures_df['Datetime'].dt.hour

In [150]:
patterns_train = training_expenditures_df.groupby(['Account No', 'Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Third Party Name'])['Amount'].sum().reset_index()
patterns_test = testing_expenditures_df.groupby(['Account No', 'Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Third Party Name'])['Amount'].sum().reset_index()

In [151]:
account_no = list(df['Account No'].unique())
key_list = account_no
value_list = [0] * len(account_no)
account_num_models = dict(zip(key_list, value_list))

In [152]:
for account in account_no:
    X_train = patterns_train[patterns_train['Account No'] == account][['Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Amount']]
    X_test = patterns_test[patterns_test['Account No'] == account][['Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Amount']]
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    user_model = IsolationForest(n_estimators=100, contamination=0.01)
    account_num_models[account] = user_model.fit(X_train_scaled)
    
    predictions = account_num_models[account].predict(X_test_scaled)
    
    # Save predictions to the DataFrame
    patterns_test.loc[X_test.index, 'Prediction'] = predictions
    
# Check the DataFrame to see the predictions column
print(patterns_test.head())

    Account No  Hour  DayOfWeek  DayOfMonth  MonthOfYear Third Party Name  \
0  101531259.0     8          0          19            6            tesco   
1  101531259.0     8          2          11           10            tesco   
2  101531259.0     8          5          12            8            tesco   
3  101531259.0     9          3          30           11    fivesensesart   
4  101531259.0     9          5          30            9     sportsdirect   

   Amount  Prediction  
0  161.08         1.0  
1   65.20         1.0  
2   95.92         1.0  
3    7.49         1.0  
4   30.99         1.0  


In [153]:
patterns_test[patterns_test['Prediction'] == -1].head(60)

Unnamed: 0,Account No,Hour,DayOfWeek,DayOfMonth,MonthOfYear,Third Party Name,Amount,Prediction
25,105375973.0,1,6,29,1,sainsburylocal,16.25,-1.0
184,108563213.0,10,6,2,7,matalan,333.16,-1.0
272,108931112.0,20,2,1,2,tesco,120.02,-1.0
395,116598243.0,1,0,30,10,deliveroo,14.78,-1.0
627,119943495.0,10,6,1,1,blackwell's,19.97,-1.0
657,120634201.0,3,6,3,12,justeat,22.3,-1.0
666,120634201.0,7,6,3,12,revella,49.5,-1.0
716,120634201.0,20,2,1,2,tesco,81.04,-1.0
757,122884111.0,5,0,1,5,deliveroo,24.89,-1.0
820,124102251.0,15,6,3,12,revella,221.9,-1.0


In [120]:
patterns_test[patterns_test['Account No'] == 108563213.0]

Unnamed: 0,Account No,Hour,DayOfWeek,DayOfMonth,MonthOfYear,Third Party Name,Amount,Prediction
136,108563213.0,0,6,1,1,amazon,7.99,-1.0
137,108563213.0,2,0,28,8,fatface,47.83,1.0
138,108563213.0,3,0,4,9,cooplocal,48.75,1.0
139,108563213.0,3,2,18,10,cooplocal,44.90,1.0
140,108563213.0,3,3,16,11,etsy,30.34,1.0
...,...,...,...,...,...,...,...,...
205,108563213.0,19,1,31,1,tesco,64.36,-1.0
206,108563213.0,23,1,28,2,amazon,7.99,1.0
207,108563213.0,23,1,31,10,amazon,7.99,1.0
208,108563213.0,23,2,31,5,amazon,7.99,1.0


In [105]:
import matplotlib.pyplot as plt

account = 678330503.0

# Predict on the test dataset
predictions = account_num_models[account].predict(X_test_scaled)

print(X_test_scaled)
print(predictions)

# Analyze predictions
#n_outliers = (predictions == -1).sum()
#n_inliers = (predictions == 1).sum()

#print(f"Number of outliers detected: {n_outliers}")
#print(f"Number of inliers detected: {n_inliers}")

# Plot results (if feasible)
#plt.figure(figsize=(10, 6))
#plt.scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], c=predictions, cmap='coolwarm', edgecolor='k', s=50)
#plt.title('Isolation Forest Outlier Detection')
#plt.xlabel('Feature 1')
#plt.ylabel('Feature 2')
#plt.show()

[[-2.61259834 -1.05018253  0.05450421 -0.09401777 -0.97974706]
 [-2.10484955  0.46318094  0.49861256  1.38340428 -0.88579437]
 [-1.59710075 -1.05018253 -0.50063123  0.49695105 -0.08033482]
 [-1.59710075 -1.05018253  0.05450421 -0.09401777 -0.25398895]
 [-1.08935196 -1.05018253 -0.61165832 -1.27595541  0.27383513]
 [-0.58160316 -1.05018253 -1.16679376  1.08791987 -0.99109527]
 [-0.58160316 -1.05018253  0.38758547  1.08791987  0.11390444]
 [-0.07385437 -0.54572804  1.053748    1.38340428  0.75151592]
 [ 0.43389442 -0.04127355 -0.38960415  1.38340428 -1.09560444]
 [ 0.43389442 -0.04127355  0.05450421 -0.980471    0.80403442]
 [ 0.43389442 -0.04127355  0.38758547  1.38340428  0.69926134]
 [ 0.43389442  0.96763543 -0.50063123 -0.980471   -1.17477805]
 [ 0.43389442  0.96763543  1.71991053  0.20146664 -0.34371904]
 [ 0.43389442  1.47208992  0.38758547 -0.980471   -0.35770638]
 [ 0.43389442  1.47208992  1.053748    0.20146664  0.17170117]
 [ 0.43389442  1.47208992  1.38682926 -0.09401777  0.91