In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [90]:
# Load the cleaned transactions and convert the 'Datetime' column from text to
# real timestamps so the `.dt` accessors used in later cells are available.
df = (
    pd.read_csv('../SavedData/dataset2_cleaned.csv')
    .assign(Datetime=lambda frame: pd.to_datetime(frame['Datetime']))
)

In [91]:
# Hold out 20% of the transactions for testing; the fixed seed keeps the split
# identical between runs.
training_transactions, testing_transactions = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
)

In [92]:
# Positive amounts are treated as payments, negative amounts as expenditures.
# Expenditure amounts are stored as absolute values; rows with Amount == 0
# belong to neither group.
training_payments_df = training_transactions.loc[training_transactions['Amount'].gt(0)].copy()
training_expenditures_df = (
    training_transactions
    .loc[training_transactions['Amount'].lt(0)]
    .assign(Amount=lambda frame: frame['Amount'].abs())
)

testing_payments_df = testing_transactions.loc[testing_transactions['Amount'].gt(0)].copy()
testing_expenditures_df = (
    testing_transactions
    .loc[testing_transactions['Amount'].lt(0)]
    .assign(Amount=lambda frame: frame['Amount'].abs())
)

In [93]:
# Derive calendar features from the timestamp. `.assign` returns a new frame,
# so the inputs are not modified in place.
training_expenditures_df = training_expenditures_df.assign(
    DayOfWeek=lambda frame: frame['Datetime'].dt.dayofweek,
    DayOfMonth=lambda frame: frame['Datetime'].dt.day,
    MonthOfYear=lambda frame: frame['Datetime'].dt.month,
)

testing_expenditures_df = testing_expenditures_df.assign(
    DayOfWeek=lambda frame: frame['Datetime'].dt.dayofweek,
    DayOfMonth=lambda frame: frame['Datetime'].dt.day,
    MonthOfYear=lambda frame: frame['Datetime'].dt.month,
)

In [94]:
# Add the hour of day of each expenditure as a further feature; `.assign`
# hands back a fresh frame rather than mutating the existing one.
training_expenditures_df = training_expenditures_df.assign(
    Hour=lambda frame: frame['Datetime'].dt.hour,
)
testing_expenditures_df = testing_expenditures_df.assign(
    Hour=lambda frame: frame['Datetime'].dt.hour,
)

In [95]:
# Sum expenditure amounts per (account, time features, timestamp). Because the
# exact 'Datetime' is one of the keys, only rows sharing an account and an
# identical timestamp are merged into a single row.
pattern_keys = ['Account No', 'Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Datetime']

patterns_train = (
    training_expenditures_df
    .groupby(pattern_keys)['Amount']
    .sum()
    .reset_index()
)
patterns_test = (
    testing_expenditures_df
    .groupby(pattern_keys)['Amount']
    .sum()
    .reset_index()
)

In [96]:
# Registry of per-account models, keyed by account number. Every account seen
# in the full dataset starts with the placeholder 0 and is replaced by a
# fitted model in the training loop.
#
# Fix: the original referenced `Account_Nos`, a name that is never defined in
# this notebook, so the cell raised NameError on a fresh kernel. The list built
# on the first line (`account_no`) is what the training loop iterates over, so
# it is used here as well.
account_no = list(df['Account No'].unique())
key_list = account_no
value_list = [0] * len(account_no)
account_num_models = dict(zip(key_list, value_list))

In [122]:
# Train one IsolationForest per account on that account's scaled expenditure
# patterns, then label the account's held-out rows in patterns_test
# ('Prediction' is -1 for an outlier, 1 for an inlier).
feature_cols = ['Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Amount']

# Start from a clean column so re-running the cell never leaves stale labels;
# rows that cannot be scored stay NaN.
patterns_test['Prediction'] = np.nan

for account in account_no:
    X_train = patterns_train.loc[patterns_train['Account No'] == account, feature_cols]
    X_test = patterns_test.loc[patterns_test['Account No'] == account, feature_cols]

    # account_no is built from every transaction in df, so an account may have
    # no expenditure rows in the training split. StandardScaler raises on an
    # empty matrix, so such accounts keep their placeholder entry.
    if X_train.empty:
        continue

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # A fixed random_state makes the forest, and therefore the predictions,
    # reproducible across runs (same seed as the train/test split).
    user_model = IsolationForest(n_estimators=100, contamination=0.05, random_state=2)
    account_num_models[account] = user_model.fit(X_train_scaled)

    # The model is still stored for accounts with nothing to score.
    if X_test.empty:
        continue

    X_test_scaled = scaler.transform(X_test)
    predictions = account_num_models[account].predict(X_test_scaled)

    # Save predictions to the DataFrame, aligned on the account's row index
    patterns_test.loc[X_test.index, 'Prediction'] = predictions

# Check the DataFrame to see the predictions column
print(patterns_test.head())

    Account No  Hour  DayOfWeek  DayOfMonth  MonthOfYear            Datetime  \
0  101531259.0     0          2           1            2 2023-02-01 00:00:00   
1  101531259.0     0          4           1            9 2023-09-01 00:00:00   
2  101531259.0     0          4           1           12 2023-12-01 00:00:00   
3  101531259.0     0          5           1            4 2023-04-01 00:00:00   
4  101531259.0     8          0          19            6 2023-06-19 08:37:00   

   Amount  Prediction  
0  859.00        -1.0  
1  859.00         1.0  
2  859.00        -1.0  
3  859.00        -1.0  
4  161.08         1.0  


In [123]:
# Preview the first 50 aggregated test rows together with their predictions.
patterns_test.iloc[:50]

Unnamed: 0,Account No,Hour,DayOfWeek,DayOfMonth,MonthOfYear,Datetime,Amount,Prediction
0,101531259.0,0,2,1,2,2023-02-01 00:00:00,859.0,-1.0
1,101531259.0,0,4,1,9,2023-09-01 00:00:00,859.0,1.0
2,101531259.0,0,4,1,12,2023-12-01 00:00:00,859.0,-1.0
3,101531259.0,0,5,1,4,2023-04-01 00:00:00,859.0,-1.0
4,101531259.0,8,0,19,6,2023-06-19 08:37:00,161.08,1.0
5,101531259.0,8,2,11,10,2023-10-11 08:21:00,65.2,1.0
6,101531259.0,8,5,12,8,2023-08-12 08:29:00,95.92,1.0
7,101531259.0,9,3,30,11,2023-11-30 09:21:00,7.49,1.0
8,101531259.0,9,3,31,8,2023-08-31 09:41:00,129.98,1.0
9,101531259.0,9,5,30,9,2023-09-30 09:26:00,30.99,1.0


In [124]:
# Show up to 60 test rows (with predictions) for one specific account.
patterns_test.loc[patterns_test['Account No'].eq(108563213.0)].head(60)

Unnamed: 0,Account No,Hour,DayOfWeek,DayOfMonth,MonthOfYear,Datetime,Amount,Prediction
172,108563213.0,0,6,1,1,2023-01-01 00:00:00,7.99,-1.0
173,108563213.0,2,0,28,8,2023-08-28 02:42:00,47.83,1.0
174,108563213.0,3,0,4,9,2023-09-04 03:52:00,48.75,1.0
175,108563213.0,3,2,18,10,2023-10-18 03:54:00,44.9,1.0
176,108563213.0,3,3,16,11,2023-11-16 03:04:00,30.34,1.0
177,108563213.0,4,1,5,12,2023-12-05 04:29:00,62.9,1.0
178,108563213.0,4,1,25,4,2023-04-25 04:57:00,1.04,1.0
179,108563213.0,4,2,5,7,2023-07-05 04:46:00,9.89,1.0
180,108563213.0,4,3,3,8,2023-08-03 04:45:00,78.79,1.0
181,108563213.0,4,4,2,6,2023-06-02 04:15:00,36.51,1.0


In [105]:
import matplotlib.pyplot as plt

account = 678330503.0

# Fix: the original scored whatever `X_test_scaled` was left over from the last
# iteration of the training loop, i.e. another account's rows, with this
# account's model. Rebuild this account's own scaled test matrix instead.
feature_cols = ['Hour', 'DayOfWeek', 'DayOfMonth', 'MonthOfYear', 'Amount']
account_train = patterns_train.loc[patterns_train['Account No'] == account, feature_cols]
account_test = patterns_test.loc[patterns_test['Account No'] == account, feature_cols]

# StandardScaler fitting is deterministic, so refitting on the account's
# training rows reproduces the scaling its model was trained with.
X_test_scaled = StandardScaler().fit(account_train).transform(account_test)

# Predict on the test dataset
predictions = account_num_models[account].predict(X_test_scaled)

print(X_test_scaled)
print(predictions)

# Analyze predictions
#n_outliers = (predictions == -1).sum()
#n_inliers = (predictions == 1).sum()

#print(f"Number of outliers detected: {n_outliers}")
#print(f"Number of inliers detected: {n_inliers}")

# Plot results (if feasible)
#plt.figure(figsize=(10, 6))
#plt.scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], c=predictions, cmap='coolwarm', edgecolor='k', s=50)
#plt.title('Isolation Forest Outlier Detection')
#plt.xlabel('Feature 1')
#plt.ylabel('Feature 2')
#plt.show()

[[-2.61259834 -1.05018253  0.05450421 -0.09401777 -0.97974706]
 [-2.10484955  0.46318094  0.49861256  1.38340428 -0.88579437]
 [-1.59710075 -1.05018253 -0.50063123  0.49695105 -0.08033482]
 [-1.59710075 -1.05018253  0.05450421 -0.09401777 -0.25398895]
 [-1.08935196 -1.05018253 -0.61165832 -1.27595541  0.27383513]
 [-0.58160316 -1.05018253 -1.16679376  1.08791987 -0.99109527]
 [-0.58160316 -1.05018253  0.38758547  1.08791987  0.11390444]
 [-0.07385437 -0.54572804  1.053748    1.38340428  0.75151592]
 [ 0.43389442 -0.04127355 -0.38960415  1.38340428 -1.09560444]
 [ 0.43389442 -0.04127355  0.05450421 -0.980471    0.80403442]
 [ 0.43389442 -0.04127355  0.38758547  1.38340428  0.69926134]
 [ 0.43389442  0.96763543 -0.50063123 -0.980471   -1.17477805]
 [ 0.43389442  0.96763543  1.71991053  0.20146664 -0.34371904]
 [ 0.43389442  1.47208992  0.38758547 -0.980471   -0.35770638]
 [ 0.43389442  1.47208992  1.053748    0.20146664  0.17170117]
 [ 0.43389442  1.47208992  1.38682926 -0.09401777  0.91