## Anomaly detection based on daily frequency of transactions 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split

### Loading Dataset

In [2]:
df = pd.read_csv('../SavedData/dataset2_cleaned.csv')
df['Datetime'] = pd.to_datetime(df['Datetime'])
df

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name,Datetime
0,01/01/2023,00:00:00,678330503.0,2971.000000,1584.00,,Westport Care Home,2023-01-01 00:00:00
1,01/01/2023,00:00:00,472213568.0,3792.000000,1950.00,,Barbiee Boutique,2023-01-01 00:00:00
2,01/01/2023,00:00:00,472213568.0,3012.000000,-780.00,283027736.0,,2023-01-01 00:00:00
3,01/01/2023,00:00:00,283027736.0,1787.000000,780.00,472213568.0,,2023-01-01 00:00:00
4,01/01/2023,00:00:00,624500124.0,3226.000000,1825.00,,Fat Face,2023-01-01 00:00:00
...,...,...,...,...,...,...,...,...
229189,06/12/2023,20:54:00,581655972.0,45935.206861,-41.06,,Tesco,2023-12-06 20:54:00
229190,06/12/2023,20:55:00,786141370.0,-244.837500,-62.35,,Sainsbury Local,2023-12-06 20:55:00
229191,06/12/2023,21:05:00,824916823.0,9709.172159,-32.94,,Deliveroo,2023-12-06 21:05:00
229192,06/12/2023,21:13:00,366550080.0,26834.165794,-19.25,,Amazon,2023-12-06 21:13:00


### Splitting dataframe into training and testing, and expenditures and payments

In [3]:
training_transactions, testing_transactions = train_test_split(df, test_size=0.2, random_state=2)

training_payments_df = training_transactions[training_transactions['Amount'] > 0]
training_expenditures_df = training_transactions[training_transactions['Amount'] < 0]
training_expenditures_df["Amount"] = training_expenditures_df["Amount"].abs()

testing_payments_df = testing_transactions[testing_transactions['Amount'] > 0]
testing_expenditures_df = testing_transactions[testing_transactions['Amount'] < 0]
testing_expenditures_df["Amount"] = testing_expenditures_df["Amount"].abs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_expenditures_df["Amount"] = training_expenditures_df["Amount"].abs()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_expenditures_df["Amount"] = testing_expenditures_df["Amount"].abs()


In [39]:
testing_expenditures_df

Unnamed: 0,Date,Timestamp,Account No,Balance,Amount,Third Party Account No,Third Party Name,Datetime
53721,24/03/2023,09:08:00,999752672.0,1626.663493,5.05,,Coffee #1,2023-03-24 09:08:00
119889,03/07/2023,19:37:00,668329719.0,6972.451570,72.30,,Tesco,2023-07-03 19:37:00
94754,01/06/2023,00:00:00,948069469.0,-456.592900,1239.00,,Halifax,2023-06-01 00:00:00
14102,17/01/2023,11:58:00,550644516.0,1030.853400,107.96,,Topshop,2023-01-17 11:58:00
46208,04/03/2023,19:16:00,322720763.0,3952.870800,65.84,,Tesco,2023-03-04 19:16:00
...,...,...,...,...,...,...,...,...
196467,29/10/2023,11:12:00,815993333.0,5834.530559,35.14,,Sports Direct,2023-10-29 11:12:00
226353,02/12/2023,09:24:00,454129843.0,1147.926395,54.99,,Sports Direct,2023-12-02 09:24:00
50402,14/03/2023,19:57:00,531186970.0,1817.435292,65.16,,Tesco,2023-03-14 19:57:00
58584,31/03/2023,23:59:00,240643705.0,591.087731,12.99,,SquareOnix,2023-03-31 23:59:00


### Creating a day column 

In [5]:
training_expenditures_df['Day'] = training_expenditures_df['Datetime'].dt.date
training_payments_df['Day'] = training_payments_df['Datetime'].dt.date

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_expenditures_df['Day'] = training_expenditures_df['Datetime'].dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_payments_df['Day'] = training_payments_df['Datetime'].dt.date


### Aggregating transactions by day and account

In [24]:
daily_expenditures = training_expenditures_df.groupby(['Account No', 'Day']).size().reset_index(name='Daily expenditures')
daily_payments = training_payments_df.groupby(['Account No', 'Day']).size().reset_index(name='Daily payments')

### Calculating thresholds for expenditures and payments

In [35]:
def calculate_thresholds(group, value_col):
    quantiles = group[value_col].quantile([0.25, 0.5, 0.75])
    IQR = quantiles[0.75] - quantiles[0.25]
    
    # Define the multipliers and percent increments for each threshold
    multipliers = [1.5, 2, 3]
    increments = [1, 2, 3]
    
    thresholds = []
    for multiplier, increment in zip(multipliers, increments):
        if IQR == 0:
            # When there's no variability, use the median and apply percent increment
            threshold = quantiles[0.5] * (1 + increment)
        else:
            # Standard threshold calculation using IQR
            threshold = quantiles[0.75] + multiplier * IQR
        thresholds.append(threshold)
    
    return tuple(thresholds)



In [37]:
def process_data(df, group_by_col, value_col):
    # Group by user and calculate thresholds
    thresholds = df.groupby(group_by_col).apply(lambda x: calculate_thresholds(x, value_col)).reset_index(name='Thresholds')

    # Split the tuples into separate columns
    thresholds[['Threshold 1', 'Threshold 2', 'Threshold 3']] = pd.DataFrame(thresholds['Thresholds'].tolist(), index=thresholds.index)
    
    # Drop the original 'Thresholds' column
    thresholds.drop(columns='Thresholds', inplace=True)

    return thresholds

# Example usage:
# Apply the function to daily_expenditures
daily_expenditures_thresholds = process_data(daily_expenditures, 'Account No', 'Daily expenditures')

# Apply the function to daily_payments
daily_payments_thresholds = process_data(daily_payments, 'Account No', 'Daily payments')

# Print results
print(daily_expenditures_thresholds.describe())
print(daily_payments_thresholds.describe())



         Account No  Threshold 1  Threshold 2  Threshold 3
count  9.760000e+02   976.000000   976.000000   976.000000
mean   5.508141e+08     3.600026     4.314293     5.500000
std    2.574339e+08     1.788622     1.994455     2.609254
min    1.015313e+08     1.625000     1.750000     2.000000
25%    3.313499e+08     2.000000     3.000000     4.000000
50%    5.542532e+08     3.500000     4.000000     5.000000
75%    7.668451e+08     3.500000     4.000000     5.000000
max    9.997527e+08    18.500000    22.000000    29.000000
         Account No  Threshold 1  Threshold 2  Threshold 3
count  8.740000e+02   874.000000   874.000000   874.000000
mean   5.465899e+08     2.008581     3.005149     4.004005
std    2.573171e+08     0.107850     0.114032     0.152719
min    1.015313e+08     2.000000     2.500000     2.750000
25%    3.230521e+08     2.000000     3.000000     4.000000
50%    5.497923e+08     2.000000     3.000000     4.000000
75%    7.628412e+08     2.000000     3.000000     4.0000

In [38]:
daily_expenditures_thresholds

Unnamed: 0,Account No,Threshold 1,Threshold 2,Threshold 3
0,101531259.0,3.5,4.0,5.0
1,104832000.0,2.0,3.0,4.0
2,105375973.0,8.5,10.0,13.0
3,106601471.0,2.0,3.0,4.0
4,108481285.0,3.5,4.0,5.0
...,...,...,...,...
971,995615876.0,3.5,4.0,5.0
972,996042490.0,3.5,4.0,5.0
973,998390769.0,3.5,4.0,5.0
974,998405607.0,8.5,10.0,13.0


### Risk flagging model based on frequency of transactions: Daily

In [41]:
def classify_daily_risk(testing_df, daily_thresholds_df):
    # Convert Datetime to date for accurate day-based grouping
    testing_df['Day'] = pd.to_datetime(testing_df['Datetime']).dt.date
    
    # Prepare the results list to collect data
    results = []
    
    # Iterate through each account's thresholds
    for index, row in daily_thresholds_df.iterrows():
        account_no = row['Account No']
        d1, d2, d3 = row['Threshold 1'], row['Threshold 2'], row['Threshold 3']
        
        # Filter transactions for the current account
        account_transactions = testing_df[testing_df['Account No'] == account_no]
        
        # Get unique transaction dates for the account
        unique_dates = account_transactions['Day'].unique()
        
        for transaction_date in unique_dates:
            # Filter transactions for the account on the specific date
            transactions_on_date = account_transactions[account_transactions['Day'] == transaction_date]
            
            # Count the number of transactions on the date
            total_transactions = len(transactions_on_date)
            
            # Classify risk based on transaction count compared to thresholds
            if total_transactions < d1:
                risk = 'No Risk'
            elif total_transactions < d2:
                risk = 'Low Risk'
            elif total_transactions < d3:
                risk = 'Medium Risk'
            else:
                risk = 'High Risk'
            
            # Append the result to the list
            results.append({
                'Account No': account_no,
                'Transaction Date': transaction_date,
                'Number of Transactions': total_transactions,
                'Risk': risk
            })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    return results_df

# Example usage with expenditures
results_expenditures_df = classify_daily_risk(testing_expenditures_df, daily_expenditures_thresholds)

# Example usage with payments
results_payments_df = classify_daily_risk(testing_payments_df, daily_payments_thresholds)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_df['Day'] = pd.to_datetime(testing_df['Datetime']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing_df['Day'] = pd.to_datetime(testing_df['Datetime']).dt.date


In [55]:
high_risk= results_expenditures_df[results_expenditures_df['Risk']== 'Medium Risk']
high_risk

Unnamed: 0,Account No,Transaction Date,Number of Transactions,Risk
31,104832000.0,2023-04-30,3,Medium Risk
34,104832000.0,2023-08-31,3,Medium Risk
63,106601471.0,2023-07-31,3,Medium Risk
66,106601471.0,2023-06-30,3,Medium Risk
72,106601471.0,2023-02-14,3,Medium Risk
...,...,...,...,...
35091,987656636.0,2023-09-24,4,Medium Risk
35095,987656636.0,2023-07-24,4,Medium Risk
35207,992204045.0,2023-07-16,4,Medium Risk
35277,993039226.0,2023-02-28,3,Medium Risk


In [50]:
results_payments_df[results_payments_df['Risk']== 'Low Risk']

Unnamed: 0,Account No,Transaction Date,Number of Transactions,Risk
1526,762896656.0,2023-02-28,2,Low Risk
