In [86]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

In [21]:
biz_df = pd.read_csv("C:/Users/Diya/OneDrive/Desktop/Mini Project/Datasets/fake_transactional_data_24.csv")

In [22]:
biz_df.head(6)

Unnamed: 0,from_totally_fake_account,monopoly_money_amount,to_randomly_generated_account,not_happened_yet_date
0,10371.0,4.0,CINEMA,01/01/2025
1,88339.0,2.4,40544,01/01/2025
2,18555.0,2.4,85149,01/01/2025
3,18555.0,4.1,HIPSTER_COFFEE_SHOP,01/01/2025
4,80792.0,1.95,18555,01/01/2025
5,18555.0,4.45,TOTALLY_A_REAL_COFFEE_SHOP,01/01/2025


In [23]:
biz_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10148280 entries, 0 to 10148279
Data columns (total 4 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   from_totally_fake_account      float64
 1   monopoly_money_amount          float64
 2   to_randomly_generated_account  object 
 3   not_happened_yet_date          object 
dtypes: float64(2), object(2)
memory usage: 309.7+ MB


In [24]:
# Filter to include only business transactions
is_business_transaction = biz_df['to_randomly_generated_account'].apply(lambda x: not x.isnumeric())
business_transactions = biz_df[is_business_transaction]

# Create a new DataFrame
business_df = business_transactions[['to_randomly_generated_account', 'monopoly_money_amount', 'not_happened_yet_date']]
business_df.columns = ['Business_Name', 'Amount_Received', 'Date']

In [25]:
business_df.head()

Unnamed: 0,Business_Name,Amount_Received,Date
0,CINEMA,4.0,01/01/2025
3,HIPSTER_COFFEE_SHOP,4.1,01/01/2025
5,TOTALLY_A_REAL_COFFEE_SHOP,4.45,01/01/2025
6,COFFEE_SHOP,1.45,01/01/2025
7,CAFE,5.0,01/01/2025


In [26]:
#Aggregate transactions
business_df = business_df.groupby(['Business_Name', 'Date']).sum().reset_index()

In [27]:
business_df.head()

Unnamed: 0,Business_Name,Date,Amount_Received
0,ACCESSORY_SHOP,01/01/2025,140.0
1,ACCESSORY_SHOP,01/02/2025,89.0
2,ACCESSORY_SHOP,01/03/2025,212.0
3,ACCESSORY_SHOP,01/04/2025,273.0
4,ACCESSORY_SHOP,01/05/2025,143.0


##### ISOLATION FOREST

In [108]:
# Apply Isolation Forest to detect outliers in 'Amount_Received'
iso_forest = IsolationForest(n_estimators=100, contamination= 'auto', random_state=42)
business_df['scores'] = iso_forest.fit_predict(business_df[['Amount_Received']])

# Determine if a transaction is an outlier (True for outliers, False for normal)
business_df['is_outlier'] = business_df['scores'] == -1

# Filter for normal transactions
normal_transactions = business_df[business_df['is_outlier'] == False]

In [109]:
print(business_df)

             Business_Name  Amount_Received  scores  is_outlier
Date                                                           
2025-01-01  ACCESSORY_SHOP           140.00       1       False
2025-01-02  ACCESSORY_SHOP            89.00      -1        True
2025-01-03  ACCESSORY_SHOP           212.00       1       False
2025-01-04  ACCESSORY_SHOP           273.00       1       False
2025-01-05  ACCESSORY_SHOP           143.00       1       False
...                    ...              ...     ...         ...
2025-05-30     WINE_CELLAR             9.00      -1        True
2025-07-30     WINE_CELLAR           214.93       1       False
2025-08-30     WINE_CELLAR            38.98      -1        True
2025-10-30     WINE_CELLAR           166.97       1       False
2025-12-30     WINE_CELLAR            90.98      -1        True

[27214 rows x 4 columns]


##### VIEW OUTLIERS FOR SPECIFIC BUSINESS

In [110]:
# Specify the business name interested in
specific_business_name = "WINE_CELLAR"

# Filter the DataFrame to select rows where the Business_Name matches the specified business
specific_business_transactions = business_df[business_df['Business_Name'] == specific_business_name]

# Display the filtered DataFrame
print(specific_business_transactions)

           Business_Name  Amount_Received  scores  is_outlier
Date                                                         
2025-01-01   WINE_CELLAR           223.95       1       False
2025-01-02   WINE_CELLAR           160.93       1       False
2025-01-03   WINE_CELLAR            86.97      -1        True
2025-01-04   WINE_CELLAR           291.92       1       False
2025-01-05   WINE_CELLAR           230.94       1       False
...                  ...              ...     ...         ...
2025-05-30   WINE_CELLAR             9.00      -1        True
2025-07-30   WINE_CELLAR           214.93       1       False
2025-08-30   WINE_CELLAR            38.98      -1        True
2025-10-30   WINE_CELLAR           166.97       1       False
2025-12-30   WINE_CELLAR            90.98      -1        True

[352 rows x 4 columns]


##### CREATING LOWER AND UPPER LIMITS USING MEAN OF TRANSACTIONS

In [35]:
average_transactions = business_df.groupby('Business_Name')['Amount_Received'].mean().reset_index(name='Average_Amount')
lower_limit = average_transactions['Average_Amount'].min()
upper_limit = average_transactions['Average_Amount'].max()

In [37]:
limits_df = limits[['Business_Name', 'lower_limit', 'upper_limit']].drop_duplicates()
limits_df

Unnamed: 0,Business_Name,lower_limit,upper_limit
0,ACCESSORY_SHOP,97.00,292.00
1,A_CAFE,980.60,4115.10
2,A_LOCAL_COFFEE_SHOP,958.80,4118.00
3,A_SUPERMARKET,12978.91,18147.24
4,BAR,2775.00,51384.00
...,...,...,...
74,WE_HAVE_BEAN_WEIGHTING,141.82,551.83
75,WHISKEY_BAR,369.00,8733.50
76,WHISKEY_SHOP,111.98,512.88
77,WINE_BAR,342.00,8501.50


##### CATEGORIZE BASED ON AVERAGE OF TRANSACTIONS INTO STANDARD, UNDER-THRESHOD AND OVER-THRESHOLD

In [43]:
def categorize_business(row, lower_limit, upper_limit):
    if row['Average_Amount'] < lower_limit:
        return 'Under-Threshold'
    elif lower_limit <= row['Average_Amount'] <= upper_limit:
        return 'Standard'
    else:
        return 'Over-Threshold'

# Apply the categorization function to each row, passing the overall lower and upper limits
average_transactions['Category'] = average_transactions.apply(categorize_business, axis=1, args=(lower_limit, upper_limit))

# Add the overall lower and upper limits to the DataFrame
average_transactions['Lower_Limit'] = lower_limit
average_transactions['Upper_Limit'] = upper_limit

# Display the final DataFrame
print(average_transactions)

             Business_Name  Average_Amount  Category  Lower_Limit  Upper_Limit
0           ACCESSORY_SHOP      205.664835  Standard   195.901099     29550.13
1                   A_CAFE     3144.874313  Standard   195.901099     29550.13
2      A_LOCAL_COFFEE_SHOP     3137.176648  Standard   195.901099     29550.13
3            A_SUPERMARKET    15529.611484  Standard   195.901099     29550.13
4                      BAR    23236.487637  Standard   195.901099     29550.13
..                     ...             ...       ...          ...          ...
74  WE_HAVE_BEAN_WEIGHTING      674.428097  Standard   195.901099     29550.13
75             WHISKEY_BAR     3746.524725  Standard   195.901099     29550.13
76            WHISKEY_SHOP      666.492699  Standard   195.901099     29550.13
77                WINE_BAR     3717.559066  Standard   195.901099     29550.13
78             WINE_CELLAR      660.625710  Standard   195.901099     29550.13

[79 rows x 5 columns]


In [113]:
#print(business_df.columns)
#business_df.head()

In [115]:
# Filter the DataFrame for rows where the Business_Name is "ACCESSORY_SHOP"
accessory_shop_df = business_df[business_df['Business_Name'] == 'ACCESSORY_SHOP']
accessory_shop_df

Unnamed: 0_level_0,Business_Name,Amount_Received,scores,is_outlier
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-01-01,ACCESSORY_SHOP,140.0,1,False
2025-01-02,ACCESSORY_SHOP,89.0,-1,True
2025-01-03,ACCESSORY_SHOP,212.0,1,False
2025-01-04,ACCESSORY_SHOP,273.0,1,False
2025-01-05,ACCESSORY_SHOP,143.0,1,False
...,...,...,...,...
2025-05-31,ACCESSORY_SHOP,219.0,1,False
2025-07-31,ACCESSORY_SHOP,186.0,1,False
2025-08-31,ACCESSORY_SHOP,260.0,1,False
2025-10-31,ACCESSORY_SHOP,252.0,1,False


##### DAILY, WEEKLY AND MONTHLY COUNTS OF TRANSACTIONS FOR EACH BUSINESS

In [61]:
# Daily transaction counts for each business
daily_counts = business_df.groupby('Business_Name').resample('D').size()

# Weekly transaction counts for each business
weekly_counts = business_df.groupby('Business_Name').resample('W').size()

# Monthly transaction counts for each business
monthly_counts = business_df.groupby('Business_Name').resample('M').size()

In [81]:
# Convert Series to DataFrame
daily_df = daily_counts.reset_index(name='Daily_Counts')
weekly_df = weekly_counts.reset_index(name='Weekly_Counts')
monthly_df = monthly_counts.reset_index(name='Monthly_Counts')

# Merge into a single DataFrame
counts_df = pd.merge(daily_df, weekly_df, on='Business_Name')
counts_df = pd.merge(counts_df, monthly_df, on='Business_Name')


In [82]:
print(counts_df)

           Business_Name     Date_x  Daily_Counts     Date_y  Weekly_Counts  \
0         ACCESSORY_SHOP 2025-01-01             1 2025-01-05              5   
1         ACCESSORY_SHOP 2025-01-01             1 2025-01-05              5   
2         ACCESSORY_SHOP 2025-01-01             1 2025-01-05              5   
3         ACCESSORY_SHOP 2025-01-01             1 2025-01-05              5   
4         ACCESSORY_SHOP 2025-01-01             1 2025-01-05              5   
...                  ...        ...           ...        ...            ...   
17462164     WINE_CELLAR 2025-12-30             1 2026-01-04              2   
17462165     WINE_CELLAR 2025-12-30             1 2026-01-04              2   
17462166     WINE_CELLAR 2025-12-30             1 2026-01-04              2   
17462167     WINE_CELLAR 2025-12-30             1 2026-01-04              2   
17462168     WINE_CELLAR 2025-12-30             1 2026-01-04              2   

               Date  Monthly_Counts  
0        2025