In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

In [2]:
#load data
data = pd.read_csv('fake_transactional_data_24.csv')

In [3]:
#build random seed to generate balance
np.random.seed(17)

data['account_balance'] = np.random.uniform(1000, 20000, size=len(data))

In [5]:
#proportion of transaction
data['transaction_percentage'] = data['monopoly_money_amount'] / data['account_balance'] * 100

In [6]:
#fraud rule

data['alert'] = np.select(
    [
        data['transaction_percentage'] > 50,
        data['transaction_percentage'] > 30,
        data['transaction_percentage'] > 10
    ],
    [
        'Freeze Account',  
        'Call to Inquire',  
        'Send Warning'      
    ],
    default='No Action'  
)

In [8]:
#Isolation forest set up
forest = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=17)

# choosing feature
features = data[['monopoly_money_amount', 'transaction_percentage']]

# train model
forest.fit(features)


In [9]:

data['anomaly_score'] = forest.decision_function(features)


data['is_anomaly'] = forest.predict(features)
data['is_anomaly'] = data['is_anomaly'].map({1: 'No', -1: 'Yes'})


In [12]:
print(data.head(10))

   from_totally_fake_account  monopoly_money_amount  \
0                      10371                   4.00   
1                      88339                   2.40   
2                      18555                   2.40   
3                      18555                   4.10   
4                      80792                   1.95   
5                      18555                   4.45   
6                      18555                   1.45   
7                      18555                   5.00   
8                      41378                   2.55   
9                      41378                   1.80   

  to_randomly_generated_account not_happened_yet_date  account_balance  \
0                        CINEMA            01/01/2025      6598.635051   
1                         40544            01/01/2025     11081.148357   
2                         85149            01/01/2025      4638.894952   
3           HIPSTER_COFFEE_SHOP            01/01/2025      2290.106806   
4                       

In [15]:
anomaly_counts = data['is_anomaly'].value_counts()
anomaly_counts

No     938451
Yes    110124
Name: is_anomaly, dtype: int64

In [16]:
alert_counts = data['alert'].value_counts()
alert_counts

No Action          1045261
Call to Inquire        419
Freeze Account         350
Name: alert, dtype: int64

In [18]:
#throw away the anomaly data
data_clean = data[data['is_anomaly'] == 'No']

# delete anomaly column
data_clean = data_clean.drop(columns=['anomaly_score', 'is_anomaly'])

alert_counts_clean = data_clean['alert'].value_counts()
alert_counts_clean


No Action    938451
Name: alert, dtype: int64