<a href="https://colab.research.google.com/github/PravalikaBojja/CyberWaves/blob/main/FinancialTransactionsAnamolyDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [2]:
# Colab-only step: files.upload() prompts for a local file and copies it into
# the runtime. The recorded output below shows it being saved as
# "Financial Transactions.csv.txt". `uploaded` keeps the call's return value.
from google.colab import files
uploaded = files.upload()

Saving Financial Transactions.csv.txt to Financial Transactions.csv.txt


In [4]:
# Load the dataset
# The path presumably points at the file uploaded in the previous cell
# (NOTE(review): /content is assumed to be the Colab working directory — confirm).
data = pd.read_csv("/content/Financial Transactions.csv.txt")

In [5]:
# Data Preprocessing
def preprocess_data(df):
    """Return a cleaned copy of the transactions frame.

    The input frame is left untouched: every step builds a new DataFrame, so
    re-running the calling cell or inspecting the raw data afterwards is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column and an 'amount' column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'date' is parsed to datetime, rows with a missing
        or unparseable 'date' or a missing 'amount' are removed, and 'amount'
        is cast to float. The original row index labels are kept.

    Raises
    ------
    ValueError
        Propagated from ``astype(float)`` when a remaining 'amount' value
        cannot be converted to a float.
    """
    # Parse dates on a new frame; unparseable values become NaT so that the
    # dropna below removes them together with genuinely missing values.
    parsed = df.assign(date=pd.to_datetime(df['date'], errors='coerce'))

    # Remove rows with NaN in 'date' or 'amount'
    complete = parsed.dropna(subset=['date', 'amount'])

    # Convert amount to float. assign() returns a new frame, which avoids
    # writing into the dropna() result (the source of SettingWithCopyWarning).
    return complete.assign(amount=complete['amount'].astype(float))

In [6]:
# Rebind `data` to the cleaned frame. After this cell the raw, unparsed frame
# is no longer reachable under this name.
data = preprocess_data(data)

In [7]:
# Preview the first rows of the cleaned data as plain text.
print(data.head())

  transaction_id       date       category  amount
0         TRX001 2024-06-01           Food    25.0
1         TRX002 2024-06-01      Utilities   150.0
2         TRX003 2024-06-01  Entertainment   200.0
3         TRX004 2024-06-02           Food  3000.0
4         TRX005 2024-06-02      Transport    45.0


In [8]:
# Statistical Analysis
def calculate_statistics(df):
    """Summarise transaction amounts per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'category' and 'amount' columns.

    Returns
    -------
    pandas.DataFrame
        One row per category with the flat columns
        ['category', 'mean', 'median', 'std', 'count'].
    """
    summary_functions = ['mean', 'median', 'std', 'count']

    # Aggregating the single 'amount' series yields flat column names
    # directly, so no manual column renaming is needed afterwards.
    per_category = df.groupby('category')['amount'].agg(summary_functions)
    return per_category.reset_index()


In [9]:
# Per-category summary statistics.
# NOTE: this rebinds the name `stats`, which shadows the `from scipy import stats`
# import in the first cell for every cell that runs after this one.
stats = calculate_statistics(data)

In [10]:
# Calculate IQR for each category
def calculate_iqr(df):
    """Compute per-category quartiles and the interquartile range of 'amount'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'category' and 'amount' columns.

    Returns
    -------
    pandas.DataFrame
        One row per category with the columns:
        'category', 'quantile' (the 0.5 quantile, i.e. the median — kept under
        this name for existing consumers), 'q1' (0.25 quantile),
        'q3' (0.75 quantile) and 'IQR' (q3 - q1).
    """
    amounts_by_category = df.groupby('category')['amount']

    # 'quantile' keeps its previous meaning: quantile() defaults to q=0.5.
    iqr_stats = amounts_by_category.quantile(0.5).rename('quantile').to_frame()
    iqr_stats['q1'] = amounts_by_category.quantile(0.25)
    iqr_stats['q3'] = amounts_by_category.quantile(0.75)

    # The IQR is the spread between the quartiles. Subtracting a column from
    # itself, as was done previously, always produced 0.
    iqr_stats['IQR'] = iqr_stats['q3'] - iqr_stats['q1']
    return iqr_stats.reset_index()

In [11]:
# Per-category quantile/IQR table, merged with `stats` in the thresholds step.
iqr_stats = calculate_iqr(data)

In [12]:
# Define thresholds for anomaly detection
def define_thresholds(stats, iqr_stats, sigma=3.0, iqr_multiplier=1.5):
    """Build per-category anomaly limits from summary and IQR statistics.

    Parameters
    ----------
    stats : pandas.DataFrame
        Needs 'category', 'median' and 'std' columns.
    iqr_stats : pandas.DataFrame
        Needs 'category' and 'IQR' columns, plus either both 'q1' and 'q3'
        or a single 'quantile' column.
    sigma : float, default 3.0
        Half-width of the standard-deviation band, in units of 'std'.
    iqr_multiplier : float, default 1.5
        How many IQRs the fences extend beyond their anchor.

    Returns
    -------
    pandas.DataFrame
        The inner merge of both inputs on 'category', with the added columns
        'upper_limit', 'lower_limit', 'iqr_upper' and 'iqr_lower'.
    """
    thresholds = pd.merge(stats, iqr_stats, on='category')

    # Standard-deviation band, centred on the median.
    # NOTE(review): detect_anomalies labels hits on this band as
    # '3 std deviations from the mean' — confirm whether the median centre
    # or the label is the intended one.
    band = sigma * thresholds['std']
    thresholds['upper_limit'] = thresholds['median'] + band
    thresholds['lower_limit'] = thresholds['median'] - band

    # IQR fences are anchored on the quartiles (Q1 - k*IQR, Q3 + k*IQR) when
    # they are available. Tables that only carry a single 'quantile' column
    # keep the previous behaviour of anchoring both fences on it.
    if {'q1', 'q3'}.issubset(thresholds.columns):
        lower_anchor, upper_anchor = thresholds['q1'], thresholds['q3']
    else:
        lower_anchor = upper_anchor = thresholds['quantile']

    fence = iqr_multiplier * thresholds['IQR']
    thresholds['iqr_upper'] = upper_anchor + fence
    thresholds['iqr_lower'] = lower_anchor - fence
    return thresholds

In [13]:
# Combine both statistics tables into one per-category limits table.
# Here `stats` is the DataFrame from calculate_statistics, not scipy.stats.
thresholds = define_thresholds(stats, iqr_stats)

In [14]:
# Anomaly Detection
def detect_anomalies(df, thresholds):
    """Flag transactions whose amount falls outside their category's limits.

    Each row is checked against the standard-deviation band first
    ('upper_limit' / 'lower_limit'); only rows inside that band are then
    checked against the IQR fences ('iqr_upper' / 'iqr_lower'), so a
    transaction is reported at most once.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'transaction_id', 'date', 'category' and 'amount' columns.
    thresholds : pandas.DataFrame
        Needs 'category', 'upper_limit', 'lower_limit', 'iqr_upper' and
        'iqr_lower' columns. If a category appears more than once, its first
        row is used.

    Returns
    -------
    list of dict
        One record per anomalous transaction with the keys 'transaction_id',
        'date', 'category', 'amount' and 'reason_for_anomaly', in input order.
        Rows whose category has no entry in ``thresholds`` (including a
        missing category) cannot be evaluated and are skipped rather than
        raising an IndexError.
    """
    limit_columns = ['upper_limit', 'lower_limit', 'iqr_upper', 'iqr_lower']

    # Build the category -> limits lookup once instead of filtering the
    # thresholds frame again for every transaction.
    limits_by_category = (
        thresholds.dropna(subset=['category'])
        .drop_duplicates(subset='category')
        .set_index('category')[limit_columns]
        .to_dict(orient='index')
    )

    anomalies = []
    for _, row in df.iterrows():
        limits = limits_by_category.get(row['category'])
        if limits is None:
            # No thresholds were computed for this category.
            continue

        amount = row['amount']
        if amount > limits['upper_limit'] or amount < limits['lower_limit']:
            reason = '3 std deviations from the mean'
        elif amount > limits['iqr_upper'] or amount < limits['iqr_lower']:
            reason = 'Outside IQR range'
        else:
            continue

        anomalies.append({
            'transaction_id': row['transaction_id'],
            'date': row['date'],
            'category': row['category'],
            'amount': row['amount'],
            'reason_for_anomaly': reason
        })
    return anomalies

In [15]:
# List of anomaly records (one dict per flagged transaction).
anomalies = detect_anomalies(data, thresholds)

In [16]:
# Reporting
def generate_report(anomalies, output_path='anomalies_report.csv'):
    """Write the anomaly records to a CSV file and print a confirmation.

    Parameters
    ----------
    anomalies : list of dict
        Records as produced by detect_anomalies.
    output_path : str, default 'anomalies_report.csv'
        Destination of the CSV file.

    Returns
    -------
    None
    """
    report_columns = [
        'transaction_id', 'date', 'category', 'amount', 'reason_for_anomaly'
    ]

    report_df = pd.DataFrame(anomalies)
    if report_df.columns.empty:
        # With no anomalies there are no keys to infer columns from. Emit the
        # expected header anyway so the report remains a readable CSV.
        report_df = pd.DataFrame(columns=report_columns)

    report_df.to_csv(output_path, index=False)
    print(f"Anomalies report generated: {output_path}")

# Write the detected anomalies to anomalies_report.csv in the working directory.
generate_report(anomalies)


Anomalies report generated: anomalies_report.csv
