In [1]:
import os
%pwd

'c:\\Users\\user\\Desktop\\moniepoint\\notebooks'

In [2]:
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\user\\Desktop\\moniepoint'

In [4]:
from pathlib import Path  
from dataclasses import dataclass 

@dataclass(frozen=True)
class DataIngestionConfig:
    root_dir: Path
    file_path: str   
    local_data_dir: Path  
    clean_path: str 
    
from src.anomaly_detection.constant import *
from src.anomaly_detection.utils.common import load_yaml, create_directories

C:\Users\user\Desktop\moniepoint


In [5]:
class ConfigurationManager:
    def __init__(self, config_filepath = CONFIG_FILE_PATH,
                schema_filepath = SCHEMA_FILE_PATH, 
                params_filepath = PARAMS_FILE_PATH):
        self.config = load_yaml(config_filepath)
        self.params = load_yaml(params_filepath)
        self.schema = load_yaml(schema_filepath)
        
        create_directories([self.config.artifacts_root])
        
    # def clean_data
        
    def get_data_ingestion_config(self) -> DataIngestionConfig:
        config = self.config.data_ingestion
        create_directories([config.root_dir])
        data_ingestion_config = DataIngestionConfig(
            root_dir = config.root_dir,
            file_path = config.file_path,
            local_data_dir = config.local_data_dir,
            clean_path = config.clean_data_path
        )
        
        return data_ingestion_config

In [6]:
ConfigurationManager().get_data_ingestion_config().local_data_dir

2025-08-14 20:23:41 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-14 20:23:41 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-14 20:23:41 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-14 20:23:41 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-14 20:23:41 - INFO - [common.py:88] - Created directory at: artifacts/data_ingestion


'artifacts/data_ingestion'

In [7]:
ConfigurationManager().get_data_ingestion_config().clean_path

2025-08-14 20:23:44 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-14 20:23:44 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-14 20:23:44 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-14 20:23:44 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-14 20:23:44 - INFO - [common.py:88] - Created directory at: artifacts/data_ingestion


'artifacts/data_ingestion/cleaned_anomaly_detection.csv'

In [8]:
ConfigurationManager().get_data_ingestion_config().file_path

2025-08-14 20:23:47 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-14 20:23:47 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-14 20:23:47 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-14 20:23:47 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-14 20:23:47 - INFO - [common.py:88] - Created directory at: artifacts/data_ingestion


'moniepoint/monie_point_data/synthetic_dirty_transaction_logs.csv'

In [9]:
import pandas as pd   
import re 
from datetime import datetime

In [10]:
import logging

In [11]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config  

    def load_file(self):
        file_path = Path(self.config.file_path)
        output_path = Path(self.config.local_data_dir) / "anomaly_detection.csv"
        
        os.makedirs(output_path.parent, exist_ok=True)
        print(file_path)
        
        try:
            df = pd.read_csv(f"c:\\Users\\user\\Desktop\\{file_path}")
            if df.empty:
                logging.warning(f"The file {file_path} is empty")
            else:
                logging.info(f"The file {file_path} loaded successfully with {len(df)} rows")
                print(df.head())

                # Save a local copy
                df.to_csv(output_path, index=False)
                logging.info(f"Data saved to {output_path}")

        except FileNotFoundError:
            logging.error(f"The file {file_path} was not found")
        except Exception as e:
            logging.error(f"An error occurred while loading the file: {e}")

In [12]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.load_file()
except Exception as e:
    raise e

2025-08-14 20:24:01 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-14 20:24:01 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-14 20:24:01 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-14 20:24:01 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-14 20:24:01 - INFO - [common.py:88] - Created directory at: artifacts/data_ingestion
2025-08-14 20:24:01 - INFO - [682757324.py:17] - The file moniepoint\monie_point_data\synthetic_dirty_transaction_logs.csv loaded successfully with 10000 rows
2025-08-14 20:24:01 - INFO - [682757324.py:22] - Data saved to artifacts\data_ingestion\anomaly_detection.csv


moniepoint\monie_point_data\synthetic_dirty_transaction_logs.csv
                                             raw_log
0  2025-07-05 19:18:10::user1069::withdrawal::299...
1                                                NaN
2                                      MALFORMED_LOG
3  usr:user1076|cashout|€4821.85|Glasgow|2025-07-...
4  2025-07-20 05:38:14 >> [user1034] did top-up -...


In [13]:
from typing import TypedDict, Optional

class AmountData(TypedDict):
    amount: Optional[float|None]
    currency: Optional[str|None]

In [49]:
from abc import ABC, abstractmethod
from datetime import datetime


class BaseTransactionParser(ABC):
    """Abstract base class for transaction log parsers."""

    @abstractmethod
    def clean_transaction_logs(self, csv_file_path=None, raw_data=None) -> pd.DataFrame:
        """Clean and parse transaction logs. raw_data is for testing with raw text."""
        pass

    @abstractmethod
    def parse_transaction_line(self, line: str, row_id: int) -> Optional[dict]:
        """Parse a single transaction log line into structured data."""
        pass

    @staticmethod
    def parse_datetime(date_str: str):
        """Parse datetime string in supported formats."""
        if not date_str or date_str.lower() == 'none':
            return None

        try:
            if re.match(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', date_str):
                return datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
            if re.match(r'^\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}$', date_str):
                return datetime.strptime(date_str, '%d/%m/%Y %H:%M:%S')
        except ValueError:
            print(f"Could not parse datetime: {date_str}")

        return None

    @staticmethod
    def extract_amount(amount_str: str):
        """Extract amount and currency from an amount string."""
        if not amount_str:
            return {'amount': None, 'currency': 'GBP'}  

        # Keep only valid number and currency characters
        amount_clean = re.sub(r'[^\d.,€£$]', '', str(amount_str))

        number_match = re.search(r'([\d,]+\.?\d*)', amount_clean)
        if number_match:
            amount = float(number_match.group(1).replace(',', ''))
            currency = 'GBP'  # default
            if '€' in amount_str or 'â‚¬' in amount_str:
                currency = 'EUR'
            elif '£' in amount_str or 'Â£' in amount_str:
                currency = 'GBP'
            elif '$' in amount_str:
                currency = 'USD'
            return {'amount': amount, 'currency': currency}

        return {'amount': None, 'currency': 'GBP'}

    @staticmethod
    def get_currency_code(symbol: Optional[str]):
        """Convert currency symbol or ISO code to standard currency code."""
        mapping = {
            '€': 'EUR', 'â‚¬': 'EUR', 'EUR': 'EUR',
            '£': 'GBP', 'Â£': 'GBP', 'GBP': 'GBP',
            '$': 'USD', 'USD': 'USD'
        }
        # Default to GBP if not found or missing
        if not symbol:
            return 'GBP'
        return mapping.get(symbol.strip(), 'GBP')

    @staticmethod
    def clean_field(field: Optional[str]):
        """Clean and standardize field values, including removing known bad unicode."""
        if not field or field.lower() in ['none', 'null', '']:
            return None

        replacements = {
            "â‚¬": "€",
            "Â£": "£",
            "Â": "",
            "\u201a": "",  # weird comma-like character
        }
        for bad, good in replacements.items():
            field = field.replace(bad, good)

        # Collapse multiple spaces and trim
        return re.sub(r"\s+", " ", field).strip()


In [50]:
class TransactionCleaner(BaseTransactionParser):
    """
    Concrete transaction parser with multiple log format patterns.
    
    This class handles 9 different transaction log formats commonly found in 
    financial systems, including various date formats, currency symbols, and 
    delimiters. 
    """
    def clean_transaction_logs(self, csv_file_path=None, raw_data=None) -> pd.DataFrame:
        """
        Clean and parse transaction logs from file or raw data.
        
        Args:
            csv_file_path: Path to file containing transaction logs
            raw_data: Raw string data for testing purposes
            
        Returns:
            pandas.DataFrame with structured transaction data
            
        Raises:
            ValueError: If neither csv_file_path nor raw_data is provided
        """
        if csv_file_path:
            raw_content = Path(csv_file_path).read_text(encoding='utf-8')
        elif raw_data:
            raw_content = raw_data
        else:
            raise ValueError("Either csv_file_path or raw_data must be provided")

        # Filter valid lines - exclude malformed logs and headers
        lines = raw_content.split('\n')
        valid_lines = [
            line.strip()
            for line in lines
            if line.strip()
            and line.strip() not in ['""', 'MALFORMED_LOG', 'raw_log']
            and not line.strip().startswith('raw_log')
        ]

        cleaned_records = []
        for idx, line in enumerate(valid_lines):
            record = self.parse_transaction_line(line, idx + 1)
            if record:
                cleaned_records.append(record)

        df = pd.DataFrame(cleaned_records)

        # Post-processing: ensure proper data types and handle missing values
        if not df.empty:
            df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
            df['amount'] = pd.to_numeric(df['amount'], errors='coerce')
            df['row_id'] = df['row_id'].astype(int)
            df['location'] = df['location'].fillna('Unknown')
            df['device'] = df['device'].fillna('Unknown')
            df['currency'] = df['currency'].fillna('USD')

        return df

    def parse_transaction_line(self, line: str, row_id: int) -> Optional[dict]:
        """
        Parse a single transaction log line into structured data.
        
        This method attempts to match the input line against 9 different patterns
        in order of specificity, returning structured data if a match is found.
        
        Args:
            line: Raw transaction log line
            row_id: Row identifier for tracking
            
        Returns:
            Dictionary with parsed transaction data or None if no pattern matches
        """
        if not line or line in ['""', 'MALFORMED_LOG']:
            return None

        # Initialize record structure
        record = {
            'row_id': row_id,
            'original_log': line,
            'datetime': None,
            'user_id': None,
            'transaction_type': None,
            'amount': None,
            'currency': None,
            'location': None,
            'device': None
        }

        try:
            # Pattern 1: Double colon format
            # Example: 2023-05-14 14:05:31::user123::top-up::500::ATM Location::Device
            pattern1 = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})::(\w+)::([\w-]+)::([\d,.]+)::([^:]+)::(.+)$'
            match = re.match(pattern1, line)
            if match:
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3),
                    'amount': self.extract_amount(match.group(4))['amount'],
                    'currency': self.extract_amount(match.group(4))['currency'],
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 2: Pipe separated format
            # Example: usr:user123|top-up|£500|Location|2023-05-14 14:05:31|Device
            pattern2 = r'^usr:(\w+)\|([\w-]+)\|([€£$]?[\d,.]+)\|([^|]+)\|(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\|(.+)$'
            match = re.match(pattern2, line)
            if match:
                amount_info = self.extract_amount(match.group(3))
                record.update({
                    'user_id': match.group(1),
                    'transaction_type': match.group(2),
                    'amount': amount_info['amount'],
                    'currency': amount_info['currency'],
                    'location': self.clean_field(match.group(4)),
                    'datetime': self.parse_datetime(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 3: Arrow format with brackets
            # Example: 2023-05-14 14:05:31 >> [user123] did top-up - amt=£500 - Location // dev:Device
            pattern3 = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) >> \[(\w+)\] did ([\w-]+) - amt=([€£$]?[\d,.]+) - ([^/]+) // dev:(.+)$'
            match = re.match(pattern3, line)
            if match:
                amount_info = self.extract_amount(match.group(4))
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3).replace('-', '_'),
                    'amount': amount_info['amount'],
                    'currency': amount_info['currency'],
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 4: Pipe format with user/txn/device labels
            # Example: 2023-05-14 14:05:31 | user: user123 | txn: top-up of £500 from Location | device: Device
            pattern4 = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \| user: (\w+) \| txn: ([\w-]+) of ([€£$]?[\d,.]+) from ([^|]+) \| device: (.+)$'
            match = re.match(pattern4, line)
            if match:
                amount_info = self.extract_amount(match.group(4))
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3),
                    'amount': amount_info['amount'],
                    'currency': amount_info['currency'],
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 5: Dash separated with user/action/ATM/device
            # Example: 2023-05-14 14:05:31 - user=user123 - action=top-up £500 - ATM: Location - device=Device
            pattern5 = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) - user=(\w+) - action=([\w-]+) ([€£$]?[\d,.]+) - ATM: ([^-]+) - device=(.+)$'
            match = re.match(pattern5, line)
            if match:
                amount_info = self.extract_amount(match.group(4))
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3),
                    'amount': amount_info['amount'],
                    'currency': amount_info['currency'],
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 6: Triple colon with asterisks (DD/MM/YYYY format)
            # Example: 14/05/2023 14:05:31 ::: user123 *** TOP-UP ::: amt:£500 @ Location <Device>
            pattern6 = r'^(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}) ::: (\w+) \*\*\* ([\w-]+) ::: amt:([€£$]?[\d,.]+) @ ([^<]+) <([^>]+)>$'
            match = re.match(pattern6, line)
            if match:
                amount_info = self.extract_amount(match.group(4))
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3).lower(),
                    'amount': amount_info['amount'],
                    'currency': amount_info['currency'],
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 7: Simple space-separated format
            # Example: user123 2023-05-14 14:05:31 top-up 500 Location Device
            pattern7 = r'^(\w+) (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ([\w-]+) ([\d,.]+) (\S+) (.+)$'
            match = re.match(pattern7, line)
            if match:
                record.update({
                    'user_id': match.group(1),
                    'datetime': self.parse_datetime(match.group(2)),
                    'transaction_type': match.group(3),
                    'amount': float(match.group(4).replace(',', '')),
                    'currency': 'GBP',  # Default for this pattern
                    'location': self.clean_field(match.group(5)),
                    'device': self.clean_field(match.group(6))
                })
                return record

            # Pattern 8: Alternative triple colon format (DD/MM/YYYY)
            # Example: 14/05/2023 14:05:31 ::: user123 *** TOP-UP ::: amt:500£ @ Location <Device>
            pattern8 = r'^(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}) ::: (\w+) \*\*\* ([\w-]+) ::: amt:([\d,.]+)([€£$]) @ ([^<]+) <([^>]+)>$'
            match = re.match(pattern8, line)
            if match:
                amount = float(match.group(4).replace(',', ''))
                currency_symbol = match.group(5)
                currency = self.get_currency_code(currency_symbol)
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3).lower(),
                    'amount': amount,
                    'currency': currency,
                    'location': self.clean_field(match.group(6)),
                    'device': self.clean_field(match.group(7))
                })
                return record

            # Pattern 9: Triple colon format with unicode currency (DD/MM/YYYY)
            # Example: 04/07/2025 00:41:51 ::: user1044 *** REFUND ::: amt:3491.94â‚¬ @ Manchester <Huawei P30>
            # This pattern specifically handles malformed unicode currency symbols
            pattern9 = r'^(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}) ::: (\w+) \*\*\* ([\w-]+) ::: amt:([\d,.]+)(â‚¬|€|£|Â£|\$) @ ([^<]+) <([^>]+)>$'
            match = re.match(pattern9, line)
            if match:
                amount = float(match.group(4).replace(',', ''))
                currency_symbol = match.group(5)
                # Use the clean_field method to handle unicode issues first
                cleaned_currency = self.clean_field(currency_symbol)
                currency = self.get_currency_code(cleaned_currency)
                record.update({
                    'datetime': self.parse_datetime(match.group(1)),
                    'user_id': match.group(2),
                    'transaction_type': match.group(3).lower(),
                    'amount': amount,
                    'currency': currency,
                    'location': self.clean_field(match.group(6)),
                    'device': self.clean_field(match.group(7))
                })
                return record

        except Exception as e:
            print(f"Error parsing line {row_id}: {line[:50]}... - {str(e)}")
            return None

        # No pattern matched - this will help debug missing patterns
        print(f"No pattern matched for line {row_id}: {line[:100]}...")
        return None

In [None]:
class DataIngestion:
    def __init__(self, config: DataIngestionConfig):
        self.config = config  

    def load_file(self):
        file_path = Path(self.config.file_path)
        output_path = Path(self.config.local_data_dir) / "anomaly_detection.csv"
        
        os.makedirs(output_path.parent, exist_ok=True)
        print(file_path)
        
        try:
            df = pd.read_csv(f"c:\\Users\\user\\Desktop\\{file_path}")
            if df.empty:
                logging.warning(f"The file {file_path} is empty")
            else:
                logging.info(f"The file {file_path} loaded successfully with {len(df)} rows")
                print(df.head())

                # Save a local copy
                df.to_csv(output_path, index=False)
                logging.info(f"Data saved to {output_path}")

        except FileNotFoundError:
            logging.error(f"The file {file_path} was not found")
        except Exception as e:
            logging.error(f"An error occurred while loading the file: {e}")
            
    def clean_data(self):
        file_path = Path(self.config.file_path)
        clean_output_path = Path(self.config.clean_path)
        os.makedirs(clean_output_path.parent, exist_ok=True)
        try:
            df = TransactionCleaner().clean_transaction_logs(csv_file_path = f"c:\\Users\\user\\Desktop\\{file_path}")
            if df.empty:
                logging.warning(f"The file {file_path} is empty")
            else:
                logging.info(f"The file {file_path} loaded successfully with {len(df)} rows")
                print(df.head())
                # Save a local copy
                df.to_csv(clean_output_path, index=False)
                logging.info(f"Data saved to {clean_output_path}")
        except FileNotFoundError:
            logging.error(f"The file {file_path} was not found")
        except Exception as e:
            logging.error(f"An error occurred while loading the file: {e}")
            
    def get_data(self) -> Optional[pd.DataFrame]:
        """A simple function to return Dataframe object"""
        file_path = Path(self.config.file_path)
        try:
            df = TransactionCleaner().clean_transaction_logs(csv_file_path = f"c:\\Users\\user\\Desktop\\{file_path}")
            if df.empty:
                logging.warning(f"The file {file_path} is empty")
            else:
                logging.info(f"The file {file_path} loaded successfully with {len(df)} rows")
                return df
        except FileNotFoundError:
            logging.error(f"The file {file_path} was not found")
        except Exception as e:
            logging.error(f"An error occurred while loading the file: {e}")
        

In [17]:
try:
    config = ConfigurationManager()
    data_ingestion_config = config.get_data_ingestion_config()
    data_ingestion = DataIngestion(config=data_ingestion_config)
    data_ingestion.load_file()
    data_ingestion.clean_data()
except Exception as e:
    raise e

2025-08-14 20:25:09 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\config\config.yaml loaded successfully.
2025-08-14 20:25:09 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\params.yaml loaded successfully.
2025-08-14 20:25:09 - INFO - [common.py:34] - YAML file C:\Users\user\Desktop\moniepoint\schema.yaml loaded successfully.
2025-08-14 20:25:09 - INFO - [common.py:88] - Created directory at: artifacts
2025-08-14 20:25:09 - INFO - [common.py:88] - Created directory at: artifacts/data_ingestion
2025-08-14 20:25:09 - INFO - [641458980.py:17] - The file moniepoint\monie_point_data\synthetic_dirty_transaction_logs.csv loaded successfully with 10000 rows
2025-08-14 20:25:09 - INFO - [641458980.py:22] - Data saved to artifacts\data_ingestion\anomaly_detection.csv


moniepoint\monie_point_data\synthetic_dirty_transaction_logs.csv
                                             raw_log
0  2025-07-05 19:18:10::user1069::withdrawal::299...
1                                                NaN
2                                      MALFORMED_LOG
3  usr:user1076|cashout|€4821.85|Glasgow|2025-07-...
4  2025-07-20 05:38:14 >> [user1034] did top-up -...


2025-08-14 20:25:09 - INFO - [641458980.py:38] - The file moniepoint\monie_point_data\synthetic_dirty_transaction_logs.csv loaded successfully with 7774 rows


   row_id                                       original_log  \
0       1  2025-07-05 19:18:10::user1069::withdrawal::299...   
1       2  usr:user1076|cashout|€4821.85|Glasgow|2025-07-...   
2       3  2025-07-20 05:38:14 >> [user1034] did top-up -...   
3       4  2025-06-13 10:04:51 >> [user1068] did deposit ...   
4       5  2025-07-29 23:47:37 | user: user1014 | txn: de...   

             datetime   user_id transaction_type   amount currency location  \
0 2025-07-05 19:18:10  user1069       withdrawal  2995.12      GBP   London   
1 2025-07-15 12:56:05  user1076          cashout  4821.85      EUR  Glasgow   
2 2025-07-20 05:38:14  user1034           top_up  2191.06      EUR  Unknown   
3 2025-06-13 10:04:51  user1068          deposit  1691.09      EUR  Glasgow   
4 2025-07-29 23:47:37  user1014          deposit  3539.50      GBP  Glasgow   

      device  
0  iPhone 13  
1    Pixel 6  
2  iPhone 13  
3    Unknown  
4  iPhone 13  


2025-08-14 20:25:10 - INFO - [641458980.py:42] - Data saved to artifacts\data_ingestion\cleaned_anomaly_detection.csv


In [18]:
df1 = pd.read_csv(r"C:\Users\user\Desktop\moniepoint\artifacts\data_ingestion\anomaly_detection.csv")
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   raw_log  8876 non-null   object
dtypes: object(1)
memory usage: 78.3+ KB


In [20]:
print(df1.isna().sum())

raw_log    1124
dtype: int64


In [21]:
len(df1.loc[df1['raw_log'] == "MALFORMED_LOG"])

1102

In [23]:
df2 = pd.read_csv(r"C:\Users\user\Desktop\moniepoint\artifacts\data_ingestion\cleaned_anomaly_detection.csv")
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7774 entries, 0 to 7773
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   row_id            7774 non-null   int64  
 1   original_log      7774 non-null   object 
 2   datetime          7774 non-null   object 
 3   user_id           7774 non-null   object 
 4   transaction_type  7774 non-null   object 
 5   amount            7774 non-null   float64
 6   currency          7774 non-null   object 
 7   location          7774 non-null   object 
 8   device            7774 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 546.7+ KB


In [24]:
df1.head()

Unnamed: 0,raw_log
0,2025-07-05 19:18:10::user1069::withdrawal::299...
1,
2,MALFORMED_LOG
3,usr:user1076|cashout|€4821.85|Glasgow|2025-07-...
4,2025-07-20 05:38:14 >> [user1034] did top-up -...


In [25]:
df1.tail(100)

Unnamed: 0,raw_log
9900,2025-07-14 17:39:56 >> [user1070] did purchase...
9901,19/06/2025 01:28:37 ::: user1031 *** TRANSFER ...
9902,usr:user1058|withdrawal|$1461.55|Glasgow|2025-...
9903,usr:user1048|cashout|€4394.98|Cardiff|2025-06-...
9904,2025-07-26 19:37:26 - user=user1083 - action=r...
...,...
9995,2025-07-31 07:41:34::user1024::deposit::1331.2...
9996,
9997,2025-06-23 05:14:03 >> [user1029] did withdraw...
9998,


In [26]:
df2.tail(100)

Unnamed: 0,row_id,original_log,datetime,user_id,transaction_type,amount,currency,location,device
7674,7675,usr:user1050|transfer|€3942.66|London|2025-07-...,2025-07-27 03:22:50,user1050,transfer,3942.66,EUR,London,iPhone 13
7675,7676,user1011 2025-06-07 12:21:48 top-up 1525.12 Gl...,2025-06-07 12:21:48,user1011,top-up,1525.12,GBP,Glasgow,Huawei P30
7676,7677,2025-07-13 21:28:28::user1076::refund::2392.79...,2025-07-13 21:28:28,user1076,refund,2392.79,GBP,Cardiff,Samsung Galaxy S10
7677,7678,2025-07-17 06:19:19::user1005::debit::890.5::L...,2025-07-17 06:19:19,user1005,debit,890.50,GBP,London,iPhone 13
7678,7679,user1097 2025-07-09 16:02:39 purchase 3331.1 C...,2025-07-09 16:02:39,user1097,purchase,3331.10,GBP,Cardiff,Unknown
...,...,...,...,...,...,...,...,...,...
7769,7770,usr:user1058|debit|£3992.06|Leeds|2025-06-10 0...,2025-06-10 03:55:41,user1058,debit,3992.06,GBP,Leeds,Pixel 6
7770,7771,08/07/2025 08:50:09 ::: user1069 *** WITHDRAWA...,2025-07-08 08:50:09,user1069,withdrawal,1401.58,EUR,Birmingham,Nokia 3310
7771,7772,2025-07-31 07:41:34::user1024::deposit::1331.2...,2025-07-31 07:41:34,user1024,deposit,1331.20,GBP,Cardiff,Nokia 3310
7772,7773,2025-06-23 05:14:03 >> [user1029] did withdraw...,2025-06-23 05:14:03,user1029,withdrawal,4203.63,GBP,London,Xiaomi Mi 11


In [27]:
df2[df2['user_id'] == "user1010"]

Unnamed: 0,row_id,original_log,datetime,user_id,transaction_type,amount,currency,location,device
25,26,02/07/2025 12:28:08 ::: user1010 *** TRANSFER ...,2025-07-02 12:28:08,user1010,transfer,4881.05,GBP,Leeds,iPhone 13
101,102,user1010 2025-06-14 17:47:12 top-up 1874.83 Gl...,2025-06-14 17:47:12,user1010,top-up,1874.83,GBP,Glasgow,Unknown
121,122,user1010 2025-07-16 14:14:11 top-up 79.89 Lond...,2025-07-16 14:14:11,user1010,top-up,79.89,GBP,London,Samsung Galaxy S10
125,126,2025-06-17 12:08:53 - user=user1010 - action=d...,2025-06-17 12:08:53,user1010,debit,2528.25,USD,Manchester,Nokia 3310
249,250,usr:user1010|refund|€4381.42|Glasgow|2025-07-0...,2025-07-06 03:09:38,user1010,refund,4381.42,EUR,Glasgow,Xiaomi Mi 11
...,...,...,...,...,...,...,...,...,...
7523,7524,2025-06-12 22:24:42 >> [user1010] did debit - ...,2025-06-12 22:24:42,user1010,debit,547.73,GBP,Unknown,Huawei P30
7548,7549,user1010 2025-07-04 17:34:02 withdrawal 3545.7...,2025-07-04 17:34:02,user1010,withdrawal,3545.79,GBP,London,Nokia 3310
7599,7600,user1010 2025-07-27 07:02:30 transfer 4851.53 ...,2025-07-27 07:02:30,user1010,transfer,4851.53,GBP,Cardiff,Nokia 3310
7619,7620,2025-07-23 11:38:41::user1010::deposit::1939.7...,2025-07-23 11:38:41,user1010,deposit,1939.71,GBP,Liverpool,Unknown


In [28]:
df2.columns

Index(['row_id', 'original_log', 'datetime', 'user_id', 'transaction_type',
       'amount', 'currency', 'location', 'device'],
      dtype='object')

In [29]:
df2.tail(30)

Unnamed: 0,row_id,original_log,datetime,user_id,transaction_type,amount,currency,location,device
7744,7745,2025-07-21 03:57:10 >> [user1090] did refund -...,2025-07-21 03:57:10,user1090,refund,1244.33,GBP,Liverpool,Unknown
7745,7746,usr:user1097|deposit|£109.2|Leeds|2025-06-03 1...,2025-06-03 14:30:36,user1097,deposit,109.2,GBP,Leeds,Xiaomi Mi 11
7746,7747,usr:user1013|withdrawal|€2943.81|Birmingham|20...,2025-07-18 07:09:20,user1013,withdrawal,2943.81,EUR,Birmingham,Nokia 3310
7747,7748,2025-06-20 17:13:08::user1091::refund::4114.88...,2025-06-20 17:13:08,user1091,refund,4114.88,GBP,Glasgow,iPhone 13
7748,7749,2025-06-30 13:51:38::user1081::refund::1726.99...,2025-06-30 13:51:38,user1081,refund,1726.99,GBP,Liverpool,Xiaomi Mi 11
7749,7750,05/06/2025 19:28:19 ::: user1084 *** REFUND ::...,2025-06-05 19:28:19,user1084,refund,2082.07,USD,Cardiff,Unknown
7750,7751,14/06/2025 13:08:45 ::: user1027 *** PURCHASE ...,2025-06-14 13:08:45,user1027,purchase,855.77,EUR,Leeds,Huawei P30
7751,7752,user1037 2025-06-16 01:21:14 transfer 1934.83 ...,2025-06-16 01:21:14,user1037,transfer,1934.83,GBP,Cardiff,iPhone 13
7752,7753,2025-07-27 20:27:36 - user=user1007 - action=d...,2025-07-27 20:27:36,user1007,deposit,4561.6,USD,Birmingham,iPhone 13
7753,7754,usr:user1003|cashout|£3342.45|Liverpool|2025-0...,2025-07-10 20:29:16,user1003,cashout,3342.45,GBP,Liverpool,Nokia 3310


In [44]:
df = pd.read_csv(r"C:\Users\user\Desktop\moniepoint\artifacts\data_ingestion\cleaned_anomaly_detection.csv")
df.head(30)

Unnamed: 0,row_id,original_log,datetime,user_id,transaction_type,amount,currency,location,device
0,1,2025-07-05 19:18:10::user1069::withdrawal::299...,2025-07-05 19:18:10,user1069,withdrawal,2995.12,GBP,London,iPhone 13
1,2,usr:user1076|cashout|€4821.85|Glasgow|2025-07-...,2025-07-15 12:56:05,user1076,cashout,4821.85,EUR,Glasgow,Pixel 6
2,3,2025-07-20 05:38:14 >> [user1034] did top-up -...,2025-07-20 05:38:14,user1034,top_up,2191.06,EUR,Unknown,iPhone 13
3,4,2025-06-13 10:04:51 >> [user1068] did deposit ...,2025-06-13 10:04:51,user1068,deposit,1691.09,EUR,Glasgow,Unknown
4,5,2025-07-29 23:47:37 | user: user1014 | txn: de...,2025-07-29 23:47:37,user1014,deposit,3539.5,GBP,Glasgow,iPhone 13
5,6,2025-06-23 14:45:58 - user=user1075 - action=d...,2025-06-23 14:45:58,user1075,debit,1215.74,USD,Leeds,Samsung Galaxy S10
6,7,2025-07-31 06:50:50 | user: user1071 | txn: ca...,2025-07-31 06:50:50,user1071,cashout,1772.13,USD,Unknown,Nokia 3310
7,8,2025-07-07 20:42:12 - user=user1098 - action=d...,2025-07-07 20:42:12,user1098,deposit,304.0,EUR,Birmingham,Nokia 3310
8,9,24/07/2025 22:47:06 ::: user1080 *** PURCHASE ...,2025-07-24 22:47:06,user1080,purchase,951.85,USD,Liverpool,Xiaomi Mi 11
9,10,2025-06-10 20:06:30 >> [user1025] did withdraw...,2025-06-10 20:06:30,user1025,withdrawal,3261.07,EUR,London,iPhone 13


In [46]:
df['transaction_type'].value_counts()

transaction_type
cashout       1015
debit         1007
deposit        991
withdrawal     971
transfer       958
purchase       940
refund         905
top-up         870
top_up         117
Name: count, dtype: int64

In [31]:
print(df.shape)

(7774, 9)


In [32]:
df2 = pd.read_csv(r"C:\Users\user\Desktop\moniepoint\artifacts\data_ingestion\anomaly_detection.csv")
print(df2.shape)

(10000, 1)


In [33]:
missing_count = df2.isna().sum()
mal_count = len(df2.loc[df2['raw_log'] == "MALFORMED_LOG"])

print(df2.shape[0] - (missing_count + mal_count))

raw_log    7774
dtype: int64


In [34]:
import numpy as np

In [35]:
class FeatureEngineer:
    def __init__(self, df: pd.DataFrame):
        """Initialize with a dataset for batch feature engineering."""
        self.df = df.copy()
        self.df["datetime"] = pd.to_datetime(self.df["datetime"])

    def _basic_time_features(self, df):
        """Generate time-based features."""
        df = df.copy()  # 
        df["day_of_week"] = df["datetime"].dt.dayofweek
        df["hour_of_day"] = df["datetime"].dt.hour
        df["month"] = df["datetime"].dt.month
        df["quarter"] = df["datetime"].dt.quarter
        df["is_weekend"] = df["day_of_week"] >= 5
        df["day_of_month"] = df["datetime"].dt.day
        df["is_business_hours"] = df["hour_of_day"].between(9, 17)
        return df

    def _user_behavior_features(self, df):
        """Generate rolling and user-specific behavior features."""
        df = df.copy() 
        df = df.sort_values(["user_id", "datetime"]).reset_index(drop=True)
        
        # Set datetime as index temporarily for rolling calculations
        original_index = df.index
        df = df.set_index('datetime')
        
        # Transaction count last 7 days
        df['transaction_count_last_7_days'] = (
            df.groupby('user_id')['amount']
            .rolling('7D', closed='right')
            .count()
            .reset_index(level=0, drop=True)
        )
        
        # Average transaction amount last 10 days        
        df["average_transaction_amount_last_10_days"] = (
            df.groupby('user_id')['amount']
            .rolling('10D', closed='right')
            .mean()
            .reset_index(level=0, drop=True)
        )
        
        # Reset index to bring datetime back as a column
        df = df.reset_index()
        df.index = original_index

        # Days since last transaction
        df["days_since_last_transaction"] = (
            df.groupby("user_id")["datetime"].diff().dt.days
        )

        # Unique locations used so far
        df["unique_locations_used"] = (
            df.groupby("user_id")["location"]
            .transform(lambda x: [len(set(x.iloc[:i+1])) for i in range(len(x))])
        )
        return df

    def _statistical_features(self, df):
        """Generate statistical anomaly-related features."""
        df = df.copy()  # Prevent modifying original
        df["amount_z_score_user"] = df.groupby("user_id")["amount"].transform(
            lambda x: (x - x.mean()) / (x.std() + 1e-9)
        )
        df["hours_since_last_transaction_user"] = df.groupby("user_id")["datetime"].diff().dt.total_seconds() / 3600
        df["transaction_count_today_user"] = df.groupby(["user_id", df["datetime"].dt.date])["amount"].transform("count")
        df["amount_percentile_user"] = df.groupby("user_id")["amount"].transform(
            lambda x: x.rank(pct=True)
        )
        return df

    def engineer_batch(self):
        """Process the whole DataFrame."""
        df = self._basic_time_features(self.df)
        df = self._user_behavior_features(df)
        df = self._statistical_features(df)
        return df

    def engineer_single(self, user_id: str, transaction: dict):
        """
        Process a single transaction for a given user.
        transaction: dict like {"datetime": "2025-08-14 10:00:00", "amount": 150,
                                "transaction_type": "purchase", "currency": "USD",
                                "location": "NY", "device": "mobile"}
        """
        transaction_df = pd.DataFrame([transaction])
        transaction_df["datetime"] = pd.to_datetime(transaction_df["datetime"])
        transaction_df["user_id"] = user_id  

        # Check if user exists and meets activity threshold
        user_data = self.df[self.df["user_id"] == user_id]
        if len(user_data) >= 20 and (user_data["datetime"].max() - user_data["datetime"].min()).days >= 7:
            # Append the new transaction for feature calculation
            combined = pd.concat([user_data, transaction_df], ignore_index=True)
            combined = self._basic_time_features(combined)
            combined = self._user_behavior_features(combined)
            combined = self._statistical_features(combined)

            # Return only the last row (the new transaction features)
            return combined.iloc[[-1]].reset_index(drop=True)
        else:
            # Return only basic time + raw features (rule-based)
            transaction_df = self._basic_time_features(transaction_df)
            return transaction_df.reset_index(drop=True)

In [36]:
# Example historical data
df = pd.DataFrame({
    "datetime": pd.date_range(start="2025-07-01", periods=25, freq="D").tolist() * 2,
    "user_id": ["U1"] * 25 + ["U2"] * 25,
    "transaction_type": ["purchase"] * 50,
    "amount": np.random.randint(10, 500, size=50),
    "currency": ["USD"] * 50,
    "location": np.random.choice(["NY", "LA", "SF"], size=50),
    "device": np.random.choice(["mobile", "desktop"], size=50)
})

print(df)

     datetime user_id transaction_type  amount currency location   device
0  2025-07-01      U1         purchase     344      USD       SF  desktop
1  2025-07-02      U1         purchase     497      USD       LA  desktop
2  2025-07-03      U1         purchase     236      USD       NY   mobile
3  2025-07-04      U1         purchase     451      USD       NY  desktop
4  2025-07-05      U1         purchase     323      USD       LA   mobile
5  2025-07-06      U1         purchase     469      USD       LA  desktop
6  2025-07-07      U1         purchase     430      USD       NY   mobile
7  2025-07-08      U1         purchase     333      USD       NY   mobile
8  2025-07-09      U1         purchase     387      USD       LA   mobile
9  2025-07-10      U1         purchase     173      USD       LA  desktop
10 2025-07-11      U1         purchase      85      USD       LA  desktop
11 2025-07-12      U1         purchase     392      USD       SF  desktop
12 2025-07-13      U1         purchase

In [37]:
fe = FeatureEngineer(df)
batch_features = fe.engineer_batch()
print(batch_features)

     datetime user_id transaction_type  amount currency location   device  \
0  2025-07-01      U1         purchase     344      USD       SF  desktop   
1  2025-07-02      U1         purchase     497      USD       LA  desktop   
2  2025-07-03      U1         purchase     236      USD       NY   mobile   
3  2025-07-04      U1         purchase     451      USD       NY  desktop   
4  2025-07-05      U1         purchase     323      USD       LA   mobile   
5  2025-07-06      U1         purchase     469      USD       LA  desktop   
6  2025-07-07      U1         purchase     430      USD       NY   mobile   
7  2025-07-08      U1         purchase     333      USD       NY   mobile   
8  2025-07-09      U1         purchase     387      USD       LA   mobile   
9  2025-07-10      U1         purchase     173      USD       LA  desktop   
10 2025-07-11      U1         purchase      85      USD       LA  desktop   
11 2025-07-12      U1         purchase     392      USD       SF  desktop   

In [38]:
# Simulate new UI transaction
new_transaction = {
    "datetime": "2025-08-14 12:00:00",
    "user_id": "U1",
    "transaction_type": "purchase",
    "amount": 150,
    "currency": "USD",
    "location": "NY",
    "device": "mobile"
}
single_features = fe.engineer_single(user_id="U1", transaction=new_transaction)

print(single_features)

             datetime user_id transaction_type  amount currency location  \
0 2025-08-14 12:00:00      U1         purchase     150      USD       NY   

   device  day_of_week  hour_of_day  month  ...  day_of_month  \
0  mobile            3           12      8  ...            14   

   is_business_hours  transaction_count_last_7_days  \
0               True                            1.0   

   average_transaction_amount_last_10_days  days_since_last_transaction  \
0                                    150.0                         20.0   

   unique_locations_used  amount_z_score_user  \
0                      3            -0.925334   

   hours_since_last_transaction_user  transaction_count_today_user  \
0                              492.0                             1   

   amount_percentile_user  
0                0.230769  

[1 rows x 22 columns]


In [39]:
print(single_features)

             datetime user_id transaction_type  amount currency location  \
0 2025-08-14 12:00:00      U1         purchase     150      USD       NY   

   device  day_of_week  hour_of_day  month  ...  day_of_month  \
0  mobile            3           12      8  ...            14   

   is_business_hours  transaction_count_last_7_days  \
0               True                            1.0   

   average_transaction_amount_last_10_days  days_since_last_transaction  \
0                                    150.0                         20.0   

   unique_locations_used  amount_z_score_user  \
0                      3            -0.925334   

   hours_since_last_transaction_user  transaction_count_today_user  \
0                              492.0                             1   

   amount_percentile_user  
0                0.230769  

[1 rows x 22 columns]


In [40]:
# Simulate a brand new user transaction (never seen in historical data)
new_user_transaction = {
    "datetime": "2025-08-14 15:30:00",
    "user_id": "U999",  # ID not in the historical df
    "transaction_type": "purchase",
    "amount": 300,
    "currency": "USD",
    "location": "SF",
    "device": "desktop"
}

single_features_new_user = fe.engineer_single(
    user_id="U999", 
    transaction=new_user_transaction
)

print(single_features_new_user)

             datetime user_id transaction_type  amount currency location  \
0 2025-08-14 15:30:00    U999         purchase     300      USD       SF   

    device  day_of_week  hour_of_day  month  quarter  is_weekend  \
0  desktop            3           15      8        3       False   

   day_of_month  is_business_hours  
0            14               True  


In [42]:
single_features_new_user

Unnamed: 0,datetime,user_id,transaction_type,amount,currency,location,device,day_of_week,hour_of_day,month,quarter,is_weekend,day_of_month,is_business_hours
0,2025-08-14 15:30:00,U999,purchase,300,USD,SF,desktop,3,15,8,3,False,14,True


In [2]:
import pandas as pd 

In [3]:
df = pd.read_csv(r"C:\Users\user\Desktop\moniepoint\artifacts\data_ingestion\cleaned_anomaly_detection.csv")
df.head()

Unnamed: 0,row_id,original_log,datetime,user_id,transaction_type,amount,currency,location,device
0,1,2025-07-05 19:18:10::user1069::withdrawal::299...,2025-07-05 19:18:10,user1069,withdrawal,2995.12,GBP,London,iPhone 13
1,2,usr:user1076|cashout|€4821.85|Glasgow|2025-07-...,2025-07-15 12:56:05,user1076,cashout,4821.85,EUR,Glasgow,Pixel 6
2,3,2025-07-20 05:38:14 >> [user1034] did top-up -...,2025-07-20 05:38:14,user1034,top_up,2191.06,EUR,Unknown,iPhone 13
3,4,2025-06-13 10:04:51 >> [user1068] did deposit ...,2025-06-13 10:04:51,user1068,deposit,1691.09,EUR,Glasgow,Unknown
4,5,2025-07-29 23:47:37 | user: user1014 | txn: de...,2025-07-29 23:47:37,user1014,deposit,3539.5,GBP,Glasgow,iPhone 13


In [4]:
df['transaction_type'].unique()

array(['withdrawal', 'cashout', 'top_up', 'deposit', 'debit', 'purchase',
       'top-up', 'refund', 'transfer'], dtype=object)