# Fraud Transaction Statistical Analysis - Data Exploration Framework

This notebook sets up the Python environment and provides a data exploration framework for the statistical analysis of fraudulent transactions.

## Objective
- Load and explore `transaction_fraud_data.parquet` (if the file is missing, a synthetic sample dataset is generated and saved under that name)
- Perform comprehensive data exploration and preprocessing
- Generate data quality reports
- Create reusable functions for statistical analysis preparation

## Libraries and Environment Setup

In [None]:
# Core data manipulation and analysis libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Statistical analysis libraries
import scipy.stats as stats
from scipy.stats import normaltest, shapiro, kstest, jarque_bera
import statsmodels.api as sm
from statsmodels.stats.diagnostic import lilliefors
import pingouin as pg
from diptest import diptest

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Visualization libraries
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from matplotlib.patches import Rectangle

# Date and time handling
from datetime import datetime, timedelta
import pytz

# System and utility libraries
import os
import sys
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Union
import json

# Set display options: show every column, cap printed rows at 100, leave the
# display width unset (None) and truncate long cell values at 50 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Set plotting style: seaborn-flavoured matplotlib theme, 'husl' colour
# palette, 12x8 default figures and a 10pt base font.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# `import scipy.stats as stats` binds only the submodule, and scipy.stats does
# not expose __version__; import the top-level package to report the version.
import scipy

# Report the versions of the core libraries so a run can be reproduced later.
print('✅ All libraries imported successfully!')
print(f'📊 Pandas version: {pd.__version__}')
print(f'🔢 NumPy version: {np.__version__}')
print(f'📈 SciPy version: {scipy.__version__}')
print(f'🎨 Matplotlib version: {plt.matplotlib.__version__}')
print(f'🌊 Seaborn version: {sns.__version__}')

## Data Loading and Initial Exploration Functions

In [None]:
def create_sample_fraud_data(n_samples: int = 10000, fraud_rate: float = 0.05) -> pd.DataFrame:
    """
    Create a synthetic fraud transaction dataset modelled on the schema described in README.md

    The global NumPy random state is re-seeded with 42 on every call, so the
    same arguments always produce the same data (and the caller's global
    random stream is reset as a side effect).

    Parameters:
    -----------
    n_samples : int
        Number of transactions to generate (must not be negative)
    fraud_rate : float
        Proportion of fraudulent transactions (0-1); the number of fraud rows
        is int(n_samples * fraud_rate)

    Returns:
    --------
    pd.DataFrame
        Generated fraud transaction dataset with one row per transaction,
        including a nested 'last_hour_activity' dict column and a boolean
        'is_fraud' label

    Raises:
    -------
    ValueError
        If n_samples is negative or fraud_rate is outside [0, 1]
    """
    if n_samples < 0:
        raise ValueError(f'n_samples must be non-negative, got {n_samples}')
    if not 0 <= fraud_rate <= 1:
        raise ValueError(f'fraud_rate must be between 0 and 1, got {fraud_rate}')

    np.random.seed(42)  # For reproducibility

    # randint's upper bound is exclusive and must exceed the lower bound (1);
    # n_samples // 5 drops to 1 or 0 for fewer than 10 samples, so clamp it.
    customer_id_upper = max(2, n_samples // 5)

    # Generate base data (the order of the random draws is kept stable so the
    # seeded output does not change between versions)
    data = {
        'transaction_id': [f'TXN_{i:08d}' for i in range(n_samples)],
        'customer_id': [f'CUST_{np.random.randint(1, customer_id_upper):06d}' for _ in range(n_samples)],
        # 16-digit values do not fit in 32 bits; request int64 explicitly so the
        # call does not depend on the platform's default integer width.
        'card_number': np.random.randint(1000000000000000, 9999999999999999, n_samples, dtype=np.int64),
        'timestamp': pd.date_range('2024-09-30', '2024-10-30', periods=n_samples),
        'vendor_category': np.random.choice(['Retail', 'Travel', 'Entertainment', 'Healthcare', 'Education', 'Fuel', 'Restaurant'], n_samples),
        'vendor_type': np.random.choice(['online', 'offline', 'premium', 'fastfood'], n_samples),
        'vendor': [f'Vendor_{np.random.randint(1, 1000):03d}' for _ in range(n_samples)],
        'amount': np.random.lognormal(3, 1.5, n_samples),  # Log-normal distribution for realistic amounts
        'currency': np.random.choice(['USD', 'EUR', 'GBP', 'JPY', 'CAD', 'AUD'], n_samples, p=[0.4, 0.2, 0.1, 0.1, 0.1, 0.1]),
        'country': np.random.choice(['USA', 'UK', 'Germany', 'France', 'Japan', 'Canada', 'Australia'], n_samples),
        'city': [f'City_{np.random.randint(1, 500):03d}' for _ in range(n_samples)],
        'city_size': np.random.choice(['small', 'medium', 'large'], n_samples, p=[0.3, 0.4, 0.3]),
        'card_type': np.random.choice(['Basic Credit', 'Gold Credit', 'Platinum Credit', 'Debit'], n_samples, p=[0.4, 0.3, 0.2, 0.1]),
        'is_card_present': np.random.choice([True, False], n_samples, p=[0.6, 0.4]),
        'device': np.random.choice(['Chrome', 'Safari', 'iOS App', 'Android App', 'Firefox'], n_samples),
        'channel': np.random.choice(['web', 'mobile', 'pos'], n_samples, p=[0.4, 0.3, 0.3]),
        'device_fingerprint': [f'FP_{np.random.randint(1000000, 9999999):07d}' for _ in range(n_samples)],
        'ip_address': [f'{np.random.randint(1, 255)}.{np.random.randint(1, 255)}.{np.random.randint(1, 255)}.{np.random.randint(1, 255)}' for _ in range(n_samples)],
        'is_outside_home_country': np.random.choice([True, False], n_samples, p=[0.15, 0.85]),
        'is_high_risk_vendor': np.random.choice([True, False], n_samples, p=[0.2, 0.8]),
        # NOTE: drawn independently of 'timestamp', so this flag is not
        # guaranteed to agree with the actual day of the week.
        'is_weekend': np.random.choice([True, False], n_samples, p=[0.3, 0.7])
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Add last_hour_activity as nested structure (one dict per row)
    df['last_hour_activity'] = [
        {
            'num_transactions': np.random.randint(1, 10),
            'total_amount': np.random.uniform(10, 1000),
            'unique_merchants': np.random.randint(1, 5),
            'unique_countries': np.random.randint(1, 3),
            'max_single_amount': np.random.uniform(50, 500)
        } for _ in range(n_samples)
    ]

    # Pick the fraudulent rows uniformly at random, without replacement
    fraud_indices = np.random.choice(n_samples, int(n_samples * fraud_rate), replace=False)
    df['is_fraud'] = False
    df.loc[fraud_indices, 'is_fraud'] = True

    # Compute the mask and its size once instead of re-summing the column
    fraud_mask = df['is_fraud']
    n_fraud = int(fraud_mask.sum())

    # Make fraudulent transactions more realistic
    # Higher amounts for fraud (each amount scaled by a factor in [2, 5))
    df.loc[fraud_mask, 'amount'] *= np.random.uniform(2, 5, n_fraud)
    # More likely to be outside home country
    df.loc[fraud_mask, 'is_outside_home_country'] = np.random.choice([True, False], n_fraud, p=[0.7, 0.3])
    # More likely to be high risk vendor
    df.loc[fraud_mask, 'is_high_risk_vendor'] = np.random.choice([True, False], n_fraud, p=[0.8, 0.2])

    return df

In [None]:
def load_fraud_data(file_path: str = 'transaction_fraud_data.parquet') -> pd.DataFrame:
    """
    Return the fraud transaction dataset, reading it from disk when available

    If nothing exists at `file_path`, a sample dataset is generated with
    create_sample_fraud_data() using its default arguments, written to
    `file_path` as parquet (without the index) and returned.

    Parameters:
    -----------
    file_path : str
        Location of the parquet file to read, or to create when absent

    Returns:
    --------
    pd.DataFrame
        The loaded or freshly generated fraud transaction dataset
    """
    # Fallback first: nothing on disk, so build the sample data and persist it
    if not os.path.exists(file_path):
        print(f'⚠️  File {file_path} not found. Creating sample data...')
        sample = create_sample_fraud_data()
        sample.to_parquet(file_path, index=False)
        print(f'✅ Sample data created and saved to {file_path}! Shape: {sample.shape}')
        return sample

    # Normal path: the file exists, read it as-is
    print(f'📁 Loading data from {file_path}...')
    loaded = pd.read_parquet(file_path)
    print(f'✅ Data loaded successfully! Shape: {loaded.shape}')
    return loaded

# Obtain the working dataset through load_fraud_data() with its default path
df_fraud = load_fraud_data()

# Summarise the result: dimensions first, then the column names
print(f'\n📊 Dataset loaded with shape: {df_fraud.shape}')
print(f'📋 Columns: {df_fraud.columns.tolist()}')