# Data Exploration: African Financial Crime Datasets

This notebook explores the synthetic Suspicious Activity Report (SAR) dataset and visualizes financial crime patterns across African countries.

## Contents
1. Load and inspect SAR data
2. Explore crime typologies
3. Country-wise distribution
4. Transaction amount analysis
5. Red flag patterns
6. Subject type analysis
7. Key insights

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Plot defaults: seaborn 'whitegrid' theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

print("✅ Libraries imported successfully")

## 1. Load SAR Data

In [None]:
# Load synthetic SAR data
def load_sar_data(filepath):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the UTF-8 file at ``filepath`` is parsed with
    ``json.loads``; lines that are empty or whitespace-only are skipped.
    Records are returned in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

# Location of the generated dataset; edit if the notebook runs from another directory.
data_path = '../data/raw/synthetic_sars.jsonl'

if not Path(data_path).exists():
    # Nothing to load: point the user at the generator script.
    # NOTE: `sars` and `df` are not created on this path, so the cells below
    # will raise NameError until the data file exists.
    print(f"❌ Data file not found at {data_path}")
    print("Please run: python data/scripts/generate_synthetic_sars.py --count 100")
else:
    sars = load_sar_data(data_path)
    print(f"✅ Loaded {len(sars)} SARs")

    # Tabular view of the records, used by every analysis cell that follows.
    df = pd.DataFrame(sars)
    print(f"\nDataFrame shape: {df.shape}")
    print(f"\nColumns: {list(df.columns)}")

In [None]:
# Preview the first five records (must stay the last expression so the cell renders it)
df.head(n=5)

In [None]:
# Headline figures: record count, distinct countries and typologies, reporting window
print("Dataset Summary:")
print(f"Total SARs: {df.shape[0]}")

n_countries = df['country'].nunique()
print(f"\nCountries covered: {n_countries}")

n_typologies = df['typology'].nunique()
print(f"Typologies covered: {n_typologies}")

# NOTE(review): min/max compare the raw column values; if report_date holds
# strings this is a lexicographic comparison - confirm the date format.
report_dates = df['report_date']
print(f"\nDate range: {report_dates.min()} to {report_dates.max()}")

## 2. Crime Typology Distribution

In [None]:
# Number of SARs per typology, most frequent first
typology_counts = df['typology'].value_counts()

# Vertical bar chart of the counts on a fresh 12x6 figure
plt.figure(figsize=(12, 6))
typology_counts.plot.bar(color='steelblue')
plt.title('Financial Crime Typology Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Crime Typology', fontsize=12)
plt.ylabel('Number of SARs', fontsize=12)
# Slant the category labels and anchor them at their right edge
plt.xticks(rotation=45, horizontalalignment='right')
plt.tight_layout()
plt.show()

# Same numbers as text
print("\nTypology Statistics:", typology_counts, sep="\n")

## 3. Country-wise Analysis

In [None]:
# Number of SARs per country, most frequent first
country_counts = df['country'].value_counts()

# Horizontal bar chart of the counts on a fresh 12x6 figure
plt.figure(figsize=(12, 6))
country_counts.plot.barh(color='coral')
plt.title('SAR Distribution by Country', fontsize=16, fontweight='bold')
plt.xlabel('Number of SARs', fontsize=12)
plt.ylabel('Country', fontsize=12)
plt.tight_layout()
plt.show()

# Same numbers as text
print("\nCountry Statistics:", country_counts, sep="\n")

In [None]:
# Contingency table: rows are countries, columns are typologies, cells are SAR counts
heatmap_data = pd.crosstab(index=df['country'], columns=df['typology'])

# Annotated heatmap of the table; fmt='d' prints the counts as integers
plt.figure(figsize=(14, 8))
sns.heatmap(
    data=heatmap_data,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
    linewidths=0.5,
)
plt.title('Crime Typology Distribution by Country', fontsize=16, fontweight='bold')
plt.xlabel('Crime Typology', fontsize=12)
plt.ylabel('Country', fontsize=12)
plt.xticks(rotation=45, horizontalalignment='right')
plt.tight_layout()
plt.show()

## 4. Transaction Amount Analysis

In [None]:
# Numeric copy of total_amount; values that cannot be parsed become NaN
df['total_amount_numeric'] = pd.to_numeric(df['total_amount'], errors='coerce')

# One figure, two panels: overall histogram (left) and per-country box plot (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram of all parsed amounts (NaN rows dropped)
hist_ax.hist(df['total_amount_numeric'].dropna(), bins=30, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribution of Transaction Amounts', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Amount', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)

# Right panel: amounts for the five countries with the most SARs
top_countries = df['country'].value_counts().index[:5]
df_top = df.loc[df['country'].isin(top_countries)]
df_top.boxplot(column='total_amount_numeric', by='country', ax=box_ax)
box_ax.set_title('Transaction Amounts by Top 5 Countries', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Country', fontsize=12)
box_ax.set_ylabel('Amount', fontsize=12)
# Blank out the figure-level title that the grouped boxplot adds
fig.suptitle('')

plt.tight_layout()
plt.show()

# Descriptive statistics of the parsed amounts
print("\nAmount Statistics:", df['total_amount_numeric'].describe(), sep="\n")

## 5. Red Flag Analysis

In [None]:
# Flatten the per-SAR red flag lists into one list; rows whose value is not a
# list are skipped
all_red_flags = [
    flag
    for flags in df['red_flags']
    if isinstance(flags, list)
    for flag in flags
]

# Frequency of each flag, and the 15 most frequent as an ordered dict
red_flag_counts = Counter(all_red_flags)
top_red_flags = dict(red_flag_counts.most_common(15))

# Horizontal bar chart of the top flags
plt.figure(figsize=(12, 8))
plt.barh(list(top_red_flags), list(top_red_flags.values()), color='indianred')
plt.title('Most Common Red Flags', fontsize=16, fontweight='bold')
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Red Flag', fontsize=12)
plt.tight_layout()
plt.show()

# Distinct flag labels vs. total occurrences
print(f"\nTotal unique red flags: {len(red_flag_counts)}")
print(f"Total red flag occurrences: {len(all_red_flags)}")

## 6. Subject Type Analysis

In [None]:
# Number of SARs per subject type
subject_counts = df['subject_type'].value_counts()

# Pie chart of the shares, labelled with one-decimal percentages
plt.figure(figsize=(8, 8))
plt.pie(
    x=subject_counts,
    labels=subject_counts.index,
    autopct='%1.1f%%',
    colors=['lightblue', 'lightcoral'],
    startangle=90,
)
plt.title('Subject Type Distribution', fontsize=16, fontweight='bold')
# Equal axis scaling keeps the pie circular
plt.axis('equal')
plt.show()

# Same numbers as text
print("\nSubject Type Statistics:", subject_counts, sep="\n")

## 7. Key Insights

### Summary
- Dataset provides comprehensive coverage of African financial crime scenarios
- Multiple typologies represented across 10 African countries
- Transaction amounts vary significantly by country and crime type
- Common red flags identified for training the model

### Next Steps
1. Proceed to notebook 02 for data preparation
2. Format data for instruction-tuning
3. Create train/validation/test splits

In [None]:
# Collect headline statistics for the whole dataset and print them.
# Nothing is written to disk; `summary` only lives in the notebook session.
# Depends on earlier cells for df['total_amount_numeric'], all_red_flags and
# red_flag_counts.
summary = {
    'total_sars': len(df),
    'countries': df['country'].nunique(),
    'typologies': df['typology'].nunique(),
    'avg_amount': df['total_amount_numeric'].mean(),
    # Every red flag occurrence across all SARs, duplicates included.
    'total_red_flags': len(all_red_flags),
    # Number of distinct red flag labels (previously reported as the total).
    'unique_red_flags': len(red_flag_counts),
}

print("\n" + "="*50)
print("DATASET SUMMARY")
print("="*50)
for key, value in summary.items():
    label = key.replace('_', ' ').title()
    # Floats (the mean amount) are rounded to whole units with thousands
    # separators; integer counts are printed unchanged.
    shown = f"{value:,.0f}" if isinstance(value, float) else value
    print(f"{label}: {shown}")
print("="*50)