# 02: Feature Engineering

This notebook implements corridor-normalised features that form the foundation of our context-aware fraud detection system.

## Objectives
1. Build corridor-specific statistical profiles
2. Implement normalised feature calculations
3. Demonstrate how normalisation improves signal quality

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from collections import defaultdict
import warnings
# Silence all warnings globally for the rest of the session.
# NOTE(review): this also hides warnings that may point at real problems —
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Plot styling applied to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Load data from previous notebook; 'timestamp' is parsed to datetimes
# because the profiling code below uses the `.dt` accessor on that column.
transactions = pd.read_csv('synthetic_transactions.csv', parse_dates=['timestamp'])
print(f'Loaded {len(transactions):,} transactions')

## 1. Build Corridor Statistical Profiles

Each corridor needs a statistical profile that captures "normal" behaviour for that specific payment route.

In [None]:
class CorridorProfiler:
    """
    Builds and stores statistical profiles for each payment corridor.
    These profiles define what 'normal' looks like for each route.

    Attributes:
        profiles: dict mapping corridor id -> profile dict with the keys
            'corridor', 'corridor_name', 'n_transactions',
            'n_unique_senders', 'amount', 'velocity' and 'temporal'.
        n_peak_hours: how many of the busiest hours are stored as
            'peak_hours' in each profile (default 5).
    """

    # Percentiles stored in each profile's 'amount' section, in output order.
    _PERCENTILES = (
        ('p25', 0.25),
        ('p75', 0.75),
        ('p90', 0.90),
        ('p95', 0.95),
        ('p99', 0.99),
    )

    def __init__(self, n_peak_hours=5):
        self.profiles = {}
        self.n_peak_hours = n_peak_hours

    def fit(self, transactions_df):
        """
        Calculate corridor profiles from historical transaction data.
        In production, this would be run on fraud-free (or low-fraud) historical data.

        Args:
            transactions_df: DataFrame with the columns 'corridor',
                'corridor_name', 'sender_id', 'amount' and a datetime
                'timestamp' column.

        Returns:
            self, so calls can be chained.
        """
        # Start from a clean slate so re-fitting never keeps corridors that
        # are absent from the new data.
        self.profiles = {}

        # One groupby pass instead of a full-frame boolean scan per corridor.
        # sort=False keeps first-appearance order (the order unique() gives),
        # and rows whose corridor is missing are dropped rather than turning
        # into an empty slice that cannot be profiled.
        for corridor, corridor_data in transactions_df.groupby('corridor', sort=False):
            if corridor_data.empty:
                # e.g. an unobserved category of a categorical column
                continue

            amounts = corridor_data['amount']

            # Amount statistics
            amount_stats = {
                'median': amounts.median(),
                'mean': amounts.mean(),
                'std': amounts.std(),  # NaN when the corridor has a single row
            }
            amount_stats.update(
                (label, amounts.quantile(q)) for label, q in self._PERCENTILES
            )

            # Velocity statistics (transactions per sender)
            sender_counts = corridor_data.groupby('sender_id').size()
            velocity_stats = {
                'median_txn_per_sender': sender_counts.median(),
                'mean_txn_per_sender': sender_counts.mean(),
                'p95_txn_per_sender': sender_counts.quantile(0.95),
            }

            # Temporal patterns, derived without copying or mutating the frame
            hours = corridor_data['timestamp'].dt.hour
            weekdays = corridor_data['timestamp'].dt.dayofweek

            temporal_stats = {
                'hour_distribution': hours.value_counts(normalize=True).to_dict(),
                'day_distribution': weekdays.value_counts(normalize=True).to_dict(),
                # Peak hours: the busiest hours by transaction volume
                'peak_hours': hours.value_counts().head(self.n_peak_hours).index.tolist(),
            }

            self.profiles[corridor] = {
                'corridor': corridor,
                'corridor_name': corridor_data['corridor_name'].iloc[0],
                'n_transactions': len(corridor_data),
                'n_unique_senders': corridor_data['sender_id'].nunique(),
                'amount': amount_stats,
                'velocity': velocity_stats,
                'temporal': temporal_stats,
            }

        return self

    def get_profile(self, corridor):
        """Retrieve profile for a specific corridor (None if it was never fitted)."""
        return self.profiles.get(corridor)

    def summary(self):
        """Return a DataFrame with one formatted summary row per corridor profile."""
        rows = [
            {
                'Corridor': profile['corridor_name'],
                'Transactions': profile['n_transactions'],
                'Unique Senders': profile['n_unique_senders'],
                'Median Amount': f"£{profile['amount']['median']:.0f}",
                '95th %ile Amount': f"£{profile['amount']['p95']:.0f}",
                'Avg Txn/Sender': f"{profile['velocity']['mean_txn_per_sender']:.1f}",
            }
            for profile in self.profiles.values()
        ]
        return pd.DataFrame(rows)

# Fit a profiler on the full transaction set (fit() returns the profiler itself)
profiler = CorridorProfiler().fit(transactions)

# Show the per-corridor summary table without the DataFrame index
print('=== Corridor Profiles ===')
corridor_summary = profiler.summary()
print(corridor_summary.to_string(index=False))

## 2. Implement Corridor-Normalised Features

Now we build features that are meaningful within the context of each corridor.

In [None]:
class CorridorFeatureEngine:
    """
    Calculates corridor-normalised features for fraud detection.
    Each feature is calibrated against corridor-specific baselines.

    Attributes:
        profiler: object exposing get_profile(corridor) that returns a
            profile dict (or None for an unknown corridor).
        sender_history: per-sender record with 'transactions' (list of
            dicts with 'timestamp', 'amount', 'corridor') and 'first_seen'.
            The 'beneficiaries' and 'devices' sets are initialised but not
            populated by this class.
    """

    def __init__(self, profiler):
        self.profiler = profiler
        self.sender_history = defaultdict(lambda: {
            'transactions': [],
            'beneficiaries': set(),
            'devices': set(),
            'first_seen': None,
        })

    def _build_sender_history(self, transactions_df):
        """
        Build sender transaction history for velocity calculations.

        Transactions are appended in timestamp order. Existing history is
        not cleared, so calling this twice with the same data records every
        transaction twice.
        """
        ordered = transactions_df.sort_values('timestamp')

        # Iterating the needed columns directly avoids building a Series
        # object per row, which is what iterrows() does.
        columns = (
            ordered['sender_id'],
            ordered['timestamp'],
            ordered['amount'],
            ordered['corridor'],
        )
        for sender, timestamp, amount, corridor in zip(*columns):
            record = self.sender_history[sender]
            record['transactions'].append({
                'timestamp': timestamp,
                'amount': amount,
                'corridor': corridor,
            })
            if record['first_seen'] is None:
                record['first_seen'] = timestamp

    def amount_deviation_score(self, amount, corridor):
        """
        Calculate how unusual the amount is for this corridor.
        Returns 0-1 score where higher = more unusual.

        Piecewise scale: 0 at or below the median, 0-0.5 up to p95,
        0.5-0.8 up to p99, then towards 1.0 (capped) above p99.
        Unknown corridors score a neutral 0.5.
        """
        profile = self.profiler.get_profile(corridor)
        if not profile:
            return 0.5  # Default for unknown corridors

        median = profile['amount']['median']
        p95 = profile['amount']['p95']
        p99 = profile['amount']['p99']

        if amount <= median:
            return 0.0
        if amount <= p95:
            # Linear scale from 0 to 0.5 between median and p95.
            # amount > median here, so p95 > median and the divisor is non-zero.
            return ((amount - median) / (p95 - median)) * 0.5
        if amount <= p99:
            # Linear scale from 0.5 to 0.8 between p95 and p99
            return 0.5 + ((amount - p95) / (p99 - p95)) * 0.3

        # Above p99: scale towards 1.0 but cap.
        if p99 <= 0:
            # Relative excess over a non-positive p99 is undefined (division
            # by zero, or a score that falls as the amount grows); treat any
            # amount above it as maximally unusual.
            return 1.0
        excess = (amount - p99) / p99
        return min(0.8 + excess * 0.2, 1.0)

    def velocity_score(self, sender_id, current_timestamp, corridor, window_hours=24):
        """
        Calculate normalised transaction velocity.
        Compares sender's recent activity to corridor norms.

        Counts the sender's recorded transactions in the window
        [current_timestamp - window_hours, current_timestamp) and compares
        that count to the corridor's expected count for a window of the same
        length. Unknown corridors score a neutral 0.5.
        """
        profile = self.profiler.get_profile(corridor)
        if not profile:
            return 0.5

        # .get() rather than indexing so that scoring an unseen sender does
        # not create an entry in the defaultdict.
        history = self.sender_history.get(sender_id, {'transactions': []})

        # Count transactions in window (current transaction excluded)
        window_start = current_timestamp - timedelta(hours=window_hours)
        recent_count = sum(
            1 for txn in history['transactions']
            if window_start <= txn['timestamp'] < current_timestamp
        )

        # Normalise against corridor median
        corridor_median = profile['velocity']['median_txn_per_sender']

        # The corridor median is a per-sender count over the profiled period,
        # treated as roughly 30 days. Scale the daily expectation to the
        # requested window so a non-default window_hours is judged against a
        # baseline of matching length (unchanged for the default 24 hours).
        expected_daily = corridor_median / 30  # Rough daily expectation
        expected_in_window = expected_daily * (window_hours / 24)

        if recent_count <= expected_in_window:
            return 0.0
        if recent_count <= expected_in_window * 3:
            return 0.3
        if recent_count <= expected_in_window * 5:
            return 0.6
        return min(0.6 + (recent_count - expected_in_window * 5) * 0.1, 1.0)

    def temporal_anomaly_score(self, timestamp, corridor):
        """
        Score how unusual the transaction timing is for this corridor.

        Peak hours score 0.0; other hours score 0.1, 0.3 or 0.5 as their
        share of the corridor's traffic drops. Unknown corridors score 0.0.
        """
        profile = self.profiler.get_profile(corridor)
        if not profile:
            return 0.0

        hour = timestamp.hour
        peak_hours = profile['temporal']['peak_hours']
        hour_dist = profile['temporal']['hour_distribution']

        # Check if hour is in peak hours
        if hour in peak_hours:
            return 0.0

        # Check how common this hour is; hours never observed in the corridor
        # fall back to 1%, which lands in the highest band below.
        hour_probability = hour_dist.get(hour, 0.01)

        if hour_probability > 0.05:  # >5% of transactions
            return 0.1
        if hour_probability > 0.02:
            return 0.3
        return 0.5

    def sender_maturity_score(self, sender_id, current_timestamp):
        """
        Score based on how established the sender is.
        New accounts are higher risk.

        Age is measured in whole days between the sender's first recorded
        transaction and current_timestamp.
        """
        history = self.sender_history.get(sender_id)

        if not history or not history['first_seen']:
            return 0.8  # New sender = elevated risk

        account_age_days = (current_timestamp - history['first_seen']).days

        if account_age_days < 7:
            return 0.7
        if account_age_days < 30:
            return 0.4
        if account_age_days < 90:
            return 0.2
        return 0.0

    def calculate_features(self, transaction):
        """
        Calculate all corridor-normalised features for a transaction.
        Returns dictionary of feature scores (all 0-1 range).

        Args:
            transaction: mapping or row with 'corridor', 'amount',
                'sender_id' and 'timestamp'.
        """
        corridor = transaction['corridor']
        sender_id = transaction['sender_id']
        timestamp = transaction['timestamp']

        return {
            'amount_deviation': self.amount_deviation_score(
                transaction['amount'], corridor
            ),
            'velocity': self.velocity_score(sender_id, timestamp, corridor),
            'temporal_anomaly': self.temporal_anomaly_score(timestamp, corridor),
            'sender_maturity': self.sender_maturity_score(sender_id, timestamp),
        }

# Initialise feature engine
# The engine reads corridor baselines from the profiler fitted above.
feature_engine = CorridorFeatureEngine(profiler)
# Sender histories are accumulated by appending, so this should run once per
# engine instance; a second call on the same engine would record every
# transaction again.
feature_engine._build_sender_history(transactions)

print('Feature engine initialised')

In [None]:
# Score every transaction and keep the identifying columns next to the scores
print('Calculating features for all transactions...')

feature_records = [
    {
        # Feature scores come first so the column order stays scores -> metadata
        **feature_engine.calculate_features(txn),
        'transaction_idx': txn_idx,
        'corridor': txn['corridor'],
        'corridor_name': txn['corridor_name'],
        'is_fraud': txn['is_fraud'],
        'amount': txn['amount'],
    }
    for txn_idx, txn in transactions.iterrows()
]

features_df = pd.DataFrame(feature_records)
print(f'Calculated features for {len(features_df):,} transactions')

## 3. Analyse Feature Distributions

Let's see how our normalised features compare between fraud and legitimate transactions.

In [None]:
# Overlay fraud and legitimate score histograms, one panel per feature
feature_cols = ['amount_deviation', 'velocity', 'temporal_anomaly', 'sender_maturity']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Split once up front; every panel reads its column from these two frames
fraud_rows = features_df[features_df['is_fraud']]
legit_rows = features_df[~features_df['is_fraud']]

for ax, feature in zip(axes, feature_cols):
    # Legitimate is drawn first so the colour assignment stays the same
    ax.hist(legit_rows[feature], bins=30, alpha=0.6, label='Legitimate', density=True)
    ax.hist(fraud_rows[feature], bins=30, alpha=0.6, label='Fraud', density=True)

    pretty_name = feature.replace("_", " ").title()
    ax.set_title(f'{pretty_name} Score Distribution')
    ax.set_xlabel('Score')
    ax.set_ylabel('Density')
    ax.legend()

plt.suptitle('Feature Score Distributions: Fraud vs Legitimate', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Mean score per feature for each class, plus the gap between the two means
print('=== Feature Statistics by Fraud Status ===\n')

fraud_mask = features_df['is_fraud']

for feature in feature_cols:
    legit_mean = features_df[~fraud_mask][feature].mean()
    fraud_mean = features_df[fraud_mask][feature].mean()
    separation = fraud_mean - legit_mean

    # The trailing empty entry reproduces the blank line between features
    report_lines = [
        f'{feature}:',
        f'  Legitimate mean: {legit_mean:.3f}',
        f'  Fraud mean:      {fraud_mean:.3f}',
        f'  Separation:      {separation:+.3f}',
        '',
    ]
    print('\n'.join(report_lines))

## 4. Compare: Global vs Corridor-Normalised Features

Let's demonstrate the improvement from corridor normalisation.

In [None]:
def _global_amount_thresholds(all_transactions):
    """Return (median, p95, p99) of the 'amount' column across all transactions."""
    amounts = all_transactions['amount']
    return amounts.median(), amounts.quantile(0.95), amounts.quantile(0.99)


def global_amount_score(amount, all_transactions, thresholds=None):
    """
    Calculate amount score using global (not corridor-specific) statistics.
    This is the 'naive' approach.

    Args:
        amount: transaction amount to score.
        all_transactions: DataFrame with an 'amount' column; only used to
            derive the thresholds when `thresholds` is not supplied.
        thresholds: optional precomputed (median, p95, p99) tuple. Pass it
            when scoring many amounts so the quantiles are not recomputed
            on every call.

    Returns:
        Score in the 0-1 range: 0 at or below the median, 0-0.5 up to p95,
        0.5-0.8 up to p99, then towards 1.0 (capped) above p99.
    """
    if thresholds is None:
        thresholds = _global_amount_thresholds(all_transactions)
    global_median, global_p95, global_p99 = thresholds

    if amount <= global_median:
        return 0.0
    if amount <= global_p95:
        return ((amount - global_median) / (global_p95 - global_median)) * 0.5
    if amount <= global_p99:
        return 0.5 + ((amount - global_p95) / (global_p99 - global_p95)) * 0.3

    if global_p99 <= 0:
        # Relative excess over a non-positive p99 is undefined; treat any
        # amount above it as maximally unusual instead of dividing by it.
        return 1.0
    excess = (amount - global_p99) / global_p99
    return min(0.8 + excess * 0.2, 1.0)

# Calculate global (non-normalised) amount scores.
# The thresholds depend only on the full dataset, so compute them once here
# rather than once per scored row.
global_thresholds = _global_amount_thresholds(transactions)
features_df['amount_deviation_global'] = features_df['amount'].apply(
    lambda x: global_amount_score(x, transactions, thresholds=global_thresholds)
)

print('Calculated global amount scores for comparison')

In [None]:
# ROC-AUC of the amount feature: global scaling vs corridor-normalised scaling
from sklearn.metrics import roc_auc_score

print('=== Feature Quality Comparison (ROC-AUC) ===\n')
print('Higher AUC = better separation between fraud and legitimate\n')

# Whole dataset first
fraud_labels = features_df['is_fraud']
global_auc = roc_auc_score(fraud_labels, features_df['amount_deviation_global'])
normalised_auc = roc_auc_score(fraud_labels, features_df['amount_deviation'])
auc_delta = normalised_auc - global_auc

print('Overall (all corridors):')
print(f'  Global amount score AUC:     {global_auc:.4f}')
print(f'  Normalised amount score AUC: {normalised_auc:.4f}')
print(f'  Improvement:                 {auc_delta * 100:+.2f}%\n')

# Then each corridor on its own
print('Per-corridor breakdown:')
for corridor_name in features_df['corridor_name'].unique():
    corridor_data = features_df[features_df['corridor_name'] == corridor_name]

    # Corridors with fewer than 5 fraud cases are left out of the breakdown
    if corridor_data['is_fraud'].sum() >= 5:
        corridor_labels = corridor_data['is_fraud']
        global_auc = roc_auc_score(corridor_labels, corridor_data['amount_deviation_global'])
        normalised_auc = roc_auc_score(corridor_labels, corridor_data['amount_deviation'])
        auc_delta = normalised_auc - global_auc

        print(f'  {corridor_name}: Global={global_auc:.3f}, Normalised={normalised_auc:.3f} ({auc_delta*100:+.1f}%)')

In [None]:
# Fraud-only amount scores per corridor: global scaling (left) vs normalised (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Each panel: (axis, score column, title)
panels = [
    (ax1, 'amount_deviation_global',
     'Global Amount Scores (Fraud Only)\nScores overlap significantly'),
    (ax2, 'amount_deviation',
     'Corridor-Normalised Scores (Fraud Only)\nMore consistent signal across corridors'),
]

for ax, score_column, panel_title in panels:
    # One histogram per corridor, drawn in order of first appearance
    for corridor_name in features_df['corridor_name'].unique():
        corridor_data = features_df[features_df['corridor_name'] == corridor_name]
        fraud_scores = corridor_data[corridor_data['is_fraud']][score_column]
        ax.hist(fraud_scores, bins=20, alpha=0.5, label=corridor_name, density=True)

    ax.set_title(panel_title)
    ax.set_xlabel('Score')
    ax.set_ylabel('Density')
    ax.legend()

plt.tight_layout()
plt.savefig('global_vs_normalised.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Create Combined Feature Score

For the next notebook, we'll prepare a simple combined score to use as a baseline.

In [None]:
# Baseline combination: every feature carries the same weight
baseline_weights = {
    'amount_deviation': 0.25,
    'velocity': 0.25,
    'temporal_anomaly': 0.25,
    'sender_maturity': 0.25,
}

# Weighted columns are added left to right in the order listed above
features_df['combined_score_baseline'] = sum(
    features_df[column] * weight for column, weight in baseline_weights.items()
)

# How well does the equal-weight score separate fraud from legitimate?
combined_auc = roc_auc_score(features_df['is_fraud'], features_df['combined_score_baseline'])
print(f'Combined score (equal weights) AUC: {combined_auc:.4f}')

In [None]:
# Persist the feature table (without the index) for the next notebook
features_path = 'transaction_features.csv'
features_df.to_csv(features_path, index=False)
print(f'Saved features to {features_path}')

## Summary

This notebook demonstrated:

1. **Corridor profiling**: Building statistical baselines for each payment route

2. **Normalised features**: Amount, velocity, temporal, and maturity scores calibrated to corridor norms

3. **Quality improvement**: Corridor-normalised features provide better fraud signal than global features

4. **Foundation for dynamic weighting**: Equal-weight combination is a baseline; next we'll optimise weights per corridor

Next notebook: **Dynamic Signal Weighting** — learning optimal feature weights for each corridor.