In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import joblib
from scipy import stats
from scipy.interpolate import interp1d
from scipy.signal import find_peaks

# Set device
# Use CUDA when it is available, otherwise fall back to the CPU. The tensors
# built at module level below are moved onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load preprocessing artifacts
# NOTE(review): joblib.load unpickles whatever object is stored in the file,
# so these artifacts must come from a trusted source.
# - scaler: used further down for scaler.inverse_transform on the numeric columns.
# - cat_vocab: indexed by categorical column name; each value is later passed
#   to nn.Embedding as the number of embeddings for that column.
# - cat_mapping: loaded here but not referenced in the visible part of the file.
scaler = joblib.load(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\processed\standard_scaler.pkl")
cat_vocab = joblib.load(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\processed\cat_vocab.pkl")
cat_mapping = joblib.load(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\processed\cat_mapping.pkl")
print("Loaded StandardScaler and categorical artifacts.")

# Define features (same as in preprocessing)
# The order of numeric_features fixes the column order of X_train_num and of
# every numeric tensor derived from it; cat_features fixes the column order of
# the categorical index tensor built in generate_synthetic_fraud.
numeric_features = ['amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long',
                   'age', 'trans_hour', 'trans_day', 'trans_month', 'trans_dayofweek']
cat_features = ['merchant', 'category', 'gender', 'street', 'city', 'state', 'zip', 'job']

# Load training data to get engineered feature ranges
X_train_df = pd.read_csv(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\processed\X_train.csv")
y_train_df = pd.read_csv(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\processed\y_train.csv")
# The label is taken from the first column of y_train.csv; rows equal to 1 are
# treated as fraud.
y_train = y_train_df.iloc[:, 0]
fraud_mask = (y_train == 1)
# Numeric columns of the fraud rows only, as a NumPy array.
# NOTE(review): presumably already standardized, since scaler.inverse_transform
# is applied to this array later in the file -- confirm against preprocessing.
X_train_num = X_train_df[numeric_features].loc[fraud_mask].values

# Engineered feature indices and min/max
eng_features = ['trans_hour', 'trans_day', 'trans_month', 'trans_dayofweek']
# Column positions of the engineered time features (and of 'amt') inside
# numeric_features / X_train_num.
eng_indices = [numeric_features.index(feat) for feat in eng_features]
amt_idx = numeric_features.index('amt')
# Per-column min and max over the fraud rows; generate_synthetic_fraud clips
# the generated time features to this range.
eng_min_np = np.min(X_train_num[:, eng_indices], axis=0)
eng_max_np = np.max(X_train_num[:, eng_indices], axis=0)
eng_min = torch.tensor(eng_min_np, dtype=torch.float32).to(device)
eng_max = torch.tensor(eng_max_np, dtype=torch.float32).to(device)

# Store real fraud amount distribution
# Independent copy of the 'amt' column of the fraud rows; used as the target
# sample when the generated amounts are quantile-matched.
real_fraud_amt_scaled = X_train_num[:, amt_idx].copy()

# Diffusion parameters
# T_train is the length of the noise schedule; it is also the divisor used by
# CombinedNoisePredictor.forward to normalize the timestep input.
T_train = 800
beta_start = 1e-4
beta_end = 0.02
# Linear schedule: T_train evenly spaced values from beta_start to beta_end.
beta = torch.linspace(beta_start, beta_end, T_train).to(device)
alpha = 1.0 - beta
# Running product of alpha over the timestep axis.
alpha_hat = torch.cumprod(alpha, dim=0)

# Function for getting empirical CDF
def get_cdf(data):
    """
    Return the empirical CDF of a 1-D sample.

    Returns a pair ``(ordered, cum_prob)``: ``ordered`` is ``data`` sorted in
    ascending order and ``cum_prob[i]`` equals ``(i + 1) / n``, where ``n`` is
    the number of observations.
    """
    ordered = np.sort(data)
    n_obs = len(ordered)
    cum_prob = np.arange(1, n_obs + 1) / n_obs
    return ordered, cum_prob

# Distribution matching function
def get_distribution_transform_function(source_vals, target_vals):
    """
    Build a quantile-matching transform from a source sample to a target sample.

    The returned callable maps a value to its empirical quantile in
    ``source_vals`` and then to the value sitting at that quantile in
    ``target_vals`` (linear interpolation on both steps).

    Args:
        source_vals: 1-D array-like sample defining the source distribution.
        target_vals: 1-D array-like sample defining the target distribution.

    Returns:
        ``transform(vals)``: accepts a scalar or array-like and returns the
        matched values as float64, with the same shape as ``vals``. Inputs
        below the smallest source value map to quantile 0 and inputs above the
        largest map to quantile 1, so outputs always lie within
        ``[min(target_vals), max(target_vals)]``.

    Raises:
        ValueError: if either sample is empty.
    """
    source_arr = np.asarray(source_vals, dtype=np.float64).ravel()
    target_arr = np.asarray(target_vals, dtype=np.float64).ravel()
    if source_arr.size == 0 or target_arr.size == 0:
        raise ValueError("source_vals and target_vals must be non-empty")

    # Collapse tied source values into a single knot carrying the cumulative
    # share of observations <= that value. Repeated x-coordinates (for example
    # many samples clamped to the same bound) otherwise yield a zero-width
    # interpolation interval and NaN quantiles.
    source_knots, knot_counts = np.unique(source_arr, return_counts=True)
    source_cdf = np.cumsum(knot_counts) / source_arr.size

    # Quantile function of the target: the i-th smallest target value sits at
    # cumulative probability (i + 1) / n, which is strictly increasing.
    target_sorted = np.sort(target_arr)
    target_cdf = np.arange(1, target_sorted.size + 1) / target_sorted.size

    def transform(vals):
        vals = np.asarray(vals, dtype=np.float64)
        quantiles = np.interp(vals, source_knots, source_cdf, left=0.0, right=1.0)
        return np.interp(
            quantiles,
            target_cdf,
            target_sorted,
            left=target_sorted[0],
            right=target_sorted[-1],
        )

    return transform

def match_distribution(values, target_values):
    """
    Quantile-match ``values`` onto the distribution of ``target_values``.

    The mapping is fitted with ``values`` itself as the source sample and is
    then applied to those same ``values``; the remapped array is returned.
    """
    return get_distribution_transform_function(values, target_values)(values)

# Create cyclic features for generation
def create_cyclic_features(data, feature_indices, periods):
    """
    Encode periodic columns of ``data`` as (sin, cos) pairs.

    For the k-th entry of ``feature_indices`` the column is converted to an
    angle ``2*pi*value/period`` using the k-th entry of ``periods``; its sine
    is written to output column ``2k`` and its cosine to column ``2k + 1``.

    Returns an array of shape ``(data.shape[0], 2 * len(feature_indices))``.
    """
    encoded = np.zeros((data.shape[0], 2 * len(feature_indices)))

    sin_col = 0
    for column, cycle_length in zip(feature_indices, periods):
        # Map the raw value onto the unit circle for this feature's period.
        angle = 2 * np.pi * data[:, column] / cycle_length
        encoded[:, sin_col] = np.sin(angle)
        encoded[:, sin_col + 1] = np.cos(angle)
        sin_col += 2

    return encoded

# Define periods for each time feature
# Listed in the same order as eng_features (hour, day, month, day-of-week) so
# that each engineered column is paired with its own period below.
hour_period = 24.0
day_period = 31.0
month_period = 12.0
dow_period = 7.0
periods = [hour_period, day_period, month_period, dow_period]

# Original scale data for creating cyclic features
# The fraud rows are mapped back through the scaler first, so the sin/cos
# encoding is computed from the inverse-transformed values rather than from
# the values stored in X_train_num.
X_train_num_original = scaler.inverse_transform(X_train_num)
# One (sin, cos) pair per engineered feature, one row per fraud row.
cyclic_fraud = create_cyclic_features(
    X_train_num_original, 
    [numeric_features.index(feat) for feat in eng_features], 
    periods
)

# Define the model
class CombinedNoisePredictor(nn.Module):
    """
    MLP noise predictor over numeric, categorical and cyclic inputs.

    Each categorical column is embedded, the embeddings are concatenated with
    the numeric and cyclic inputs and a normalized timestep, and the result is
    passed through four linear layers with two scaled skip connections. The
    output width equals the concatenated input width without the timestep.

    The forward pass reads the module-level ``T_train`` to normalize ``t``.
    """

    def __init__(self, num_input_dim, cat_vocab_sizes, cyclic_dim=8, cat_embed_dim=4, hidden_dim=256):
        super().__init__()

        # One embedding table per categorical column, registered in the
        # iteration order of cat_vocab_sizes; forward() relies on that order to
        # pair tables with the columns of x_cat.
        self.embeddings = nn.ModuleDict()
        for column_name, vocab_size in cat_vocab_sizes.items():
            self.embeddings[column_name] = nn.Embedding(vocab_size, cat_embed_dim)

        feature_width = num_input_dim + len(cat_vocab_sizes) * cat_embed_dim + cyclic_dim

        # The first layer takes one extra input: the normalized timestep.
        self.fc1 = nn.Linear(feature_width + 1, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, feature_width)

        self.norm1 = nn.LayerNorm(hidden_dim)
        self.norm2 = nn.LayerNorm(hidden_dim)
        self.norm3 = nn.LayerNorm(hidden_dim)

        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)

        # Xavier-uniform initialization of every linear weight matrix.
        for linear in (self.fc1, self.fc2, self.fc3, self.fc4):
            nn.init.xavier_uniform_(linear.weight)

    def _residual_block(self, h, linear, norm):
        """Apply linear -> activation -> norm -> dropout and add 0.1 * input."""
        skip = h
        h = self.dropout(norm(self.activation(linear(h))))
        return h + 0.1 * skip

    def forward(self, x_num, x_cat, x_cyclic, t):
        """
        Predict noise for the concatenated feature vector.

        ``x_cat[:, i]`` is looked up in the i-th registered embedding table and
        ``t`` is divided by ``T_train`` before being appended as one extra
        input column.
        """
        embedded = [
            table(x_cat[:, position])
            for position, table in enumerate(self.embeddings.values())
        ]
        features = torch.cat([x_num, torch.cat(embedded, dim=1), x_cyclic], dim=1)

        t_norm = t.unsqueeze(1).float() / T_train
        x_input = torch.cat([features, t_norm], dim=1)

        h = self.dropout(self.norm1(self.activation(self.fc1(x_input))))
        h = self._residual_block(h, self.fc2, self.norm2)
        h = self._residual_block(h, self.fc3, self.norm3)

        return self.fc4(h)

# Instantiate the model with the same parameters
# Built by iterating cat_features, so the embedding tables are registered in
# the same order as the columns of the categorical tensor fed to the model.
cat_vocab_sizes = {col: cat_vocab[col] for col in cat_features}
num_input_dim = len(numeric_features)
# Two cyclic inputs (sin and cos) per engineered time feature.
cyclic_dim = len(eng_features) * 2

model = CombinedNoisePredictor(
    num_input_dim=num_input_dim,
    cat_vocab_sizes=cat_vocab_sizes,
    cyclic_dim=cyclic_dim,
    cat_embed_dim=4,
    hidden_dim=256
).to(device)

# Load the trained model weights
# The architecture arguments above must match the checkpoint, otherwise
# load_state_dict will reject it.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints
# and consider passing weights_only=True if the installed torch supports it.
model.load_state_dict(torch.load(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\baseline_improved_v7.pth", 
                               map_location=device))
print("Model loaded successfully!")

# Define the function to generate synthetic fraud (same as original)
def generate_synthetic_fraud(model, num_samples, T_gen=600):
    """
    Generate synthetic fraud rows with the trained noise predictor.

    Steps:
      1. Draw a uniformly random category index for every categorical column.
      2. Initialize the numeric features from randomly chosen real fraud rows
         plus Gaussian jitter, and reuse the cyclic encodings of those rows.
      3. Overwrite the initial 'amt' column: if a KDE of the real fraud
         amounts shows at least two peaks, sample around the lowest and the
         highest peak (10% / 90% split); otherwise sample from the lower and
         upper thirds of the sorted real amounts.
      4. Run the reverse diffusion update for t = T_gen - 1, ..., 1 on the
         numeric features only (categorical and cyclic inputs stay fixed).
      5. Clip the engineered time features to the range seen in the fraud
         rows and quantile-match 'amt' to the real fraud amounts.

    Relies on module-level state: ``device``, ``cat_vocab_sizes``,
    ``cat_features``, ``numeric_features``, ``num_input_dim``, ``X_train_num``,
    ``cyclic_fraud``, ``beta``, ``alpha``, ``alpha_hat``, ``eng_indices``,
    ``eng_min``, ``eng_max``, ``real_fraud_amt_scaled`` and
    ``match_distribution``.

    Args:
        model: callable as ``model(x_num, x_cat, x_cyclic, t)``; the first
            ``num_input_dim`` output columns are used as the predicted noise
            for the numeric features. It is switched to eval mode and left
            that way.
        num_samples: number of synthetic rows to generate.
        T_gen: timestep the reverse process starts below; must not exceed the
            length of the noise schedule.

    Returns:
        ``(numeric, categorical)``: a float tensor of shape
        ``(num_samples, num_input_dim)`` in the same scale as ``X_train_num``
        and a long tensor of shape ``(num_samples, len(cat_features))`` holding
        category indices.

    Raises:
        ValueError: if ``T_gen`` is larger than the noise schedule.
    """
    # The reverse loop indexes beta/alpha/alpha_hat with every step up to
    # T_gen - 1; fail early with a clear message instead of an out-of-range
    # indexing error in the middle of generation.
    schedule_len = beta.shape[0]
    if T_gen > schedule_len:
        raise ValueError(
            f"T_gen ({T_gen}) must not exceed the noise schedule length ({schedule_len})"
        )

    model.eval()
    with torch.no_grad():
        # Create categorical samples
        cat_samples = {}
        for col in cat_vocab_sizes:
            vocab_size = cat_vocab_sizes[col]
            cat_samples[col] = torch.randint(0, vocab_size, (num_samples,), device=device, dtype=torch.long)
        # Stack in cat_features order so columns line up with the model's embeddings.
        x_cat = torch.stack([cat_samples[col] for col in cat_features], dim=1)

        X_fraud_tensor = torch.tensor(X_train_num, dtype=torch.float32).to(device)

        # Better initialization with bimodal distribution awareness:
        # half of the seed rows come from the first half of the fraud rows and
        # the rest from the second half.
        idx1 = torch.randint(0, X_fraud_tensor.shape[0] // 2, (num_samples // 2,), device=device)
        idx2 = torch.randint(X_fraud_tensor.shape[0] // 2, X_fraud_tensor.shape[0], (num_samples - num_samples // 2,), device=device)
        idx = torch.cat([idx1, idx2])

        # Jitter each seed row by 0.3 standard deviations of its column.
        noise = torch.randn(num_samples, num_input_dim).to(device) * 0.3
        x_t_num = X_fraud_tensor[idx] + X_fraud_tensor.std(dim=0, keepdim=True) * noise

        # Cyclic encodings are taken from the same seed rows and stay fixed.
        X_cyclic_fraud_tensor = torch.tensor(cyclic_fraud, dtype=torch.float32).to(device)
        x_cyclic = X_cyclic_fraud_tensor[idx]

        # Enhanced handling for amount to better match bimodal distribution
        amt_idx = numeric_features.index('amt')

        fraud_amts = X_fraud_tensor[:, amt_idx]

        # Locate modes of the real amount distribution on a 1000-point grid.
        fraud_amts_np = fraud_amts.cpu().numpy()
        kde = stats.gaussian_kde(fraud_amts_np)
        x_grid = np.linspace(fraud_amts_np.min(), fraud_amts_np.max(), 1000)
        kde_values = kde(x_grid)

        # Ignore bumps lower than 5% of the tallest density value.
        peaks, _ = find_peaks(kde_values, height=0.05*kde_values.max())
        peak_x = x_grid[peaks]
        print(f"Found {len(peak_x)} peaks in amount distribution at: {peak_x}")

        if len(peak_x) >= 2:
            peak_x = sorted(peak_x)

            # 90% of the samples start near the highest peak, 10% near the lowest.
            high_peak_samples = int(num_samples * 0.9)
            low_peak_samples = num_samples - high_peak_samples

            low_peak = torch.tensor(peak_x[0], device=device)
            high_peak = torch.tensor(peak_x[-1], device=device)

            # Spread around each peak is proportional to the gap between them.
            peak_distance = high_peak - low_peak
            low_noise = torch.randn(low_peak_samples, device=device) * (0.10 * peak_distance)
            high_noise = torch.randn(high_peak_samples, device=device) * (0.08 * peak_distance)

            low_amts = low_peak + low_noise
            high_amts = high_peak + high_noise

            # Shuffle so the two groups are not ordered by row; the permutation
            # is created on the same device as the tensor it indexes.
            amt_values = torch.cat([low_amts, high_amts])
            perm = torch.randperm(num_samples, device=device)
            amt_values = amt_values[perm]

            x_t_num[:, amt_idx] = amt_values
        else:
            # Fallback: 10% from the lowest third and 90% from the highest
            # third of the sorted real amounts, with small additive noise.
            sorted_amts, _ = torch.sort(fraud_amts)
            n = sorted_amts.size(0)

            lower_idx = torch.randint(0, n // 3, (num_samples // 10,), device=device)
            upper_idx = torch.randint(2 * n // 3, n, (num_samples - num_samples // 10,), device=device)

            amt_indices = torch.cat([lower_idx, upper_idx])
            amt_noise = torch.randn(num_samples, device=device) * 0.08

            x_t_num[:, amt_idx] = sorted_amts[amt_indices] + amt_noise

        # Reverse diffusion process
        for t_step in reversed(range(1, T_gen)):
            t = torch.full((num_samples,), t_step, device=device, dtype=torch.long)

            # Only the numeric slice of the prediction is used for the update.
            pred_noise = model(x_t_num, x_cat, x_cyclic, t)
            pred_noise_numeric = pred_noise[:, :num_input_dim]

            pred_noise_numeric = torch.clamp(pred_noise_numeric, -5.0, 5.0)

            beta_t = beta[t].unsqueeze(1)
            sqrt_alpha_t = torch.sqrt(alpha[t]).unsqueeze(1)
            sqrt_one_minus_alpha_hat_t = torch.sqrt(1 - alpha_hat[t]).unsqueeze(1)

            # Injected noise is linearly damped over the last 200 steps and
            # switched off entirely at the final step.
            noise_scale = torch.sqrt(beta_t)
            if t_step < 200:
                noise_scale = noise_scale * (t_step / 200.0)

            z = torch.randn_like(x_t_num) * noise_scale if t_step > 1 else torch.zeros_like(x_t_num)

            # Denoising update; the 1e-8 terms guard against division by zero.
            x_t_num = (x_t_num - (beta_t / (sqrt_one_minus_alpha_hat_t + 1e-8)) * pred_noise_numeric) / (sqrt_alpha_t + 1e-8) + z

            # Keep the state bounded so one bad step cannot blow up the chain.
            x_t_num = torch.clamp(x_t_num, -10.0, 10.0)

        # Clip engineered features to observed range
        x_t_num_clipped = x_t_num.clone()
        x0_est_eng = x_t_num[:, eng_indices]
        x0_est_eng = torch.max(torch.min(x0_est_eng, eng_max.unsqueeze(0)), eng_min.unsqueeze(0))
        x_t_num_clipped[:, eng_indices] = x0_est_eng

        # Post-processing step to directly fix amount distribution
        syn_amt_values = x_t_num_clipped[:, amt_idx].cpu().numpy()

        # Quantile-match the generated amounts onto the real fraud amounts.
        transformed_amt = match_distribution(syn_amt_values, real_fraud_amt_scaled)

        x_t_num_clipped[:, amt_idx] = torch.tensor(transformed_amt, dtype=torch.float32).to(device)

        return x_t_num_clipped, x_cat

# Generate synthetic samples
num_synthetic = 8000
print("Generating synthetic samples...")
synthetic_num_norm, synthetic_cat = generate_synthetic_fraud(model, num_synthetic)
print("Synthetic numeric samples shape (normalized):", synthetic_num_norm.shape)
print("Synthetic categorical samples shape:", synthetic_cat.shape)

# IMPORTANT CHANGE: Skip inverse transformation to keep the same scale as X_train
synthetic_num_norm_np = synthetic_num_norm.cpu().numpy()

# Create dataframe directly from normalized values
# Numeric columns first, then the categorical columns, which hold the integer
# category indices drawn by the generator.
synthetic_numeric_df = pd.DataFrame(synthetic_num_norm_np, columns=numeric_features)
synthetic_cat_df = pd.DataFrame(synthetic_cat.cpu().numpy(), columns=cat_features)
synthetic_full_df = pd.concat([synthetic_numeric_df, synthetic_cat_df], axis=1)

# Save the normalized synthetic data (same scale as X_train)
synthetic_full_df.to_csv(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\synthetic_fraud_v7_normalized_8kpoints.csv", index=False)
print("Normalized synthetic data saved to CSV - now in the SAME SCALE as X_train")

# If you also want the original scale version for reference
# Only the numeric columns go through the scaler; the categorical index
# columns are appended unchanged.
synthetic_num_original = scaler.inverse_transform(synthetic_num_norm_np)
synthetic_original_df = pd.DataFrame(synthetic_num_original, columns=numeric_features)
synthetic_full_original_df = pd.concat([synthetic_original_df, synthetic_cat_df], axis=1)
synthetic_full_original_df.to_csv(r"D:\DS Northeastern\DS 5500 - Capstone\FraudFusion\Data\synthetic_fraud_v7_original_8kpoints.csv", index=False)
print("Original scale synthetic data also saved for reference")

# Quick verification
# Compares summary statistics of the real fraud rows (X_train_num contains
# fraud rows only) with those of the synthetic numeric columns. Because the
# reference is the fraud subset rather than the full training set, its column
# means need not be close to 0.
print("\nVerification that X_train and synthetic_fraud_v7_normalized are in the same scale:")
# Cast to float32 via a tensor so both tables are summarized at the same precision.
X_fraud_tensor = torch.tensor(X_train_num, dtype=torch.float32)
X_train_subset = pd.DataFrame(X_fraud_tensor.numpy(), columns=numeric_features).describe()
synthetic_subset = synthetic_numeric_df.describe()

print("\nX_train statistics (normalized):")
print(X_train_subset)
print("\nSynthetic data statistics (normalized):")
print(synthetic_subset)

print("\nBoth should show similar scales - generally mean near 0 and std near 1 for normalized features")

Using device: cuda
Loaded StandardScaler and categorical artifacts.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Model loaded successfully!
Generating synthetic samples...
Found 4 peaks in amount distribution at: [-0.82075799 -0.50234721  1.72147407  2.53519049]
Synthetic numeric samples shape (normalized): torch.Size([8000, 11])
Synthetic categorical samples shape: torch.Size([8000, 8])
Normalized synthetic data saved to CSV - now in the SAME SCALE as X_train
Original scale synthetic data also saved for reference

Verification that X_train and synthetic_fraud_v7_normalized are in the same scale:

X_train statistics (normalized):
               amt          lat         long     city_pop    merch_lat  \
count  6273.000000  6273.000000  6273.000000  6273.000000  6273.000000   
mean      1.569536     0.035681     0.003129     0.027991     0.033742   
std       1.263539     1.006742     1.033924     0.996909     1.006928   
min      -2.180321    -3.650266    -5.487712    -2.113985    -3.795280   
25%       1.512012    -0.699740    -0.477387    -0.685151    -0.689590   
50%       1.883656     0.180715