# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# Seeds NumPy's global random state, which generate_transaction_data() draws
# from, so the synthetic dataset is the same on every run.
np.random.seed(42)

# Generate sample transaction data
def generate_transaction_data(n_transactions=10000):
    """
    Generate synthetic transaction data with fraud indicators.

    Every value is drawn from NumPy's global random state, so call
    np.random.seed(...) beforehand for reproducible output. Roughly 5% of
    rows are labelled as fraud, and each feature is sampled from a
    fraud-specific or legitimate-specific distribution.

    Parameters:
    - n_transactions: number of rows to generate. Zero (or a negative
      value) yields an empty DataFrame that still carries every column.

    Returns:
    - DataFrame with columns transaction_id, amount, hour, device,
      location_risk, account_age, prev_transactions_24h and is_fraud
      (1 = fraud, 0 = legitimate).
    """
    columns = [
        'transaction_id', 'amount', 'hour', 'device', 'location_risk',
        'account_age', 'prev_transactions_24h', 'is_fraud',
    ]
    device_types = ['mobile', 'desktop', 'tablet']
    odd_hours = [0, 1, 2, 3, 22, 23]
    rows = []

    # NOTE: the order of the random draws inside each branch is significant;
    # reordering them changes the dataset produced for a given seed.
    for i in range(n_transactions):
        is_fraud = 1 if np.random.random() < 0.05 else 0  # 5% fraud rate

        if is_fraud:
            # Amount: mostly large, occasionally very small
            if np.random.random() < 0.7:
                amount = np.random.exponential(2000)
            else:
                amount = np.random.uniform(1, 10)
            # Hour: 60% of the time concentrated in late-night hours
            if np.random.random() < 0.6:
                hour = np.random.choice(odd_hours, p=[0.2, 0.2, 0.2, 0.2, 0.1, 0.1])
            else:
                hour = np.random.randint(0, 24)
            device = np.random.choice(device_types, p=[0.6, 0.3, 0.1])  # skewed to mobile
            location_risk = np.random.randint(6, 11)     # 6-10: higher risk locations
            account_age = np.random.randint(1, 30)       # 1-29 days: new accounts
            prev_transactions = np.random.randint(0, 3)  # 0-2: little recent activity
        else:
            amount = np.random.exponential(150)          # normal amounts
            # Hour: 70% of the time between 08:00 and 19:00
            if np.random.random() < 0.7:
                hour = np.random.choice(range(8, 20))
            else:
                hour = np.random.randint(0, 24)
            device = np.random.choice(device_types, p=[0.4, 0.5, 0.1])
            location_risk = np.random.randint(1, 6)      # 1-5: lower risk locations
            account_age = np.random.randint(30, 1000)    # 30-999 days: established
            prev_transactions = np.random.randint(1, 10) # 1-9: normal activity

        rows.append({
            'transaction_id': f"TXN_{i+1:06d}",
            'amount': round(amount, 2),
            'hour': hour,
            'device': device,
            'location_risk': location_risk,
            'account_age': account_age,
            'prev_transactions_24h': prev_transactions,
            'is_fraud': is_fraud,
        })

    # Passing the column list keeps the schema intact even when no rows were
    # generated, so callers can index columns without a KeyError.
    return pd.DataFrame(rows, columns=columns)

# Generate the dataset
df = generate_transaction_data(10000)
print("Dataset created successfully!")
print(f"Total transactions: {len(df)}")
print(f"Fraud transactions: {df['is_fraud'].sum()}")
print(f"Fraud rate: {df['is_fraud'].mean():.2%}")

# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Statistics:")
print(df.describe())

# normalize='columns' makes each is_fraud column sum to 1, i.e. the table
# shows the device mix within fraud and within non-fraud transactions.
print("\nFraud Distribution by Device:")
print(pd.crosstab(df['device'], df['is_fraud'], normalize='columns'))

# Create feature engineering function
def prepare_features(df, device_types=('desktop', 'mobile', 'tablet'),
                     high_amount_threshold=None):
    """
    Prepare features for the logistic regression model.

    Works on a copy of the input and adds one-hot device columns plus
    boolean time, amount and account-risk indicators.

    Parameters:
    - df: DataFrame with at least the columns device, hour, amount,
      account_age and prev_transactions_24h.
    - device_types: device values that always get a device_<name> column,
      even when absent from df. Without this, a single transaction or a
      small batch would lack some of the dummy columns the model expects.
      Any other device value found in df also gets its own column.
    - high_amount_threshold: amount above which high_amount is True. When
      None, the 95th percentile of df['amount'] is used; note that for a
      single row that percentile equals the row's own amount, so
      high_amount is then always False. Pass a fixed value to score new
      data consistently.

    Returns:
    - A new DataFrame: the input columns followed by the engineered ones.
      Calling the function on its own output overwrites the engineered
      columns rather than duplicating them.
    """
    df_processed = df.copy()

    # One-hot encode device type. Columns are assigned by name (instead of
    # concatenating get_dummies output) so the set of columns is fixed and
    # re-processing a frame never produces duplicate column names.
    observed_devices = df_processed['device'].dropna().unique().tolist()
    for device_type in sorted(set(device_types) | set(observed_devices)):
        df_processed[f'device_{device_type}'] = df_processed['device'] == device_type

    # Create time-based features
    hour = df_processed['hour']
    df_processed['is_night'] = (hour < 6) | (hour > 22)
    df_processed['is_business_hours'] = (hour >= 9) & (hour <= 17)

    # Create amount-based features
    amount = df_processed['amount']
    if high_amount_threshold is None:
        high_amount_threshold = amount.quantile(0.95)
    df_processed['log_amount'] = np.log1p(amount)
    df_processed['high_amount'] = amount > high_amount_threshold
    df_processed['low_amount'] = amount < 10

    # Create account risk features
    df_processed['new_account'] = df_processed['account_age'] < 30
    df_processed['low_activity'] = df_processed['prev_transactions_24h'] < 2

    return df_processed

# Prepare features
df_features = prepare_features(df)
print("Features prepared successfully!")
# shape[1] counts every column of df_features, including transaction_id,
# device and the is_fraud label, not only the model inputs listed below.
print(f"Number of features: {df_features.shape[1]}")

# Select features for modeling
feature_columns = [
    'amount', 'log_amount', 'hour', 'location_risk', 'account_age', 
    'prev_transactions_24h', 'device_desktop', 'device_mobile', 
    'device_tablet', 'is_night', 'is_business_hours', 'high_amount', 
    'low_amount', 'new_account', 'low_activity'
]

X = df_features[feature_columns]
y = df_features['is_fraud']

# Split the data
# 80/20 split; stratify=y keeps the fraud ratio the same in both parts.
# NOTE(review): prepare_features() ran on the full dataset above, so the
# 95th-percentile cut-off behind high_amount was computed with test rows
# included.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
# The scaler is fitted on the training rows only and then applied to the
# test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)

print("Model trained successfully!")
print(f"Training accuracy: {model.score(X_train_scaled, y_train):.3f}")
print(f"Test accuracy: {model.score(X_test_scaled, y_test):.3f}")

def score_transaction(transaction_data, model, scaler, feature_columns):
    """
    Score a single transaction or batch of transactions for fraud probability

    Parameters:
    - transaction_data: dict (one transaction) or DataFrame with the raw
      transaction columns that prepare_features() reads
    - model: trained logistic regression model (must provide predict_proba)
    - scaler: fitted StandardScaler
    - feature_columns: list of feature column names, in training order

    Returns:
    - fraud_probability: probability of fraud (0-1)
    - risk_category: 'Safe' (< 0.3), 'Suspicious' (< 0.7), or 'Likely Fraud'

    When exactly one row is scored the two values are returned as scalars;
    otherwise an array of probabilities and a list of categories are
    returned.
    """

    # Convert single transaction to DataFrame if needed
    if isinstance(transaction_data, dict):
        df_single = pd.DataFrame([transaction_data])
    else:
        df_single = transaction_data.copy()

    # Prepare features
    df_processed = prepare_features(df_single)

    # If the input already carried engineered columns, feature preparation
    # can leave two columns with the same name; keep the freshly computed
    # (last) one so the selection below yields one column per feature.
    df_processed = df_processed.loc[:, ~df_processed.columns.duplicated(keep='last')].copy()

    # One-hot device columns only exist for devices present in the input, so
    # a single transaction can lack some of them. An absent device means the
    # indicator is False for every row.
    for column in feature_columns:
        if str(column).startswith('device_') and column not in df_processed.columns:
            df_processed[column] = False

    # Select and scale features
    X_score = df_processed[feature_columns]
    X_score_scaled = scaler.transform(X_score)

    # Get fraud probabilities (column 1 is the positive class)
    fraud_probabilities = model.predict_proba(X_score_scaled)[:, 1]

    # Classify risk levels
    risk_categories = [
        'Safe' if prob < 0.3 else 'Suspicious' if prob < 0.7 else 'Likely Fraud'
        for prob in fraud_probabilities
    ]

    if len(fraud_probabilities) == 1:
        return fraud_probabilities[0], risk_categories[0]
    return fraud_probabilities, risk_categories

# Test the scoring function with a sample transaction
# The dict holds only the raw input columns; it has no transaction_id and no
# is_fraud label.
sample_transaction = {
    'amount': 2500.00,
    'hour': 2,
    'device': 'mobile',
    'location_risk': 8,
    'account_age': 5,
    'prev_transactions_24h': 1
}

# For a single transaction score_transaction() returns a scalar probability
# and a single category string.
fraud_prob, risk_category = score_transaction(sample_transaction, model, scaler, feature_columns)
print(f"Sample Transaction Scoring:")
print(f"Fraud Probability: {fraud_prob:.3f}")
print(f"Risk Category: {risk_category}")

# Score all transactions in the test set
# X_test.index holds index labels, so rows are selected with .loc; .iloc only
# happened to work because df has a default 0..n-1 index. The raw rows from
# df are passed in because score_transaction() performs the feature
# engineering itself; feeding it the already-engineered df_features rows
# would engineer them a second time.
fraud_probs, risk_categories = score_transaction(df.loc[X_test.index], model, scaler, feature_columns)

# Add results to test dataframe
# Same labels in the same order as above, so the scores line up row by row.
test_results = df_features.loc[X_test.index].copy()
test_results['fraud_probability'] = fraud_probs
test_results['predicted_risk'] = risk_categories
test_results['actual_fraud'] = test_results['is_fraud'].map({0: 'Not Fraud', 1: 'Fraud'})

print("All transactions scored successfully!")
print(f"Transactions processed: {len(test_results)}")

# Risk category distribution
print("Risk Category Distribution:")
# value_counts() orders the categories by descending count.
risk_dist = test_results['predicted_risk'].value_counts()
print(risk_dist)
print(f"\nPercentages:")
print((risk_dist / len(test_results) * 100).round(2))

# Cross-tabulation of predicted vs actual
print("\nPredicted Risk vs Actual Fraud:")
confusion_table = pd.crosstab(test_results['predicted_risk'], test_results['actual_fraud'])
print(confusion_table)

# Calculate precision for each risk category
# "Precision" here is the share of actual fraud among the rows assigned to
# each category (the mean of is_fraud); categories with no rows are skipped.
print("\nPrecision by Risk Category:")
for category in ['Safe', 'Suspicious', 'Likely Fraud']:
    subset = test_results[test_results['predicted_risk'] == category]
    if len(subset) > 0:
        precision = subset['is_fraud'].mean()
        print(f"{category}: {precision:.3f} ({precision:.1%})")

# Create detailed report for high-risk transactions
# "High risk" means every test row not classified as 'Safe', ordered from the
# highest fraud probability down.
high_risk_transactions = test_results[test_results['predicted_risk'].isin(['Suspicious', 'Likely Fraud'])].copy()
high_risk_transactions = high_risk_transactions.sort_values('fraud_probability', ascending=False)

print("HIGH RISK TRANSACTIONS REPORT")
print("=" * 50)
print(f"Total high-risk transactions: {len(high_risk_transactions)}")
print(f"Likely Fraud: {(high_risk_transactions['predicted_risk'] == 'Likely Fraud').sum()}")
print(f"Suspicious: {(high_risk_transactions['predicted_risk'] == 'Suspicious').sum()}")

# Display top 10 highest risk transactions
print("\nTop 10 Highest Risk Transactions:")
display_columns = ['transaction_id', 'amount', 'hour', 'device', 'location_risk', 
                  'account_age', 'fraud_probability', 'predicted_risk', 'actual_fraud']
# to_string(index=False) prints the table without the DataFrame index column.
print(high_risk_transactions[display_columns].head(10).to_string(index=False))

# Set up the plotting style
import os

plt.style.use('default')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Fraud Probability Analysis Dashboard', fontsize=16, fontweight='bold')

# 1. Histogram of fraud probabilities
# density=True normalises each class separately, so the much smaller fraud
# class stays visible next to the non-fraud class.
axes[0, 0].hist(test_results[test_results['is_fraud'] == 0]['fraud_probability'],
                bins=50, alpha=0.7, label='Not Fraud', color='green', density=True)
axes[0, 0].hist(test_results[test_results['is_fraud'] == 1]['fraud_probability'],
                bins=50, alpha=0.7, label='Fraud', color='red', density=True)
axes[0, 0].set_xlabel('Fraud Probability')
axes[0, 0].set_ylabel('Density')
axes[0, 0].set_title('Distribution of Fraud Probabilities')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot by risk category
risk_order = ['Safe', 'Suspicious', 'Likely Fraud']
box_data = [test_results[test_results['predicted_risk'] == cat]['fraud_probability']
            for cat in risk_order]
# Tick labels are set separately instead of through boxplot's `labels`
# keyword, which newer matplotlib releases deprecate in favour of
# `tick_labels`; set_xticklabels works on old and new versions alike.
bp = axes[0, 1].boxplot(box_data, patch_artist=True)
axes[0, 1].set_xticklabels(risk_order)
colors = ['green', 'orange', 'red']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[0, 1].set_ylabel('Fraud Probability')
axes[0, 1].set_title('Fraud Probability by Risk Category')
axes[0, 1].grid(True, alpha=0.3)

# 3. ROC-like curve showing probability thresholds
# Sweeps 100 evenly spaced cut-offs and records the true/false positive rate
# obtained when every probability >= cut-off is treated as fraud.
thresholds = np.linspace(0, 1, 100)
true_positive_rates = []
false_positive_rates = []

for threshold in thresholds:
    predictions = (test_results['fraud_probability'] >= threshold).astype(int)
    tp = ((predictions == 1) & (test_results['is_fraud'] == 1)).sum()
    fp = ((predictions == 1) & (test_results['is_fraud'] == 0)).sum()
    tn = ((predictions == 0) & (test_results['is_fraud'] == 0)).sum()
    fn = ((predictions == 0) & (test_results['is_fraud'] == 1)).sum()

    # Guard against a class being absent from the test set
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    true_positive_rates.append(tpr)
    false_positive_rates.append(fpr)

axes[1, 0].plot(false_positive_rates, true_positive_rates, 'b-', linewidth=2)
axes[1, 0].plot([0, 1], [0, 1], 'r--', alpha=0.5)  # chance diagonal
axes[1, 0].set_xlabel('False Positive Rate')
axes[1, 0].set_ylabel('True Positive Rate')
axes[1, 0].set_title('ROC Curve')
axes[1, 0].grid(True, alpha=0.3)

# 4. Feature importance
# Absolute coefficients of the model fitted on standardised inputs, sorted
# ascending so the largest bar ends up at the top of the horizontal chart.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': abs(model.coef_[0])
}).sort_values('importance', ascending=True)

axes[1, 1].barh(range(len(feature_importance)), feature_importance['importance'])
axes[1, 1].set_yticks(range(len(feature_importance)))
axes[1, 1].set_yticklabels(feature_importance['feature'])
axes[1, 1].set_xlabel('Absolute Coefficient Value')
axes[1, 1].set_title('Feature Importance')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('visualizations', exist_ok=True)
plt.savefig('visualizations/fraud_analysis_dashboard.png', dpi=300, bbox_inches='tight')
plt.show()

# Create risk category analysis plots
import os

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Risk Category Analysis', fontsize=16, fontweight='bold')

# Colours are looked up by category name. value_counts() orders by count and
# groupby() orders alphabetically, so pairing a fixed colour list with either
# result by position would colour the wrong categories.
risk_order = ['Safe', 'Suspicious', 'Likely Fraud']
risk_colors = {'Safe': 'green', 'Suspicious': 'orange', 'Likely Fraud': 'red'}
colors = [risk_colors[cat] for cat in risk_order]

# 1. Risk category counts
risk_counts = test_results['predicted_risk'].value_counts()
# Only categories that actually occur are drawn, in a fixed order.
pie_categories = [cat for cat in risk_order if cat in risk_counts.index]
risk_counts = risk_counts.reindex(pie_categories)
axes[0, 0].pie(risk_counts.values, labels=risk_counts.index, autopct='%1.1f%%',
               colors=[risk_colors[cat] for cat in pie_categories], startangle=90)
axes[0, 0].set_title('Distribution of Risk Categories')

# 2. Fraud rate by risk category
fraud_rates = test_results.groupby('predicted_risk')['is_fraud'].mean()
bar_categories = [cat for cat in risk_order if cat in fraud_rates.index]
fraud_rates = fraud_rates.reindex(bar_categories)
bars = axes[0, 1].bar(fraud_rates.index, fraud_rates.values,
                      color=[risk_colors[cat] for cat in bar_categories])
axes[0, 1].set_ylabel('Fraud Rate')
axes[0, 1].set_title('Actual Fraud Rate by Predicted Risk Category')
axes[0, 1].set_ylim(0, 1)
# Print the rate just above each bar
for bar, rate in zip(bars, fraud_rates.values):
    axes[0, 1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{rate:.1%}', ha='center', va='bottom')

# 3. Amount distribution by risk category
for category in risk_order:
    data = test_results[test_results['predicted_risk'] == category]['amount']
    axes[1, 0].hist(data, bins=30, alpha=0.7, label=category, color=risk_colors[category])
axes[1, 0].set_xlabel('Transaction Amount ($)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].set_title('Transaction Amount Distribution by Risk Category')
axes[1, 0].legend()
axes[1, 0].set_xlim(0, 1000)  # Focus on typical amounts

# 4. Time of day analysis
hour_risk = test_results.groupby('hour')['fraud_probability'].mean()
axes[1, 1].plot(hour_risk.index, hour_risk.values, 'bo-', linewidth=2, markersize=6)
axes[1, 1].set_xlabel('Hour of Day')
axes[1, 1].set_ylabel('Average Fraud Probability')
axes[1, 1].set_title('Fraud Risk by Time of Day')
axes[1, 1].grid(True, alpha=0.3)
axes[1, 1].set_xticks(range(0, 24, 4))

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('visualizations', exist_ok=True)
plt.savefig('visualizations/risk_category_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Calculate comprehensive performance metrics
from sklearn.metrics import precision_recall_curve, auc

# Binary predictions using 0.5 threshold
# This single cut-off is independent of the 0.3 / 0.7 bands used for the
# Safe / Suspicious / Likely Fraud categories.
y_pred_binary = (test_results['fraud_probability'] >= 0.5).astype(int)

print("MODEL PERFORMANCE METRICS")
print("=" * 40)

# Classification report
print("\nClassification Report:")
print(classification_report(test_results['is_fraud'], y_pred_binary))

# Confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(test_results['is_fraud'], y_pred_binary)
print(cm)

# Calculate precision-recall curve
# Rebinds the module-level names `precision` and `thresholds` that earlier
# cells used for the per-category loop and the ROC threshold sweep.
precision, recall, thresholds = precision_recall_curve(test_results['is_fraud'], 
                                                      test_results['fraud_probability'])
pr_auc = auc(recall, precision)

print(f"\nPrecision-Recall AUC: {pr_auc:.3f}")

# Business impact analysis
# A transaction counts as "flagged" when its category is anything other than
# 'Safe', i.e. Suspicious or Likely Fraud.
total_transactions = len(test_results)
flagged_transactions = (test_results['predicted_risk'] != 'Safe').sum()
actual_fraud_caught = test_results[(test_results['predicted_risk'] != 'Safe') & 
                                  (test_results['is_fraud'] == 1)].shape[0]
total_fraud = test_results['is_fraud'].sum()

print(f"\nBUSINESS IMPACT ANALYSIS")
print(f"Total transactions analyzed: {total_transactions:,}")
print(f"Transactions flagged for review: {flagged_transactions:,} ({flagged_transactions/total_transactions:.1%})")
# NOTE(review): the two ratios below divide by total_fraud and
# flagged_transactions without guarding against either being zero.
print(f"Actual fraud cases caught: {actual_fraud_caught}/{total_fraud} ({actual_fraud_caught/total_fraud:.1%})")
print(f"Review efficiency: {actual_fraud_caught/flagged_transactions:.1%} fraud rate in flagged transactions")

class FraudDetectionSystem:
    """
    Fraud scoring wrapper around a fitted model and scaler.

    Scores transactions one at a time, maps the fraud probability to a risk
    category and a recommended action, and keeps an in-memory log of every
    result for the daily summary.
    """

    def __init__(self, model, scaler, feature_columns,
                 safe_threshold=0.3, fraud_threshold=0.7):
        """
        Parameters:
        - model: trained classifier providing predict_proba
        - scaler: fitted scaler providing transform
        - feature_columns: list of feature column names, in training order
        - safe_threshold: probabilities below this are 'Safe'
        - fraud_threshold: probabilities at or above this are 'Likely Fraud';
          anything in between is 'Suspicious'
        """
        self.model = model
        self.scaler = scaler
        self.feature_columns = feature_columns
        self.safe_threshold = safe_threshold
        self.fraud_threshold = fraud_threshold
        self.transaction_log = []

    def _classify(self, fraud_prob):
        """
        Map a fraud probability to (risk_category, recommended_action)
        """
        if fraud_prob < self.safe_threshold:
            return 'Safe', 'Approve'
        if fraud_prob < self.fraud_threshold:
            return 'Suspicious', 'Manual Review'
        return 'Likely Fraud', 'Decline'

    def _build_feature_matrix(self, df_processed):
        """
        Select the model inputs, adding any one-hot device column the
        transaction did not produce
        """
        # One-hot device columns only exist for devices present in the data,
        # so a single transaction can lack some of them; an absent device
        # simply means the indicator is False.
        missing = [column for column in self.feature_columns
                   if str(column).startswith('device_')
                   and column not in df_processed.columns]
        if missing:
            df_processed = df_processed.assign(**{column: False for column in missing})
        return df_processed[self.feature_columns]

    def score_transaction(self, transaction):
        """
        Score a single transaction and return detailed results

        Parameters:
        - transaction: dict with the raw transaction fields; transaction_id
          is optional and reported as None when absent

        Returns:
        - dict with transaction_id, fraud_probability, risk_category,
          recommended_action, timestamp and risk_factors. The same dict is
          appended to self.transaction_log.
        """
        # Prepare transaction data
        df_single = pd.DataFrame([transaction])
        df_processed = prepare_features(df_single)

        # Score transaction
        X_score = self._build_feature_matrix(df_processed)
        X_score_scaled = self.scaler.transform(X_score)
        fraud_prob = self.model.predict_proba(X_score_scaled)[0, 1]

        # Determine risk category and action
        risk_category, action = self._classify(fraud_prob)

        # Create result
        result = {
            'transaction_id': transaction.get('transaction_id'),
            'fraud_probability': fraud_prob,
            'risk_category': risk_category,
            'recommended_action': action,
            'timestamp': pd.Timestamp.now(),
            'risk_factors': self._identify_risk_factors(df_processed, fraud_prob)
        }

        # Log transaction
        self.transaction_log.append(result)

        return result

    def _identify_risk_factors(self, df_processed, fraud_prob):
        """
        Identify key risk factors for the transaction

        Applies fixed rules to the first row of df_processed; fraud_prob is
        accepted but does not influence the result.
        """
        first_row = df_processed.iloc[0]
        rules = [
            (first_row['amount'] > 1000, 'High transaction amount'),
            (first_row['is_night'], 'Transaction during night hours'),
            (first_row['new_account'], 'New account (< 30 days)'),
            (first_row['location_risk'] > 7, 'High-risk location'),
            (first_row['low_activity'], 'Low recent account activity'),
        ]
        return [label for triggered, label in rules if triggered]

    def batch_score(self, transactions):
        """
        Score multiple transactions, returning one result dict per input
        """
        return [self.score_transaction(transaction) for transaction in transactions]

    def get_daily_summary(self):
        """
        Generate daily fraud detection summary

        Returns:
        - the string "No transactions processed today." when nothing has
          been logged at all, otherwise a dict of counts and the mean fraud
          probability for entries whose timestamp falls on today's date
        """
        if not self.transaction_log:
            return "No transactions processed today."

        df_log = pd.DataFrame(self.transaction_log)
        today_log = df_log[df_log['timestamp'].dt.date == pd.Timestamp.now().date()]

        summary = {
            'total_transactions': len(today_log),
            'safe_transactions': (today_log['risk_category'] == 'Safe').sum(),
            'suspicious_transactions': (today_log['risk_category'] == 'Suspicious').sum(),
            'likely_fraud': (today_log['risk_category'] == 'Likely Fraud').sum(),
            'average_fraud_probability': today_log['fraud_probability'].mean(),
            'transactions_declined': (today_log['recommended_action'] == 'Decline').sum()
        }

        return summary

# Initialize the fraud detection system
fraud_system = FraudDetectionSystem(model, scaler, feature_columns)

print("Fraud Detection System initialized successfully!")

# Test transactions with different risk profiles
# TEST_001: small daytime desktop purchase on an established account.
# TEST_002: large night-time mobile purchase on a two-day-old account.
# TEST_003: mid-sized daytime tablet purchase on an established account.
test_transactions = [
    {
        'transaction_id': 'TEST_001',
        'amount': 50.00,
        'hour': 14,
        'device': 'desktop',
        'location_risk': 3,
        'account_age': 365,
        'prev_transactions_24h': 5
    },
    {
        'transaction_id': 'TEST_002',
        'amount': 2500.00,
        'hour': 3,
        'device': 'mobile',
        'location_risk': 9,
        'account_age': 2,
        'prev_transactions_24h': 0
    },
    {
        'transaction_id': 'TEST_003',
        'amount': 150.00,
        'hour': 10,
        'device': 'tablet',
        'location_risk': 5,
        'account_age': 180,
        'prev_transactions_24h': 3
    }
]

print("TESTING FRAUD DETECTION SYSTEM")
print("=" * 50)

# Each call also appends its result to fraud_system.transaction_log, which
# the daily summary below reads.
for transaction in test_transactions:
    result = fraud_system.score_transaction(transaction)
    
    print(f"\nTransaction ID: {result['transaction_id']}")
    print(f"Fraud Probability: {result['fraud_probability']:.3f}")
    print(f"Risk Category: {result['risk_category']}")
    print(f"Recommended Action: {result['recommended_action']}")
    print(f"Risk Factors: {', '.join(result['risk_factors']) if result['risk_factors'] else 'None identified'}")

# Generate daily summary
# get_daily_summary() returns a dict here because the log is non-empty; with
# an empty log it returns a plain string, which has no .items().
summary = fraud_system.get_daily_summary()
print(f"\nDAILY SUMMARY:")
for key, value in summary.items():
    print(f"{key.replace('_', ' ').title()}: {value}")

import joblib
import json
import os

# None of the writers below create missing directories, so make sure the
# output folders exist before saving anything.
for output_dir in ('scripts', 'data'):
    os.makedirs(output_dir, exist_ok=True)

# Save the trained model and scaler
joblib.dump(model, 'scripts/fraud_detection_model.pkl')
joblib.dump(scaler, 'scripts/fraud_scaler.pkl')

# Save feature columns
# The column order is part of the model contract, so it is stored alongside
# the model and scaler.
with open('scripts/feature_columns.json', 'w') as f:
    json.dump(feature_columns, f)

# Save test results
test_results.to_csv('data/fraud_analysis_results.csv', index=False)

# Save high-risk transactions report
high_risk_transactions.to_csv('data/high_risk_transactions.csv', index=False)

print("Model and results saved successfully!")
print("Files saved:")
print("- scripts/fraud_detection_model.pkl")
print("- scripts/fraud_scaler.pkl")
print("- scripts/feature_columns.json")
print("- data/fraud_analysis_results.csv")
print("- data/high_risk_transactions.csv")