In [None]:
%pip install -r ../requirements.txt

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
import statsmodels.api as sm
from typing import List, Dict, Set, Tuple, Optional, Callable
from collections import Counter, defaultdict
from functools import cache
import datetime
from math import sqrt, pow

warnings.filterwarnings('ignore')

In [3]:
df_test = pd.read_csv('data/fraudTest.csv')
df_train = pd.read_csv('data/fraudTrain.csv')

In [None]:
df_test.describe()

In [None]:
df_test.head(10)

In [None]:
df_test.is_fraud.value_counts()

In [7]:
# Utils
def calculate_age(date: str, today: "Optional[datetime.date]" = None) -> int:
  """Return the age, in completed years, of someone born on ``date``.

  Args:
    date: Birthdate formatted as ``YYYY-MM-DD``.
    today: Reference date (any object with ``year``/``month``/``day``).
      Defaults to the current local date; pass a fixed date to make the
      derived ``age`` feature reproducible across runs.

  Returns:
    Number of whole years between ``date`` and ``today``.

  Raises:
    ValueError: If ``date`` does not match ``%Y-%m-%d``.
  """
  # Imported locally under a private alias: a later cell executes
  # `from datetime import datetime`, which rebinds the global name `datetime`
  # from the module to the class and would break `datetime.datetime` here.
  import datetime as _dt

  birthdate = _dt.datetime.strptime(date, "%Y-%m-%d")
  if today is None:
    today = _dt.date.today()
  # True (counted as 1) while this year's birthday is still ahead of `today`.
  birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
  return today.year - birthdate.year - birthday_pending

def euc(x2: float, x1: float, y2: float, y1: float) -> float:
  """Return the Euclidean distance between the points (x1, y1) and (x2, y2).

  Note the argument order: both x coordinates come first, then both y
  coordinates. The result is in the same units as the inputs; when fed
  latitude/longitude in degrees it is a distance in degrees, not kilometres.
  """
  return sqrt(pow((x2 - x1), 2) + pow((y2 - y1), 2))


In [None]:
# Data cleaning: remove the 'Unnamed: 0' column (presumably the index column
# written when the CSV was saved), then exact duplicate rows, then any row with
# a missing value.
df_test = (
    df_test
    .drop(columns=['Unnamed: 0'])
    .drop_duplicates()
    .dropna()
)

# Feature engineering: replace the birthdate with an age and merge the two name
# columns into a single full-name column.
df_test = df_test.assign(
    age=df_test['dob'].apply(calculate_age),
    name=df_test['first'] + ' ' + df_test['last'],
).drop(columns=['dob', 'first', 'last'])
df_test.head(20)

In [None]:
# Apply seaborn's white-grid theme to the figures drawn in later cells.
# No figure is created here: a bare plt.figure() with nothing drawn on it only
# adds an empty figure to the cell output.
sns.set_style(style='whitegrid')

In [10]:
# Transaction time as whole seconds since the Unix epoch. Subtracting the epoch
# and floor-dividing by one second gives the same integers regardless of the
# datetime64 resolution pandas chooses when parsing the strings (casting to
# int64 and dividing by 10**9 is only right for nanosecond resolution).
df_test['trans_timestamp'] = (
    pd.to_datetime(df_test['trans_date_trans_time']) - pd.Timestamp('1970-01-01')
) // pd.Timedelta('1s')

# Pairwise correlations over every float64/int64 column, including the new
# timestamp. Computed once, after the column exists.
numeric_columns = df_test.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = df_test[numeric_columns].corr()

In [None]:
# Large canvas so the annotated cells stay legible.
plt.figure(figsize=(14, 12))

# Heatmap appearance, collected in one mapping and unpacked into the call.
heatmap_style = {
    'annot': True,                  # write the coefficient inside each cell
    'cmap': "coolwarm",             # colour scheme
    'fmt': ".2f",                   # two decimal places
    'linewidths': 0.5,              # thin separators between cells
    'linecolor': "black",
    'square': True,                 # keep every cell square
    'cbar_kws': {'shrink': 0.8},    # slightly smaller colour bar
}
sns.heatmap(correlation_matrix, **heatmap_style)

# Title and tick labels.
plt.title("Correlation Matrix Heatmap", fontsize=18, weight='bold')
plt.xticks(rotation=45, ha='right', fontsize=12)  # slanted so long column names do not collide
plt.yticks(rotation=0, fontsize=12)
plt.show()

In [None]:
# Canvas and theme for the per-category counts.
plt.figure(figsize=(12, 6))
sns.set_style("whitegrid")

# One pair of bars per transaction category, split by the fraud flag.
ax = sns.countplot(data=df_test, x='category', hue='is_fraud', palette='coolwarm')

# Title and axis labels, set on the axes seaborn returned.
ax.set_title('Fraud by Transaction Category', fontsize=16, weight='bold')
ax.set_xlabel('Transaction Category', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(rotation=45)

# Write the count above every bar; each container holds the bars of one hue level.
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%d', label_type='edge', fontsize=10, padding=3)

plt.show()

In [None]:
# Canvas for the map-like scatter of the `lat`/`long` columns.
plt.figure(figsize=(12, 8))

# Marker styling, gathered into one mapping and unpacked into the call.
scatter_style = {
    'palette': "coolwarm",   # contrasting colours for the two fraud classes
    'edgecolor': "k",        # black outline around every marker
    'alpha': 0.6,            # translucency so overlapping points remain visible
    's': 70,                 # marker size
}
sns.scatterplot(data=df_test, x='long', y='lat', hue='is_fraud', **scatter_style)

# Title and axis labels.
plt.title('Geographical Distribution of Fraud', fontsize=18, weight='bold')
plt.xlabel('Longitude', fontsize=14)
plt.ylabel('Latitude', fontsize=14)

# Retitle and reposition the legend.
plt.legend(title="Fraud Status", title_fontsize='13', fontsize='11', loc='upper right')

plt.show()

In [None]:
# Share of fraudulent transactions per state, as a percentage, highest first.
# The mean of the 0/1 `is_fraud` flag is the fraud rate.
fraud_probability = (
    df_test.groupby('state')['is_fraud']
    .mean()
    .mul(100)
    .reset_index()
    .rename(columns={'state': 'State', 'is_fraud': 'Fraud Probability'})
    .sort_values(by='Fraud Probability', ascending=False)
)

# Canvas and theme.
plt.figure(figsize=(14, 10))
sns.set_style("whitegrid")

# Horizontal bars, one per state.
sns.barplot(
    data=fraud_probability,
    x='Fraud Probability',
    y='State',
    palette="coolwarm"
)

# Title and axis labels.
plt.title('Fraud Probability by State', fontsize=16, weight='bold')
plt.xlabel('Fraud Probability (%)', fontsize=14)
plt.ylabel('State', fontsize=14)

# Print the percentage just to the right of each bar; bars are drawn in row
# order, so the enumerate position is the bar's y coordinate.
for bar_position, percentage in enumerate(fraud_probability['Fraud Probability']):
    plt.text(percentage + 0.5, bar_position, f"{percentage:.2f}%", va='center')

plt.show()

In [None]:
# Ordinary least squares with several predictors and one response: regress the
# 0/1 `is_fraud` flag on four numeric columns. Because the response is binary
# this is a linear probability model, fitted here for inspection of the summary.
target: str = 'is_fraud'

predictors: List[str] = [
  'amt', 'city_pop', 'merch_lat', 'merch_long'
]

lr_x = df_test[predictors]
lr_y = df_test[target]

# statsmodels does not add an intercept on its own; add_constant prepends a
# constant column so the model has one.
lr_X_constant = sm.add_constant(lr_x)
# NOTE(review): the name `lr_model` is rebound to a LogisticRegression in a
# later cell, so this OLS result is only available until then.
lr_model = sm.OLS(lr_y, lr_X_constant).fit()

# Last expression of the cell: renders the regression summary table.
lr_model.summary()

In [None]:
# Original column names in the DataFrame
target: str = 'is_fraud'  # Target variable indicating fraud status
predictors: List[str] = ['amt', 'city_pop', 'merch_lat', 'merch_long']

# Mapping from column names to human-readable axis labels
column_name_mapping = {
    'is_fraud': 'Fraud Status',            # Target variable
    'amt': 'Transaction Amount',            # Amount of the transaction
    'city_pop': 'City Population',          # Population of the city
    'merch_lat': 'Merchant Latitude',       # Latitude of the merchant
    'merch_long': 'Merchant Longitude'      # Longitude of the merchant
}

# Set the theme. No plt.figure() call here: sns.pairplot builds its own figure,
# so a figure opened beforehand would only be shown empty.
sns.set_style("whitegrid")

# Pairwise scatter plots of the predictors, coloured by the target
pairplot = sns.pairplot(
    df_test[predictors + [target]],
    hue=target,                     # Color by 'is_fraud' feature
    palette="coolwarm",            # Use a distinct color palette
    diag_kind="kde",               # KDE plot on the diagonal for smooth distributions
    plot_kws={'alpha': 0.6}        # Add transparency for overlapping points
)

# Main title, placed slightly above the grid (y > 1) so it clears the top row
pairplot.fig.suptitle('Pairplot of Predictor Features by Fraud Status', fontsize=18, y=1.02)

# Swap raw column names for readable labels. Axes whose label is not a key of
# the mapping (e.g. inner panels with empty labels) keep what they have.
for ax in pairplot.axes.flatten():
    ax.set_xlabel(column_name_mapping.get(ax.get_xlabel(), ax.get_xlabel()))
    ax.set_ylabel(column_name_mapping.get(ax.get_ylabel(), ax.get_ylabel()))

# Re-flow the layout after the labels changed length
pairplot.fig.tight_layout()
plt.show()


In [None]:
# Euclidean distance between the user's location and the merchant location
# (lat, long), in degrees. Same formula as `euc`, evaluated on whole columns
# instead of through one Python call per row.
df_test['distance'] = np.sqrt(
    (df_test['merch_lat'] - df_test['lat']) ** 2
    + (df_test['merch_long'] - df_test['long']) ** 2
)

plt.figure(figsize=(20, 10))  # Increased size for better visibility

amount_ax = sns.histplot(
    data=df_test,
    x='amt',
    kde=True,
    hue='is_fraud',
    multiple="stack",
    bins=30,  # Adjust number of bins if necessary
    alpha=0.7  # Slight transparency for overlapping areas
)

plt.title('Transaction Amount Distribution', fontsize=22)  # Larger title font size
plt.xlabel('Amount ($)', fontsize=18)  # Larger x-axis label
plt.ylabel('Count', fontsize=18)  # Larger y-axis label
plt.xticks(fontsize=14)  # Larger x-axis ticks
plt.yticks(fontsize=14)  # Larger y-axis ticks

# Restyle the legend seaborn already drew. The histplot artists carry no
# labels, so calling plt.legend(...) here would rebuild the legend from nothing
# and leave it empty; move_legend keeps the existing entries.
sns.move_legend(
    amount_ax,
    'upper right',
    title='Fraud Status',
    fontsize=14,
    title_fontsize=16,
)

plt.show()

In [None]:
# Stacked histogram (with KDE overlay) of the user-merchant distance, split by
# the fraud flag.
distance_hist_options = {
    'x': 'distance',
    'hue': 'is_fraud',
    'kde': True,
    'multiple': "stack",
}
sns.histplot(data=df_test, **distance_hist_options)

plt.title('User-Merchant Distance Distribution')
plt.xlabel('Distance')
plt.ylabel('Count')
plt.show()

In [None]:
print("\nSummary Statistics:")

In [None]:
print("\nTransaction Amounts:")  
df_test.groupby('is_fraud')['amt'].describe()

In [None]:
print("\nUser-Merchant Distances:")
df_test.groupby('is_fraud')['distance'].describe()

In [None]:
df_test.columns.tolist()

In [23]:
# Apply to df_train the same cleaning and feature engineering df_test received:
# drop the 'Unnamed: 0' column, duplicates and incomplete rows, derive `age`
# from `dob`, and merge first/last into `name`.
df_train.drop(['Unnamed: 0'], axis=1, inplace=True)
df_train.drop_duplicates(inplace=True)
df_train.dropna(inplace=True)
df_train['age'] = df_train['dob'].apply(calculate_age)
df_train.drop(['dob'], axis=1, inplace=True)
df_train['name'] = df_train['first'] + ' ' + df_train['last']
df_train.drop(['first', 'last'], axis=1, inplace=True)

# Euclidean user-merchant distance in degrees. Same formula as `euc`, evaluated
# on whole columns instead of through one Python call per row.
df_train['distance'] = np.sqrt(
    (df_train['merch_lat'] - df_train['lat']) ** 2
    + (df_train['merch_long'] - df_train['long']) ** 2
)

# Transaction time as whole seconds since the Unix epoch. Subtracting the epoch
# and floor-dividing by one second does not depend on the datetime64 resolution
# pandas picks when parsing (an int64 cast // 10**9 assumes nanoseconds).
df_train['trans_timestamp'] = (
    pd.to_datetime(df_train['trans_date_trans_time']) - pd.Timestamp('1970-01-01')
) // pd.Timedelta('1s')

# Drop irrelevant columns
columns_to_drop: List[str] = [
    'trans_date_trans_time',  # Already converted to trans_timestamp
    'cc_num',                 # Sensitive information
    'merchant',              # Categorical, high cardinality
    'street',               # Too specific location data
    'city',                 # Already have city_pop
    'state',                # Already have geographic info
    'zip',                  # Already have geographic info
    'lat',                  # Already calculated distance
    'long',                 # Already calculated distance
    'merch_lat',            # Already calculated distance
    'merch_long',           # Already calculated distance
    'trans_num',            # Transaction identifier
    'unix_time',            # Already have trans_timestamp
    'name',                 # Personal identifier
]

# Keep important features (documentation of intent; not referenced in this cell)
columns_to_keep: List[str] = [
    'category',             # Transaction category
    'amt',                  # Transaction amount
    'gender',              # Demographic info
    'city_pop',            # Population density
    'job',                 # Demographic info
    'is_fraud',            # Target variable
    'age',                 # Demographic info
    'trans_timestamp',     # Timing information
    'distance'             # Distance between user and merchant
]
# Only df_test is trimmed here; df_train is aligned to df_test's columns in a
# later cell.
df_test = df_test.drop(columns=columns_to_drop)

In [None]:
df_test.columns.tolist()

In [None]:
df_train.columns.tolist()

In [None]:
# Any column df_train still has that df_test no longer does gets removed, so
# both frames end up with the same set of columns.
print("Columns to drop from df_train:")
test_column_names = set(df_test.columns)
columns_to_drop_train = [
    column for column in df_train.columns if column not in test_column_names
]
print(columns_to_drop_train)

# Working copies used by the modelling cells.
fraud_test_df = df_test.copy()
fraud_train_df = df_train.drop(columns=columns_to_drop_train)

print("\nFinal columns in fraud_train_df:", fraud_train_df.columns.tolist())
print("\nFinal columns in fraud_test_df:", fraud_test_df.columns.tolist())

In [None]:
# Shared column layout for both frames: the kept columns in alphabetical order.
column_order = [
    'category',
    'amt',
    'gender',
    'city_pop',
    'job',
    'is_fraud',
    'age',
    'trans_timestamp',
    'distance'
]
column_order.sort()

# Select the columns in that order; a missing column raises KeyError.
fraud_train_df = fraud_train_df.loc[:, column_order]
fraud_test_df = fraud_test_df.loc[:, column_order]

# Show the resulting layouts for a visual check that they match.
print("\nFinal columns in fraud_train_df:", fraud_train_df.columns.tolist())
print("\nFinal columns in fraud_test_df:", fraud_test_df.columns.tolist())

In [28]:
# Integer-encode every object-dtype column with one code table shared by the
# train and test frames, so a given value maps to the same code in both.
categorical_columns = list(fraud_train_df.select_dtypes(include=['object']).columns)
label_encoders = {}

for col in categorical_columns:
  # Distinct values across both frames, in order of first appearance.
  combined_values = pd.concat([fraud_train_df[col], fraud_test_df[col]])
  uniques = pd.factorize(combined_values)[1]

  # Replace each value by its position in `uniques`.
  for frame in (fraud_train_df, fraud_test_df):
    frame[col] = pd.Categorical(frame[col], categories=uniques).codes

In [None]:
fraud_train_df.head()

In [None]:
fraud_test_df.head()

In [31]:
# Separate the `is_fraud` label from the remaining feature columns, for the
# training frame and for the held-out test frame.
X_train, y_train = fraud_train_df.drop(columns=['is_fraud']), fraud_train_df['is_fraud']
X_test, y_test = fraud_test_df.drop(columns=['is_fraud']), fraud_test_df['is_fraud']

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_test.head()

In [None]:
y_test.head()

In [36]:
from sklearn.preprocessing import StandardScaler

# Scaling: standardise every feature column. The scaler's statistics are
# learned from the training features only and then re-used on the test
# features, so no test-set information enters the fit.
# NOTE(review): X_train still contains the integer-coded categorical columns,
# which are standardised like any numeric column here; confirm that is intended.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [37]:
from sklearn.impute import SimpleImputer

# Replace any missing value in the scaled training features with its column mean.
# NOTE(review): only the training matrix goes through the imputer; later cells
# predict on X_test_scaled directly. Rows with NaN were dropped from both frames
# earlier, so this step presumably changes nothing; verify before relying on it.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train_scaled)

In [38]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek

# Oversample the minority class of the training data with SMOTE.
# `n_jobs` is intentionally not passed: imbalanced-learn deprecated that
# argument in 0.10 and scheduled its removal for 0.12. It only affected the
# speed of the neighbour search, never the generated samples.
smote = SMOTE(
    sampling_strategy='auto',     # Resample every class except the majority up to its size
    random_state=42,
    k_neighbors=5,               # Neighbours used to interpolate each synthetic sample
)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_imputed, y_train)

In [39]:
from sklearn.ensemble import RandomForestClassifier

# Random forest fitted on the SMOTE-balanced training data, used here only to
# read off impurity-based feature importances.
model = RandomForestClassifier(
    n_estimators=500,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
model.fit(X_train_resampled, y_train_resampled)

# Labels for the importances. X_train_resampled normally has no column labels
# (the scaling/imputing/resampling steps work on bare arrays by default), so
# fall back to X_train, whose columns are in the same order as the array's.
feature_names = (
    X_train_resampled.columns
    if hasattr(X_train_resampled, 'columns')
    else X_train.columns
)

# One importance per feature, in the same order as `feature_names`.
importances = model.feature_importances_

In [None]:
importances

In [41]:
from typing import Any
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
  accuracy_score, 
  classification_report, 
  confusion_matrix,
  roc_curve, 
  precision_recall_curve, 
  average_precision_score,
  roc_auc_score, f1_score
)
from datetime import datetime
import time
import pickle

import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [42]:
lr_model = LogisticRegression(random_state=42)

In [None]:
lr_model.fit(X_train_resampled, y_train_resampled)

In [None]:
lr_predictions = lr_model.predict(X_test_scaled)
lr_predictions

In [None]:
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_accuracy

In [46]:
def evaluate_model(
    model: Any,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    file_name: str,
    save_dir: str = "./out/bonus"
) -> dict:
    """
    Fit ``model``, score it on the test data, plot diagnostics and pickle it.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``;
            ``predict_proba`` is used when available.
        X_train: Training features
        X_test: Test features
        y_train: Training labels
        y_test: Test labels (binary; the plots label the classes
            'Not Fraud' and 'Fraud')
        file_name: Base name of the pickle file; a timestamp is appended
        save_dir: Directory the pickle is written to. Created if missing.

    Returns:
        dict: ``accuracy``, ``f1`` and ``training_time`` (seconds), plus
        ``roc_auc`` and ``average_precision`` when the model provides
        probabilities.
    """
    # Imported locally so the function does not depend on a notebook-level import.
    import os

    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    predictions = model.predict(X_test)
    # Probability-based metrics and plots are optional: estimators without
    # predict_proba are still evaluated on their hard predictions.
    try:
        pred_proba = model.predict_proba(X_test)[:, 1]
        has_probability = True
    except (AttributeError, NotImplementedError):
        has_probability = False
        pred_proba = None

    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'f1': f1_score(y_test, predictions),
        'training_time': training_time
    }

    if has_probability:
        metrics['roc_auc'] = roc_auc_score(y_test, pred_proba)
        metrics['average_precision'] = average_precision_score(y_test, pred_proba)

    print(f"\n{'-'*50}")
    print(f"Evaluation Results for {model.__class__.__name__}")
    print(f"{'-'*50}")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    if has_probability:
        print(f"ROC AUC: {metrics['roc_auc']:.4f}")
        print(f"Average Precision: {metrics['average_precision']:.4f}")
    print(f"Training Time: {training_time:.2f} seconds")

    print(f"\nClassification Report:")
    print(classification_report(y_test, predictions))

    plt.figure(figsize=(20, 10))

    # 1. Confusion matrix
    plt.subplot(2, 2, 1)
    cm = confusion_matrix(y_test, predictions)
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['Not Fraud', 'Fraud'],
        yticklabels=['Not Fraud', 'Fraud']
    )
    plt.title(f'Confusion Matrix - {model.__class__.__name__}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')

    if has_probability:
        # 2. ROC Curve
        plt.subplot(2, 2, 2)
        fpr, tpr, _ = roc_curve(y_test, pred_proba)
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {metrics["roc_auc"]:.3f})')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc="lower right")

        # 3. Precision-Recall Curve
        plt.subplot(2, 2, 3)
        precision, recall, _ = precision_recall_curve(y_test, pred_proba)
        plt.plot(recall, precision, label=f'PR curve (AP = {metrics["average_precision"]:.3f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend(loc="lower left")

        # 4. Prediction Distribution
        plt.subplot(2, 2, 4)
        sns.kdeplot(data=pd.DataFrame({
            'Probability': pred_proba,
            'Class': y_test
        }), x='Probability', hue='Class')
        plt.title('Prediction Probability Distribution')

    plt.tight_layout()
    plt.show()

    # Save model with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = os.path.join(save_dir, f"{file_name}_{timestamp}.pkl")

    # Saving is best-effort: a failure is reported but must not discard the
    # metrics that were just computed.
    try:
        # Without this the open() below fails whenever the directory is absent.
        os.makedirs(save_dir, exist_ok=True)
        with open(save_path, "wb") as f:
            pickle.dump(model, f)
        print(f"\nModel saved successfully to: {save_path}")
    except Exception as e:
        print(f"\nError saving model: {str(e)}")

    return metrics

In [47]:
# Candidate classifiers. Each is configured here; only the ones whose
# evaluate_model(...) call is not commented out are actually trained. Training
# uses the SMOTE-resampled data, scoring uses the scaled test features.
# The metric dicts returned by evaluate_model are not stored in this cell.

# XGBoost with optimized parameters
xgboost_model = xgb.XGBClassifier(
    n_estimators=1000,          # More trees often yield better performance
    learning_rate=0.01,         # Smaller learning rate for better generalization
    max_depth=6,                # Control tree depth to prevent overfitting
    min_child_weight=1,         # Minimum sum of instance weight needed in a child
    gamma=0.1,                  # Minimum loss reduction for partition
    subsample=0.8,              # Fraction of samples used for training trees
    colsample_bytree=0.8,       # Fraction of features used for training trees
    random_state=42
)
evaluate_model(xgboost_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "xgb_model")

# Random Forest with optimized parameters (configured but not evaluated below)
random_forest_model = RandomForestClassifier(
    n_estimators=500,           # Number of trees
    max_depth=15,               # Maximum depth of trees
    min_samples_split=5,        # Minimum samples required to split
    min_samples_leaf=2,         # Minimum samples required at leaf node
    max_features='sqrt',        # Number of features to consider at each split
    bootstrap=True,             # Use bootstrap samples
    class_weight='balanced',    # Handle class imbalance
    random_state=42
)
# evaluate_model(random_forest_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "rf_model")

# Decision Tree with optimized parameters
decision_tree_model = DecisionTreeClassifier(
    max_depth=8,                # Control tree depth
    min_samples_split=10,       # Minimum samples required to split
    min_samples_leaf=4,         # Minimum samples required at leaf node
    max_features='sqrt',        # Number of features to consider
    class_weight='balanced',    # Handle class imbalance
    random_state=42
)
evaluate_model(decision_tree_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "dt_model")

# SVC with optimized parameters (configured but not evaluated below)
svc_model = SVC(
    C=10.0,                     # Regularization parameter
    kernel='rbf',               # Radial basis function kernel
    gamma='scale',              # Kernel coefficient
    probability=True,           # Enable probability estimates
    class_weight='balanced',    # Handle class imbalance
    random_state=42
)
# evaluate_model(svc_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "svc_model")

# Naive Bayes with optimized parameters (configured but not evaluated below)
# Note: GaussianNB has fewer parameters to tune
naive_bayes_model = GaussianNB(
    var_smoothing=1e-9          # Stability adjustment
)
# evaluate_model(naive_bayes_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "nb_model")

# KNN with optimized parameters (configured but not evaluated below)
knn_model = KNeighborsClassifier(
    n_neighbors=5,              # Number of neighbors
    weights='distance',         # Weight points by distance
    algorithm='auto',           # Automatically choose best algorithm
    leaf_size=30,              # Leaf size for tree algorithms
    p=2,                       # Power parameter for Minkowski metric (2 = Euclidean)
    metric='minkowski'         # Distance metric to use
)
# evaluate_model(knn_model, X_train_resampled, X_test_scaled, y_train_resampled, y_test, "knn_model")

In [None]:
from sklearn.inspection import permutation_importance

def analyze_feature_importance(
    model,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_test: pd.Series,
    n_top_features: int = 20
) -> Tuple[pd.DataFrame, dict]:
    """
    Rank features by model and permutation importance, plot and print the result.

    Args:
        model: Trained model with feature_importances_ attribute
        X_train: Training data; supplies the feature names (``.columns``) and
            the values used for the correlation heatmap
        X_test: Test features, in the same column order as ``X_train``
        y_test: Test labels
        n_top_features: Number of top features to display

    Returns:
        A pair ``(feature_importance_df, importance_stats)``:
        ``feature_importance_df`` has one row per feature, sorted by
        ``Importance`` descending, with columns ``Feature``, ``Importance``,
        ``Cumulative_Importance`` and ``Permutation_Importance``;
        ``importance_stats`` holds ``n_important_features``,
        ``top_features_90_percent`` and ``correlation_matrix``.
    """
    # Permutation importance comes back in the model's feature order
    perm_importance = permutation_importance(
        model, X_test, y_test,
        n_repeats=10,
        random_state=42
    )

    # Put both importance measures next to their feature name BEFORE sorting,
    # so every value stays attached to the feature it belongs to.
    feature_importance_df = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': model.feature_importances_,
        'Permutation_Importance': perm_importance.importances_mean
    }).sort_values('Importance', ascending=False)

    # Running total of importance down the sorted table (kept as third column)
    feature_importance_df.insert(
        2,
        'Cumulative_Importance',
        feature_importance_df['Importance'].cumsum()
    )

    # Smallest number of top-ranked features whose cumulative importance
    # reaches 0.9: position of the first running total >= 0.9, plus one.
    cumulative = feature_importance_df['Cumulative_Importance'].to_numpy()
    n_for_90_percent = min(int(np.searchsorted(cumulative, 0.9)) + 1, len(cumulative))

    # Calculate feature importance statistics
    importance_stats = {
        'n_important_features': int((feature_importance_df['Importance'] > 0.01).sum()),
        'top_features_90_percent': n_for_90_percent,
        'correlation_matrix': calculate_feature_correlations(X_train, feature_importance_df)
    }

    # Create visualizations
    plt.figure(figsize=(20, 15))

    # 1. Bar plot of top feature importances
    plt.subplot(2, 2, 1)
    plot_top_features(feature_importance_df, n_top_features, 'Feature Importance (MDI)')

    # 2. Bar plot of permutation importances
    plt.subplot(2, 2, 2)
    plot_permutation_importance(feature_importance_df, n_top_features)

    # 3. Cumulative importance plot
    plt.subplot(2, 2, 3)
    plot_cumulative_importance(feature_importance_df)

    # 4. Feature correlation heatmap
    plt.subplot(2, 2, 4)
    plot_feature_correlations(importance_stats['correlation_matrix'], n_top_features)

    plt.tight_layout()
    plt.show()

    # Print detailed analysis
    print_feature_importance_analysis(feature_importance_df, importance_stats)

    return feature_importance_df, importance_stats

def calculate_feature_correlations(X_train: pd.DataFrame, importance_df: pd.DataFrame) -> pd.DataFrame:
    """Return the correlation matrix of the first ten features listed in ``importance_df``.

    The names are read from the ``Feature`` column in row order, so the caller
    decides the ranking by how it sorted ``importance_df``.
    """
    leading_names = list(importance_df['Feature'].iloc[:10])
    return X_train.loc[:, leading_names].corr()

def plot_top_features(df: pd.DataFrame, n_features: int, title: str):
    """Draw a horizontal bar chart of ``Importance`` for the first ``n_features`` rows of ``df``.

    Draws on the current matplotlib axes; ``df`` is expected to be sorted
    already and to have ``Feature`` and ``Importance`` columns.
    """
    leading_rows = df.head(n_features)
    axes = sns.barplot(x='Importance', y='Feature', data=leading_rows, palette='viridis')
    axes.set_title(title)
    axes.set_xlabel('Importance Score')
    axes.set_ylabel('Features')

def plot_permutation_importance(df: pd.DataFrame, n_features: int):
    """Draw a horizontal bar chart of ``Permutation_Importance`` for the first ``n_features`` rows.

    Draws on the current matplotlib axes; rows are taken in the order ``df``
    already has.
    """
    leading_rows = df.head(n_features)
    axes = sns.barplot(x='Permutation_Importance', y='Feature', data=leading_rows, palette='viridis')
    axes.set_title('Permutation Importance')
    axes.set_xlabel('Permutation Importance Score')
    axes.set_ylabel('Features')

def plot_cumulative_importance(df: pd.DataFrame):
    """Plot the ``Cumulative_Importance`` column against the number of features included.

    The x axis counts features starting at 1, so the point at x = k is the
    importance accumulated by the first k rows of ``df``. A dashed red line
    marks the 0.9 level. Draws on the current matplotlib axes.
    """
    # Start at 1: the first row already represents one feature, not zero.
    feature_counts = range(1, len(df) + 1)
    plt.plot(feature_counts, df['Cumulative_Importance'], 'b-')
    plt.hlines(y=0.9, xmin=1, xmax=len(df), color='r', linestyles='--')
    plt.title('Cumulative Feature Importance')
    plt.xlabel('Number of Features')
    plt.ylabel('Cumulative Importance')

def plot_feature_correlations(corr_matrix: pd.DataFrame, n_features: int):
    """Draw ``corr_matrix`` as an annotated heatmap on the current axes.

    ``n_features`` is accepted but not used; the whole matrix is drawn.
    """
    heatmap_options = {
        'annot': True,        # print each coefficient in its cell
        'cmap': 'coolwarm',
        'center': 0,          # anchor the colour map at zero correlation
        'fmt': '.2f',
    }
    axes = sns.heatmap(corr_matrix, **heatmap_options)
    axes.set_title('Feature Correlations')

def print_feature_importance_analysis(df: pd.DataFrame, stats: dict):
    """Print a text summary of the feature-importance table.

    Args:
        df: Importance table sorted best-first, with ``Feature``,
            ``Importance`` and ``Permutation_Importance`` columns.
        stats: Mapping providing ``n_important_features`` and
            ``top_features_90_percent``.

    A feature is counted as categorical when its name contains ``cat_``
    (case-insensitive); every other feature is counted as numerical.
    """
    print("\nFeature Importance Analysis")
    print("-" * 50)

    top_ten = df.head(10)

    print(f"\nTop 10 Most Important Features:")
    print(top_ten[['Feature', 'Importance', 'Permutation_Importance']])

    print(f"\nFeature Importance Statistics:")
    print(f"- Number of important features (>1% importance): {stats['n_important_features']}")
    print(f"- Features needed for 90% cumulative importance: {stats['top_features_90_percent']}")

    print("\nFeature Categories Distribution:")
    # Classify the overall top ten, then count each kind. (Summing the scalar
    # returned by Series.count() raises TypeError.)
    is_categorical = top_ten['Feature'].astype(str).str.contains('cat_', case=False)
    n_categorical = int(is_categorical.sum())
    n_numerical = len(top_ten) - n_categorical

    print(f"- Categorical features in top 10: {n_categorical}")
    print(f"- Numerical features in top 10: {n_numerical}")

# Run the analysis for the XGBoost model. Feature names and correlations come
# from X_train; permutation importance is scored on X_test_scaled and y_test.
# NOTE(review): assumes xgboost_model has already been fitted by an earlier
# cell and that X_test_scaled has the same column order as X_train; confirm.
feature_importance_df, importance_stats = analyze_feature_importance(
    xgboost_model,
    X_train,
    X_test_scaled,
    y_test
)

In [None]:
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df

In [None]:
# Horizontal bar chart of the model importances, one bar per feature, in the
# row order feature_importance_df currently has.
plt.figure(figsize=(10, 6))
importance_axes = sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.xticks(rotation=90)
importance_axes.set_title('Feature Importances')
importance_axes.set_xlabel('Importance')
importance_axes.set_ylabel('Feature')
plt.show()

In [None]:
X_train.head()

In [None]:
# Bucket ages into labelled, right-closed bands: (0, 18], (18, 25], ... The
# last band is open-ended so that '65+' also covers ages above 100, which a
# fixed upper edge of 100 would turn into NaN.
X_train['age_group'] = pd.cut(
    fraud_train_df['age'],
    bins=[0, 18, 25, 35, 45, 55, 65, np.inf],
    labels=['0-18', '19-25', '26-35', '36-45', '46-55', '56-65', '65+']
)

In [None]:
X_train.head()

In [None]:
# Calendar features derived from the epoch-seconds timestamp (parsed once).
transaction_time = pd.to_datetime(fraud_train_df['trans_timestamp'], unit='s')
X_train['hour'] = transaction_time.dt.hour
X_train['day_of_week'] = transaction_time.dt.dayofweek   # Monday = 0 ... Sunday = 6
X_train['is_weekend'] = X_train['day_of_week'].isin([5, 6]).astype(int)
# Night spans midnight (22:00 through 06:59), so it is the union of two ranges.
# `between(22, 6)` tests 22 <= hour <= 6, which no hour satisfies.
X_train['is_night'] = ((X_train['hour'] >= 22) | (X_train['hour'] <= 6)).astype(int)

In [None]:
X_train.head()

In [None]:
# Amount-based features: a quintile label for every transaction, and a 0/1 flag
# for amounts strictly above the 95th percentile.
amount_bin_labels = ['very_low', 'low', 'medium', 'high', 'very_high']
X_train['amount_bin'] = pd.qcut(fraud_train_df['amt'], q=5, labels=amount_bin_labels)

large_amount_threshold = fraud_train_df['amt'].quantile(0.95)
X_train['is_large_transaction'] = fraud_train_df['amt'].gt(large_amount_threshold).astype(int)

In [None]:
X_train.head()

In [None]:
# City-population quintiles, labelled from smallest to largest.
population_labels = ['rural', 'suburban', 'urban', 'metropolitan', 'dense_metropolitan']
X_train['population_density_category'] = pd.qcut(X_train['city_pop'], q=5, labels=population_labels)

# 0/1 flag for transactions whose user-merchant distance is strictly above the
# upper quartile.
remote_threshold = X_train['distance'].quantile(0.75)
X_train['is_remote_transaction'] = X_train['distance'].gt(remote_threshold).astype(int)

In [None]:
X_train.head()

In [None]:
features = pd.get_dummies(X_train, drop_first=True)
features.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the engineered training table for evaluation. The split is
# stratified on the label so that both parts keep the same fraud rate.
# This rebinds X_train / X_test / y_train / y_test, replacing the earlier
# values, so the cell must not be re-run on its own: a second run would pair
# the full `features` table with the already-shortened y_train.
X_train, X_test, y_train, y_test = train_test_split(
    features, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [None]:
# Baseline on the engineered features: XGBoost with default hyper-parameters
# (only the seed is fixed), trained and scored without any resampling.
xgb_model_improved = xgb.XGBClassifier(random_state=42)
evaluate_model(xgb_model_improved, X_train, X_test, y_train, y_test, "xgb_model_improved")

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Same default XGBoost as the baseline, but trained on SMOTE-oversampled data.
# Only the training part is resampled; X_test / y_test are left as they are.
# Note: this rebinds `smote`, `X_train_resampled`, `y_train_resampled` and
# `xgb_model_improved`, replacing the objects earlier cells created.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

xgb_model_improved = xgb.XGBClassifier(random_state=42)
evaluate_model(xgb_model_improved, X_train_resampled, X_test, y_train_resampled, y_test, "xgb_model_smote")

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Search space for the randomized hyper-parameter search.
# scipy conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale].
xgb_params = {
    'n_estimators': randint(100, 1000),      # 100 .. 999 trees
    'max_depth': randint(3, 10),             # depth 3 .. 9
    'learning_rate': uniform(0.01, 0.3),     # 0.01 .. 0.31
    'subsample': uniform(0.6, 0.4),          # 0.6 .. 1.0
    'colsample_bytree': uniform(0.6, 0.4),   # 0.6 .. 1.0
    'min_child_weight': randint(1, 7),       # 1 .. 6
    'gamma': uniform(0, 0.5)                 # 0 .. 0.5
}

In [None]:
# Estimator whose hyper-parameters are searched.
# NOTE(review): both this estimator and the search below request n_jobs=-1;
# nesting the two can oversubscribe the CPU. Consider limiting one of them.
xgb_base = xgb.XGBClassifier(random_state=42, n_jobs=-1)

# Initialize RandomizedSearchCV
xgb_random = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=xgb_params,
    n_iter=25,  # Number of parameter settings sampled
    cv=3,       # Number of folds
    verbose=2,
    random_state=42,
    n_jobs=-1,  # Use all CPU cores
    scoring='f1'  # Optimize for F1 score
)

# Fit the model with SMOTE-resampled data
# NOTE(review): the folds are cut from data that was oversampled beforehand, so
# validation folds contain synthetic rows interpolated from training-fold rows
# and the cross-validated F1 is likely optimistic. Resampling inside each fold
# (e.g. an imblearn Pipeline of SMOTE + classifier) would avoid that; confirm
# whether that is wanted here.
xgb_random.fit(X_train_resampled, y_train_resampled)

# Get best model and parameters
best_xgb = xgb_random.best_estimator_
print("Best parameters:", xgb_random.best_params_)

# Evaluate the optimized model (evaluate_model fits it again on the same
# resampled training data before scoring on X_test)
evaluate_model(best_xgb, X_train_resampled, X_test, y_train_resampled, y_test, "xgb_optimized")

In [None]:
# Soft-voting ensemble: averages the predicted class probabilities of the tuned
# XGBoost model, a random forest and a decision tree. weights=[2, 1, 1] gives
# the XGBoost probabilities twice the weight of each of the other two.
voting_model = VotingClassifier(estimators=[
    ('xgb', best_xgb),
    ('rf', RandomForestClassifier(
        n_estimators=500,
        max_depth=8,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )),
    ('dt', DecisionTreeClassifier(
        max_depth=6,
        min_samples_split=10,
        min_samples_leaf=4,
        class_weight='balanced',
        random_state=42
    ))
], voting='soft', weights=[2, 1, 1])

# Train on the SMOTE-resampled data, score on the untouched hold-out split.
evaluate_model(voting_model, X_train_resampled, X_test, y_train_resampled, y_test, "voting_model")