# Cryptocurrency Fraud Detection

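This notebook explores the cryptocurrency volatility/fraud dataset, engineers additional features, trains and tunes a Random Forest classifier for fraud detection, and saves the fitted model and scaler to disk.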

In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Plot styling: apply matplotlib's 'ggplot' style sheet first, then seaborn's
# "whitegrid" theme. seaborn is applied last, so wherever both touch the same
# setting, seaborn's value is the one in effect for the figures below.
plt.style.use('ggplot')
sns.set(style="whitegrid")

## 1. Load and Explore the Dataset

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory
DATA_PATH = '../data/crypto_volatility_fraud_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Report (rows, columns), then show the first five rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Summary statistics per column (pandas' default describe() output), used here
# as a quick sanity check of value ranges before any preprocessing
df.describe()

In [None]:
# Check class distribution: number of rows per fraud_label value
fraud_counts = df['fraud_label'].value_counts()
print(f"Fraud distribution:\n{fraud_counts}")

# Use .get(1, 0) rather than fraud_counts[1] so that data containing no rows
# labelled 1 reports 0.00% instead of raising KeyError; max(..., 1) avoids a
# division by zero on an empty frame.
fraud_pct = fraud_counts.get(1, 0) / max(len(df), 1) * 100
print(f"Fraud percentage: {fraud_pct:.2f}%")

# Plot fraud distribution on an explicit Axes so the labels target this figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='fraud_label', data=df, ax=ax)
ax.set_title('Fraud vs Non-Fraud Transactions')
ax.set_xlabel('Fraud Label (1 = Fraud)')
ax.set_ylabel('Count')
plt.show()

## 2. Data Preprocessing

In [None]:
# Parse the date column once and derive the calendar features from the parsed values
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    date=parsed_dates,
    day_of_week=parsed_dates.dt.dayofweek,  # Monday=0 ... Sunday=6
    month=parsed_dates.dt.month,
)

# Replace any remaining missing values with 0
# NOTE(review): 0 is applied to every column alike — confirm it is a sensible
# placeholder for each feature.
df = df.fillna(0)

In [None]:
# Correlation analysis
# Restrict to numeric/boolean columns before calling corr(): since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises if
# any are present. With an all-numeric frame the result is unchanged.
numeric_df = df.drop(columns=['date']).select_dtypes(include=['number', 'bool'])
correlation = numeric_df.corr()

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(correlation, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.2f', ax=ax)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()

In [None]:
# One box plot per feature, split by fraud label, stacked vertically
features = ['price', 'volatility', 'exchange_inflow', 'whale_activity', 'active_addresses']

fig, axes = plt.subplots(nrows=len(features), ncols=1, figsize=(12, 15))
for ax, column in zip(axes, features):
    sns.boxplot(x='fraud_label', y=column, data=df, ax=ax)
    ax.set(title=f'{column} by Fraud Label', xlabel='Fraud Label (1 = Fraud)')

fig.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create price change features: relative change versus the previous row.
# NOTE(review): this assumes rows are in chronological order for a single
# asset — confirm against the dataset.
df['price_change'] = df['price'].pct_change()
df['volume_change'] = df['on_chain_volume'].pct_change()

# Create ratio features
df['volume_to_price_ratio'] = df['on_chain_volume'] / df['price']
df['inflow_to_volume_ratio'] = df['exchange_inflow'] / df['on_chain_volume']

# A zero denominator (possible after the earlier fillna(0)) produces +/-inf or
# NaN, and StandardScaler rejects infinite values. Map those cases — and the
# undefined first-row pct_change — to 0, matching the notebook's fill policy.
engineered_cols = [
    'price_change',
    'volume_change',
    'volume_to_price_ratio',
    'inflow_to_volume_ratio',
]
df[engineered_cols] = (
    df[engineered_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Drop date column for modeling
df_model = df.drop(columns=['date'])

## 4. Model Training

In [None]:
# Prepare data for modeling: every remaining column except the target is a feature
X = df_model.drop(columns=['fraud_label'])
y = df_model['fraud_label']

# Split BEFORE scaling so the scaler never sees test rows. Fitting it on the
# full dataset would leak test-set mean/variance into training and into the
# scaler that is later saved to disk.
# stratify=y keeps the fraud ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training partition only, then apply it to both
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaling; kept so any code that
# still refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

In [None]:
# Baseline Random Forest; class_weight='balanced' reweights classes inversely
# to their frequency, random_state fixes the result across runs
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_train, y_train)

# Hard class predictions, plus the probability of the second class in
# rf.classes_ (the fraud class when labels are 0/1), used for the ROC curve
y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]

In [None]:
# Per-class precision / recall / F1 on the held-out test set
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix: rows are true labels, columns are predicted labels
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(title='Confusion Matrix', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [None]:
# ROC curve of the baseline model on the test set, with its area under the curve
fpr, tpr, _thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: the curve of a random classifier
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Pair each feature name with the baseline forest's importance score,
# ordered from most to least important
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Show at most the 15 highest-ranked features
top_features = feature_importance.head(15)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=top_features, ax=ax)
ax.set_title('Top 15 Feature Importances')
fig.tight_layout()
plt.show()

## 5. Hyperparameter Tuning

In [None]:
# Exhaustive grid search: 3 * 4 * 3 * 3 = 108 parameter combinations, each
# evaluated with 5-fold cross-validation (540 fits) on the training set only
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

base_rf = RandomForestClassifier(random_state=42, class_weight='balanced')
grid_search = GridSearchCV(
    estimator=base_rf,
    param_grid=param_grid,
    scoring='f1',  # rank candidates by F1 score
    cv=5,
    n_jobs=-1,     # use every available CPU core
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best F1 score: {grid_search.best_score_:.4f}")

In [None]:
# Final model with the best parameters.
# GridSearchCV was created with the default refit=True, so best_estimator_ has
# already been refitted on the whole training set with the winning parameters.
# Fitting it again would repeat that work and, with random_state fixed,
# produce the same model — so it is used as-is.
best_rf = grid_search.best_estimator_

# Evaluate final model on the held-out test set
y_pred_best = best_rf.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

## 6. Save the Model

In [None]:
# Save the best model together with the scaler it was trained with; inputs must
# be passed through this scaler before prediction.
# Create the output directory first: joblib.dump does not create missing
# parent directories, so a fresh checkout without ../models would fail here.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, '../models/random_forest.pkl')
joblib.dump(scaler, '../models/fraud_scaler.pkl')
print("✅ Model and scaler saved successfully!")