# 🚀 Fraud Detection Training on Kaggle

This notebook is optimized for Kaggle's environment with GPU acceleration.

## 📊 Dataset
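- Upload your `final_fraud_detection_dataset.csv` as a Kaggle dataset named `fraud-detection-dataset`, so that it is available under `/kaggle/input/fraud-detection-dataset/`
- The file must contain a `text` column and a `binary_label` column (1 = fraud, 0 = normal)
- The notebook will automatically load and preprocess your data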

## 🎯 Models
- Traditional ML: TF-IDF + Logistic Regression/SVM
- Deep Learning: BERT-based classifier

## ⚡ Kaggle Advantages
- Free GPU access (Tesla P100)
- Pre-installed ML libraries
- Easy dataset upload
- Community sharing

In [None]:
# Install additional packages if needed
!pip install transformers torch --quiet

# Import libraries
import warnings

import numpy as np
import pandas as pd
import torch  # required by the GPU availability check below

# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings('ignore')

print("‚úÖ Environment ready!")

# Report whether PyTorch can see a CUDA device and, if so, which one.
gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")
if gpu_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Load your dataset
# Expected columns: 'text' (raw message) and 'binary_label' (1 = fraud, 0 = normal).
DATA_PATH = '/kaggle/input/fraud-detection-dataset/final_fraud_detection_dataset.csv'

try:
    # Keep the try body minimal: only the read can raise FileNotFoundError.
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("‚ùå Dataset not found. Please upload your CSV file.")
    # Create sample data for demonstration
    print("üìù Using sample data instead...")
    # A small, balanced frame with the same schema as the real dataset so the
    # remaining cells (stratified split, TF-IDF, BERT) can run end to end.
    # Ten rows per class keeps at least two rows of each class in the 20% test split.
    sample_fraud_texts = [
        "Congratulations! You have won a free prize, click here to claim now",
        "Urgent: your account is suspended, verify your password immediately",
        "Send your bank details to receive your lottery winnings today",
        "You have been selected for a cash reward, reply with your card number",
        "Final notice: pay the outstanding fee now to avoid legal action",
        "Your parcel is on hold, confirm payment details at this link",
        "Earn money fast from home, guaranteed income with no risk",
        "We detected unusual activity, confirm your PIN to unlock your card",
        "Claim your tax refund now by entering your account information",
        "Exclusive investment offer, double your money in one week",
    ]
    sample_normal_texts = [
        "Hi, are we still meeting for lunch tomorrow at noon?",
        "Please find the attached report for last quarter",
        "Your appointment has been confirmed for Monday morning",
        "Thanks for your help with the project yesterday",
        "Can you send me the notes from today's class?",
        "The team meeting has been moved to the larger conference room",
        "Happy birthday! Hope you have a wonderful day",
        "I will be home late tonight, do not wait for dinner",
        "The library will be closed on Friday for maintenance",
        "Let me know when you have reviewed the document",
    ]
    df = pd.DataFrame({
        'text': sample_fraud_texts + sample_normal_texts,
        'binary_label': [1] * len(sample_fraud_texts) + [0] * len(sample_normal_texts),
    })
else:
    print(f"‚úÖ Dataset loaded: {len(df)} samples")
    print(f"Columns: {df.columns.tolist()}")
    print(f"Label distribution: {df['binary_label'].value_counts()}")

In [None]:
# Data preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Translate the numeric target into readable class names.
label_names = {0: 'normal', 1: 'fraud'}
df['label'] = df['binary_label'].map(label_names)

# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class ratio the same in both splits, and the fixed seed makes it repeatable.
texts, targets = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
train_distribution = y_train.value_counts().to_dict()
print(f"Training label distribution: {train_distribution}")

In [None]:
# TF-IDF Vectorization
# The vocabulary (capped at 5000 terms, English stop words removed) is learned
# from the training texts only and then reused unchanged for the test texts.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_features = X_train_tfidf.shape[1]
print(f"TF-IDF features: {n_features}")
print("‚úÖ Text vectorization complete!")

In [None]:
# Train traditional ML models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Encode labels
# The encoder is fitted on the training labels and the same mapping is then
# applied to both splits. LabelEncoder orders classes alphabetically, so
# 'fraud' becomes 0 and 'normal' becomes 1.
le = LabelEncoder().fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train_encoded)

# SVM (linear kernel, with probability estimates enabled)
svm_model = SVC(random_state=42, probability=True, kernel='linear')
svm_model.fit(X_train_tfidf, y_train_encoded)

print("‚úÖ Models trained!")

In [None]:
# Evaluate models
# Each fitted classifier is scored on the held-out TF-IDF features.
models = {'Logistic Regression': lr_model, 'SVM': svm_model}

for name in models:
    model = models[name]
    y_pred = model.predict(X_test_tfidf)

    print(f"\nüîç {name} Results:")
    report = classification_report(y_test_encoded, y_pred, target_names=le.classes_)
    print(report)

    # Confusion Matrix
    cm = confusion_matrix(y_test_encoded, y_pred)
    print("Confusion Matrix:")
    print(cm)

In [None]:
# BERT Training (GPU accelerated)
import torch
# AdamW is taken from PyTorch: the copy that used to live in `transformers`
# was deprecated and is no longer exported by recent releases, so importing it
# from there fails on a fresh install.
# NOTE(review): torch.optim.AdamW defaults to weight_decay=0.01 and eps=1e-8,
# while the old transformers version used 0.0 and 1e-6 — pass these explicitly
# to the optimizer if the previous defaults are required.
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    get_linear_schedule_with_warmup
)

class FraudDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Sequence of raw texts. May be a pandas Series (read by position,
            so a shuffled index is fine), a list, or a NumPy array.
        labels: Integer class ids aligned with ``texts``; the same container
            types are accepted.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` whose
            result provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding and truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a Series, array, or list."""
        # pandas objects must be read with .iloc, because after a train/test
        # split their index labels no longer match positions. Plain arrays and
        # lists have no .iloc and are already positional.
        if hasattr(values, 'iloc'):
            return values.iloc[idx]
        return values[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'label'`` (a ``torch.long`` scalar).
        """
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output presumably carries a leading batch axis of
            # size 1; flatten() removes it so the DataLoader can stack items.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("üöÄ Initializing BERT...")
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
# Two output labels: one per class produced by the label encoder.
model = BertForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)

# Move to GPU
model.to(device)
print(f"Using device: {device}")

In [None]:
# Prepare BERT datasets
# Wrap each split in a FraudDataset so texts are tokenized on access.
train_dataset = FraudDataset(X_train, y_train_encoded, tokenizer)
test_dataset = FraudDataset(X_test, y_test_encoded, tokenizer)

# Only the training data is shuffled; the test order stays fixed.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

n_train_batches = len(train_loader)
n_test_batches = len(test_loader)
print(f"Training batches: {n_train_batches}")
print(f"Testing batches: {n_test_batches}")

In [None]:
# Training loop
# The epoch count is named once so the scheduler length, the loop and the
# progress message cannot drift apart.
num_epochs = 3
steps_per_epoch = len(train_loader)

optimizer = AdamW(model.parameters(), lr=2e-5)
# Linear decay of the learning rate over every optimizer step, with no warmup.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=steps_per_epoch * num_epochs,
)

model.train()
for epoch_number in range(1, num_epochs + 1):
    print(f"\nüöÄ Epoch {epoch_number}/{num_epochs}")
    running_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['label'].to(device),
        )

        loss = outputs.loss
        running_loss += loss.item()

        # Backpropagate, update the weights, then advance the LR schedule.
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = running_loss / steps_per_epoch
    print(f"Average loss: {avg_loss:.4f}")

print("‚úÖ BERT training complete!")

In [None]:
# Evaluate BERT model
model.eval()
predictions, true_labels = [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # The predicted class is the index of the largest logit in each row.
        batch_preds = outputs.logits.argmax(dim=1)

        predictions.extend(batch_preds.cpu().numpy())
        # The labels were never moved to the device, so they convert directly.
        true_labels.extend(batch['label'].numpy())

print("\nüéØ BERT Evaluation Results:")
bert_report = classification_report(true_labels, predictions, target_names=le.classes_)
print(bert_report)

# Confusion Matrix
cm = confusion_matrix(true_labels, predictions)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Save models for download
import os

import joblib

# Create output directory
output_dir = '/kaggle/working/models'
os.makedirs(output_dir, exist_ok=True)

# Save traditional ML models together with the fitted vectorizer and label
# encoder, which are needed to preprocess inputs and decode predictions later.
sklearn_artifacts = {
    'logistic_regression.pkl': lr_model,
    'svm.pkl': svm_model,
    'tfidf_vectorizer.pkl': tfidf,
    'label_encoder.pkl': le,
}
for filename, artifact in sklearn_artifacts.items():
    joblib.dump(artifact, f'{output_dir}/{filename}')

# Save BERT model and its tokenizer in separate sub-directories.
model.save_pretrained(f'{output_dir}/bert_model')
tokenizer.save_pretrained(f'{output_dir}/bert_tokenizer')

print("üíæ Models saved to /kaggle/working/models/")
print("Download them from the Output tab!")

# 📊 Results Summary

## 🎯 Performance Comparison
- Compare all models' F1-scores, precision, and recall
- BERT typically performs best but requires more resources

## 💡 Next Steps
1. **Download Models**: Get your trained models from the Output tab
2. **Deploy**: Use the saved models in production
3. **Experiment**: Try different hyperparameters
4. **Share**: Publish your notebook to Kaggle community

## ⚡ Kaggle Tips
- Use GPU accelerator for faster training
- Save models regularly to avoid losing progress
- Monitor memory usage with large datasets
- Use the Discussion forum for questions