In [3]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Step 1: Read data from fraud and not_fraud folders
def load_conversations_from_files(fraud_dir="fraud", not_fraud_dir="not_fraud"):
    conversations = []
    labels = []
    
    for folder, label in [(fraud_dir, 1), (not_fraud_dir, 0)]:
        if os.path.exists(folder):
            for filename in os.listdir(folder):
                if filename.endswith(".txt"):
                    with open(os.path.join(folder, filename), 'r', encoding='utf-8') as f:
                        conversation = f.read().strip()
                        conversations.append(conversation)
                        labels.append(label)
        else:
            print(f"Warning: Directory '{folder}' not found.")
    
    if not conversations:
        raise ValueError("No conversation files found.")
    
    return pd.DataFrame({'conversation': conversations, 'label': labels})

# Load data
try:
    df = load_conversations_from_files()
    print(f"Loaded {len(df)} conversations from files.")
except ValueError as e:
    print(e)
    df = pd.DataFrame({
        'conversation': [
            "Urgent! Your bank account is compromised. Share your PIN now.",
            "Hi, this is your doctorâ€™s office confirming tomorrowâ€™s appointment."
        ],
        'label': [1, 0]
    })
    print("Using fallback sample data (2 samples).")

# Step 2: Prepare data for BERT
class ConversationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):  # Increased max_len
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['conversation'], df['label'], test_size=0.25, stratify=df['label'], random_state=42
)
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))
print("Fraud in train:", y_train.sum(), "| Not fraud in train:", len(y_train) - y_train.sum())
print("Fraud in test:", y_test.sum(), "| Not fraud in test:", len(y_test) - y_test.sum())

# Create datasets
train_dataset = ConversationDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len=256)
test_dataset = ConversationDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_len=256)

# Step 3: Train the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # Increased from 5
    per_device_train_batch_size=4,  # Reduced for better gradient updates
    per_device_eval_batch_size=4,
    warmup_steps=200,  # Adjusted for longer training
    weight_decay=0.05,  # Increased to prevent overfitting
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-5,  # Explicitly set for better fine-tuning
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {'accuracy': (p.predictions.argmax(-1) == p.label_ids).mean()}
)

trainer.train()
trainer.save_model('./results/checkpoint-best')
print("Model saved to './results/checkpoint-best'.")

Loaded 51 conversations from files.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training samples: 38
Testing samples: 13
Fraud in train: 19 | Not fraud in train: 19
Fraud in test: 7 | Not fraud in test: 6


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7122,0.723762,0.461538
2,0.7158,0.689297,0.461538
3,0.6735,0.635067,0.538462
4,0.5948,0.556011,0.923077
5,0.4887,0.384005,1.0
6,0.3883,0.337122,1.0
7,0.2734,0.283251,0.923077
8,0.2025,0.231979,0.923077
9,0.1541,0.200229,0.923077
10,0.1343,0.087622,1.0


Model saved to './results/checkpoint-best'.


In [4]:
# Step 4: Predict new conversation script with confidence
def predict_conversation_script(file_path, model, tokenizer, max_len=256):
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' not found.")
        return None
    
    with open(file_path, 'r', encoding='utf-8') as f:
        conversation_script = f.read().strip()
    
    encoding = tokenizer(
        conversation_script,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    with torch.no_grad():
        outputs = model(encoding['input_ids'], attention_mask=encoding['attention_mask'])
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1).tolist()[0]
        prediction = logits.argmax().item()
        confidence = probs[prediction] * 100
        fraud_prob = probs[1] * 100  # Fraud probability
        not_fraud_prob = probs[0] * 100  # Not Fraud probability
    return {
        'label': "Fraud" if prediction == 1 else "Not Fraud",
        'confidence': confidence,
        'fraud_prob': fraud_prob,
        'not_fraud_prob': not_fraud_prob,
        'script': conversation_script
    }

# Test with a conversation from a file
test_file = "test_conversation.txt"
result = predict_conversation_script(test_file, model, tokenizer)
if result:
    print(f"\nTest Conversation Script from '{test_file}':\n'{result['script']}'\n")
    print(f"Prediction: {result['label']} (Confidence: {result['confidence']:.2f}%)")
    print(f"Fraud Probability: {result['fraud_prob']:.2f}% | Not Fraud Probability: {result['not_fraud_prob']:.2f}%")


Test Conversation Script from 'test_conversation.txt':
'Hi, this is Amit from India Post.
He said your package is arriving today.
He told me itâ€™s been shipped from Bangalore.
He mentioned that itâ€™s a small parcel for you.
I asked if youâ€™d be home, and he said to confirm.
He said itâ€™ll arrive by 4 PMâ€”track it online!
He added that the tracking is #987654321.
He asked if the address is still correct.
I told him Iâ€™d check with youâ€”he was fine with that.
He said to call 555-5678 if youâ€™re out.
Meanwhile, this is Priya from your bank.
She said thereâ€™s a problem with your account too.
She told me someone tried a â‚¹20,000 withdrawal.
She mentioned that itâ€™s flagged as suspicious.
I asked what to do, and she replied quickly.
She said to send your PIN and card number now.
She warned that your savings are at risk!
She said to reply within 10 minutesâ€”urgent!
She assured me itâ€™s safe once you do.
Act fast, she urgedâ€”donâ€™t lose your money!'

Prediction: Fraud (Confiden