In [1]:
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw message/label table. Column names are supplied explicitly via
# `names`, so pandas reads the first line of the file as data, not as a header.
data = pd.read_csv('ScamDataset.csv', sep=',', names=['message', 'label'])
original_shape = data.shape

# Drop rows that are exact duplicates of an earlier row.
data = data.drop_duplicates()
new_shape = data.shape

# Display (rows, columns) before and after de-duplication.
(original_shape, new_shape)


((20670, 2), (15241, 2))

In [3]:
# Encode the string labels as integer class ids: 'normal' -> 0, 'fraud' -> 1.
label_to_id = {'normal': 0, 'fraud': 1}

# Only encode while the column is not yet integer-typed. This keeps the cell safe
# to re-run: mapping already-encoded 0/1 values a second time would turn every
# label into NaN.
if not pd.api.types.is_integer_dtype(data['label']):
    encoded = data['label'].map(label_to_id)

    # `map` yields NaN for anything outside the mapping; fail loudly instead of
    # letting NaN labels reach the training code.
    unmapped = data.loc[encoded.isna(), 'label'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected label values (expected 'normal' or 'fraud'): {list(unmapped)!r}"
        )

    data = data.assign(label=encoded.astype('int64'))

data

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Oh k. . I will come tomorrow,0
2,There was an error with your subscription. Ple...,1
3,Well if I'm that desperate I'll just call arma...,0
4,That's the trouble with classes that go well -...,0
...,...,...
20664,Important notice: Your tax refund requires urg...,1
20665,Important notice: Your tax refund requires urg...,1
20666,"amount of $2,798 to cancel your order or to co...",1
20668,Important notice: Your tax refund requires urg...,1


In [4]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# A pad token is required because the messages are later tokenized with padding=True.
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token, so we use the eos token instead

In [5]:
# Preprocess the dataset
def preprocess_data(data, tokenizer, max_length=512):
    """Tokenize the messages and build the matching label tensor.

    Args:
        data: DataFrame with a 'message' column (text) and a 'label' column
            holding integer class ids (0 for normal, 1 for fraud).
        tokenizer: Callable tokenizer. It is invoked with the list of messages
            plus return_tensors='pt', truncation=True, padding=True and
            max_length.
        max_length: Maximum number of tokens kept per message.

    Returns:
        Tuple ``(inputs, labels)``: the tokenizer's output and a 1-D
        ``torch.long`` tensor with one label per row of ``data``.

    Raises:
        ValueError: If any message or label is missing, or if the labels
            cannot be converted to integers.
    """
    # Missing values would otherwise surface much later as a float/NaN label
    # tensor or as an opaque tokenizer error.
    missing_messages = int(data['message'].isna().sum())
    missing_labels = int(data['label'].isna().sum())
    if missing_messages or missing_labels:
        raise ValueError(
            f"Dataset contains {missing_messages} missing message(s) and "
            f"{missing_labels} missing label(s); clean the data before tokenizing."
        )

    inputs = tokenizer(
        data['message'].tolist(),
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # Force an integer tensor: a float label column (e.g. 0.0 / 1.0) would
    # otherwise produce a float tensor, which is not a valid class-index target.
    labels = torch.tensor(data['label'].astype('int64').tolist(), dtype=torch.long)
    return inputs, labels

# Tokenize every message in a single call (default max_length=512) and build the label tensor.
inputs, labels = preprocess_data(data, tokenizer)

In [6]:
# Ensure consistent length
num_samples = len(labels)
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Inputs and labels must line up row for row. Fail loudly on a mismatch rather
# than slicing to `num_samples`, which would silently hide a misalignment.
if input_ids.shape[0] != num_samples or attention_mask.shape[0] != num_samples:
    raise ValueError(
        f"Tokenized inputs ({input_ids.shape[0]} rows, {attention_mask.shape[0]} mask rows) "
        f"do not match the number of labels ({num_samples})."
    )


In [7]:

# Split the data into training and validation sets (80% / 20%).
# Stratify on the label so both splits keep the same normal/fraud ratio; a plain
# random split can skew the class balance of the validation set.
train_input_ids, val_input_ids, train_attention_mask, val_attention_mask, train_labels, val_labels = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42, stratify=labels.numpy())

In [8]:
# Create data loaders
batch_size = 8

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_input_ids, train_attention_mask, train_labels)
val_data = TensorDataset(val_input_ids, val_attention_mask, val_labels)

# Training batches are shuffled; validation batches keep dataset order.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, shuffle=False)

In [9]:
# Load GPT-2 model for sequence classification
# num_labels=2 requests a two-class head (0 = normal, 1 = fraud).
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
# The tokenizer pads with the eos token, so the model config is given the same id.
model.config.pad_token_id = tokenizer.pad_token_id  # Set the pad token ID

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Move the model to GPU if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Last expression of the cell, so the moved model's structure is displayed.
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [11]:
# Set up the optimizer and scheduler
# Number of passes over the training set. The scheduler horizon must match the
# number of epochs the training loop actually runs (3); sizing it for 4 epochs
# meant the learning rate never finished decaying to zero.
epochs = 3

# NOTE(review): transformers' AdamW is deprecated in favour of torch.optim.AdamW;
# consider switching (the two have different default weight_decay/eps values).
optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per training batch, so the total number of
# steps is batches-per-epoch times epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Define loss function
# Not used by the training loop, which reads the loss returned by the model when
# `labels` is passed; kept so the name stays available.
loss_fn = torch.nn.CrossEntropyLoss()



In [12]:
# Training loop
epochs = 3
for epoch in range(epochs):
    # ---- Training pass: one optimizer and one scheduler step per batch ----
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        model.zero_grad()

        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / len(train_dataloader)

    # ---- Validation pass: no gradients, no parameter updates ----
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_eval_loss += loss.item()
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Per-epoch summary: mean per-batch loss on each split.
    print(
        f'Epoch {epoch + 1}',
        f'Training loss: {avg_train_loss}',
        f'Validation loss: {avg_val_loss}',
        sep='\n',
    )

Epoch 1
Training loss: 0.04836317777813867
Validation loss: 0.04412184201874208


KeyboardInterrupt: 

In [None]:
# Evaluate the model
def evaluate_model(model, dataloader, device=None):
    """Run the model over a dataloader and collect predicted and true class ids.

    Args:
        model: Module called as ``model(input_ids, attention_mask=mask)`` whose
            result exposes a ``.logits`` tensor with the class scores on the
            last dimension.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            batches.
        device: Device the inputs are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function does not depend on a notebook-level ``device`` global.

    Returns:
        Tuple ``(predictions, true_labels)``: two flat lists with one entry per
        sample, in dataloader order.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Evaluation mode (e.g. disables dropout). The mode is left as-is afterwards.
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            # Only the model inputs need to live on the model's device; the labels
            # are just collected, so they are never copied to the accelerator.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2]

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

            # Predicted class = index of the largest logit.
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(b_labels.cpu().numpy())

    return predictions, true_labels

# Collect predicted and true class ids for every validation sample.
predictions, true_labels = evaluate_model(model, val_dataloader)

# Print classification report
# target_names follow class-id order: 0 -> 'Normal', 1 -> 'Fraud'.
print(classification_report(true_labels, predictions, target_names=['Normal', 'Fraud']))