In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np


import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



  from .autonotebook import tqdm as notebook_tqdm


In [71]:

# # Preprocess messages
# nltk.download('punkt')
# nltk.download('stopwords')

# # Load dataset
# df = pd.read_csv('ScamDataset', sep='\t', names=['Label', 'message'])

# stemmer = PorterStemmer()
# stop_words = set(stopwords.words('english'))

# def preprocess(text):
#     tokens = word_tokenize(text)
#     stemmed = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
#     return ' '.join(stemmed)

# df['processed_message'] = df['message'].apply(preprocess)
# X = df['processed_message'].tolist()

# df['Label'] = df['Label'].map({'normal': 0, 'fraud': 1})  # Adjust as necessary based on your actual labels
# y= df['Label'].tolist()


In [72]:

# Load dataset
# data = pd.read_csv('GeneratedDataset.csv', delimiter=',', names=['message', 'Label'])
data = pd.read_csv('ScamDataset.csv', delimiter=',', names=['message', 'Label'])
# data = pd.read_csv('data.csv', delimiter=',', names=['message', 'Label'])

# data = pd.read_csv('ScamDataset', delimiter='\t', names=['Label', 'message'])


data['Label'] = data['Label'].map({'normal': 0, 'fraud': 1})

data['Label']
# Displaying the shape of the dataset before removing duplicates
original_shape = data.shape

# Removing duplicate rows
data = data.drop_duplicates()

# Shape after removing duplicates
new_shape = data.shape

original_shape, new_shape
# stemmer = PorterStemmer()
# stop_words = set(stopwords.words('english'))

# def preprocess(text):
#     tokens = word_tokenize(text)
#     stemmed = [stemmer.stem(word) for word in tokens if word.isalpha() and word not in stop_words]
#     return ' '.join(stemmed)

# Count the number of 'fraud' entries
fraud_count = (data['Label'] == 1).sum()
non_fraud_count = (data['Label'] == 0).sum()
print(f'Number of fraud data: {fraud_count}')
print(f'Number of non fraud data: {non_fraud_count}')
print(f"original_shape = ", original_shape)
print(f"new_shape = ", new_shape)

# data['processed_message'] = data['message'].apply(preprocess)
# X = data['processed_message'].tolist()
X = data['message'].tolist()

y= data['Label'].tolist()


Number of fraud data: 1560
Number of non fraud data: 6916
original_shape =  (13331, 2)
new_shape =  (8476, 2)


In [73]:
# Tokenization
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

tokenizer.pad_token = tokenizer.eos_token


encodings = tokenizer(X, truncation=True, padding=True, max_length=512, return_tensors="pt")
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']


In [74]:
# Dataset preparation
dataset = TensorDataset(input_ids, attention_mask, torch.tensor(y))
train_size = int(0.75 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [75]:
# Dataloaders
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [76]:
# Load model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2ForSequenceClassification.from_pretrained('gpt2', pad_token_id=tokenizer.pad_token_id)
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [77]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [78]:
# Training loop
model.train()
for epoch in range(8):  # Number of epochs
    for batch in train_dataloader:
        b_input_ids, b_attention_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_attention_mask = b_attention_mask.to(device)
        b_labels = b_labels.to(device)

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.003605794860050082
Epoch 2, Loss: 0.0032496172934770584
Epoch 3, Loss: 0.00025080342311412096
Epoch 4, Loss: 5.4927902965573594e-05
Epoch 5, Loss: 3.8739268347853795e-05
Epoch 6, Loss: 2.8133331397839356e-06
Epoch 7, Loss: 0.0016191434115171432
Epoch 8, Loss: 9.504611807642505e-05


In [79]:
# Evaluation
model.eval()
predictions, true_labels = [], []
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_attention_mask)

    logits = outputs.logits
    predictions.append(torch.argmax(logits, dim=-1).cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

In [80]:
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)
accuracy = accuracy_score(true_labels, predictions)
print(f"Validation Accuracy: {accuracy}")


Validation Accuracy: 0.9924492685228882


In [81]:
model.save_pretrained('my_gpt_model')

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained('my_gpt_model')

In [10]:
# Example of preparing an input for prediction
texts = ["Hello, I'm calling from the survey department. You've been selected to participate in a paid survey. Can you confirm your bank details for the payment?"]
encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Move tensors to the same device as model
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']

# Predict
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1)

print(predictions)

tensor([1])
