In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('dataset-merged.csv')

In [3]:
df = df.drop('sr', axis=1)
df.columns

Index(['text', 'label', 'wcount'], dtype='object')

In [4]:
df = df.dropna()
df.isnull().sum()

text      0
label     0
wcount    0
dtype: int64

In [5]:
# Preprocess the text data
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

In [6]:
df['text'] = df['text'].apply(preprocess_text)

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [8]:
# Useing BERT tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [9]:
# Tokenize the text
max_len = 100  # Maximum length of the sequences
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors='pt')

In [10]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [11]:
# creating dataset and dataloader to handle batching

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = NewsDataset(X_train_tokens, y_train)
test_dataset = NewsDataset(X_test_tokens, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [12]:
# LSTM Model

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        out = self.fc(hidden)
        return out

In [13]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
output_dim = 2
n_layers = 2
bidirectional = True
dropout = 0.3
vocab_size = tokenizer.vocab_size

# initailaizing the model
model = LSTMClassifier(embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout)

# Move the model to CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [15]:
# Setting up loss function and Adam optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5, verbose=True)



In [16]:
class EarlyStopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = None
        self.early_stop = False
        self.counter = 0

    def __call__(self, val_loss, model):
        score = -val_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        torch.save(model.state_dict(), 'checkpoint.pt')

early_stopping = EarlyStopping(patience=3, delta=0.001)

In [17]:
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += torch.sum(preds == labels).item()
    return total_loss / len(test_loader), correct / len(test_loader.dataset)

In [18]:
n_epochs = 10
for epoch in range(n_epochs):
    train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate_model(model, test_loader, criterion, device)
    print(f'Epoch {epoch+1}, Val Loss: {val_loss}, Val Acc: {val_acc}')
    
    scheduler.step(val_loss)
    early_stopping(val_loss, model)
    
    if early_stopping.early_stop:
        print("Early stopping")
        break

# Load the best model
model.load_state_dict(torch.load('checkpoint.pt'))

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Val Loss: 0.5202626100292912, Val Acc: 0.7474452554744525
Epoch 2, Val Loss: 0.4755208820656494, Val Acc: 0.7710948905109489
Epoch 3, Val Loss: 0.42537654950111, Val Acc: 0.8128467153284672
Epoch 4, Val Loss: 0.41522919414220033, Val Acc: 0.80992700729927
Epoch 5, Val Loss: 0.4355546053223036, Val Acc: 0.8414598540145986
Epoch 6, Val Loss: 0.4305753585089136, Val Acc: 0.8446715328467154
Epoch 7, Val Loss: 0.4650661021608997, Val Acc: 0.8335766423357664
Early stopping


<All keys matched successfully>

In [19]:
# Evaluate the model on the test data
test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Loss: 0.41522919414220033, Test Accuracy: 0.80992700729927
