In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
df = pd.read_csv('dataset-merged.csv')

In [23]:
df.columns

Index(['sr', 'text', 'label', 'wcount'], dtype='object')

In [24]:
df = df.drop('sr', axis=1)
df.columns

Index(['text', 'label', 'wcount'], dtype='object')

In [25]:
df = df.dropna()
df.isnull().sum()

text      0
label     0
wcount    0
dtype: int64

In [29]:
# Preprocess the text data
def preprocess_text(text):
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

In [30]:
df['text'] = df['text'].apply(preprocess_text)

In [31]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [32]:
# Useing BERT tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [33]:
# Tokenize the text
max_len = 100  # Maximum length of the sequences
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_len, return_tensors='pt')

In [34]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

In [35]:
# creating dataset and dataloader to handle batching

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = NewsDataset(X_train_tokens, y_train)
test_dataset = NewsDataset(X_test_tokens, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [36]:
# LSTM Model

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
        out = self.fc(hidden)
        return out

In [47]:
# Hyperparameters
embedding_dim = 128
hidden_dim = 256
output_dim = 2
n_layers = 8
bidirectional = True
dropout = 0.3
vocab_size = tokenizer.vocab_size

# initailaizing the model
model = LSTMClassifier(embedding_dim, hidden_dim, vocab_size, output_dim, n_layers, bidirectional, dropout)

# Move the model to CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [48]:
# Setting up loss function and Adam optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [49]:
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += torch.sum(preds == labels).item()
    return total_loss / len(test_loader), correct / len(test_loader.dataset)

In [50]:
n_epochs = 10
for epoch in range(n_epochs):
    train_model(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate_model(model, test_loader, criterion, device)
    print(f'Epoch {epoch+1}, Val Loss: {val_loss}, Val Acc: {val_acc}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch 1, Val Loss: 0.5538217243221071, Val Acc: 0.7191240875912409
Epoch 2, Val Loss: 0.4908814444034188, Val Acc: 0.7693430656934307
Epoch 3, Val Loss: 0.5684072830610805, Val Acc: 0.7170802919708029
Epoch 4, Val Loss: 0.419455757709565, Val Acc: 0.8186861313868613
Epoch 5, Val Loss: 0.4856743061984027, Val Acc: 0.7944525547445256
Epoch 6, Val Loss: 0.41693154184354675, Val Acc: 0.8294890510948905
Epoch 7, Val Loss: 0.3838138275400356, Val Acc: 0.8426277372262774
Epoch 8, Val Loss: 0.4031199230640023, Val Acc: 0.8198540145985401
Epoch 9, Val Loss: 0.3804402712870527, Val Acc: 0.8446715328467154
Epoch 10, Val Loss: 0.38689802380071747, Val Acc: 0.8458394160583942


In [51]:
test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Test Loss: 0.38689802380071747, Test Accuracy: 0.8458394160583942
