In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter



In [3]:
df = pd.read_csv("D:/Intern/DataSets/bbc-text.csv")
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
stop_words = set(stopwords.words('english'))
nlp = spacy.load("en_core_web_sm")

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = text.strip()
    
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    doc = nlp(" ".join(tokens))
    tokens = [tokens.lemma_ for tokens in doc if not tokens.is_stop]
    
    return tokens

In [6]:
df["cleaned_text"] = df["text"].apply(preprocess_text)

In [7]:
all_words = [word for token in df["cleaned_text"] for word in token]
word_freq = Counter(all_words)

In [8]:
word_freq.most_common(10)

[('say', 8248),
 ('year', 3312),
 ('mr', 3005),
 ('people', 2046),
 ('new', 1996),
 ('time', 1585),
 ('good', 1537),
 ('m', 1457),
 ('game', 1453),
 ('world', 1229)]

In [9]:
vocab = {word : idx+2 for idx, (word, _) in enumerate(word_freq.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

In [10]:
def encode_text(tokens, max_len=100):
    return [vocab.get(word, vocab["<UNK>"]) for word in tokens[:max_len]] + [vocab["<PAD>"]] * (max_len - len(tokens))

In [11]:
MAX_LEN = 200
df["encoded_text"] = df["cleaned_text"].apply(lambda x: encode_text(x, MAX_LEN))

In [12]:
le = LabelEncoder()
df["label"] = le.fit_transform(df["category"])

In [13]:
df.head()

Unnamed: 0,category,text,cleaned_text,encoded_text,label
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hand, viewer, home, theatre, syst...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",4
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, leave, book, worldcom, boss, ...","[230, 231, 232, 152, 230, 231, 233, 234, 235, ...",0
2,sport,tigers wary of farrell gamble leicester say ...,"[tiger, wary, farrell, gamble, leicester, rush...","[332, 333, 334, 335, 336, 337, 338, 339, 340, ...",3
3,sport,yeading face newcastle in fa cup premiership s...,"[yeade, face, newcastle, fa, cup, premiership,...","[401, 325, 402, 403, 404, 405, 402, 406, 325, ...",3
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, raid, box, office, ocean, crime, caper...","[539, 540, 62, 541, 539, 542, 543, 544, 545, 5...",1


In [14]:
X_train, X_val, y_train, y_val = train_test_split(
	df["encoded_text"].tolist(),
	df["label"].values,
	test_size=0.2,
	random_state=42
)

In [15]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = np.array(labels)   # always numpy
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return torch.tensor(self.texts[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)


In [16]:
train_ds = NewsDataset(X_train, y_train)
val_ds = NewsDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

In [17]:
class NewsRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, n_layers=1, dropout=0.5):
        super(NewsRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, n_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
model = NewsRNN(
	vocab_size=len(vocab),
	embed_size=128,
	hidden_size=256,
	output_size=len(df["label"].unique()),
	n_layers=2,
	dropout=0.5
).to(device)

In [20]:
criterian = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [21]:
# Convert data to tensors before training
def train_loop(model, train_loader, val_loader, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss, total_correct = 0, 0
        
        for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
            X_batch = torch.tensor(X_batch).to(device)  # Move to device
            y_batch = torch.tensor(y_batch).to(device)  # Move to device
            
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterian(outputs, y_batch)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            total_correct += (outputs.argmax(1) == y_batch).sum().item()
        
        train_acc = total_correct / len(train_loader.dataset)
        
        model.eval()
        val_loss, val_correct = 0, 0
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = torch.tensor(X_batch).to(device)  # Move to device
                y_batch = torch.tensor(y_batch).to(device)  # Move to device
                
                outputs = model(X_batch)
                loss = criterian(outputs, y_batch)
                val_loss += loss.item()
                val_correct += (outputs.argmax(1) == y_batch).sum().item()
        
        val_acc = val_correct / len(val_loader.dataset)
        
        print(f"Epoch {epoch+1}/{epochs}", 
              f"Train Loss: {total_loss/len(train_loader):.4f}", 
              f"Train Acc: {train_acc:.4f}",
              f"Val Loss: {val_loss/len(val_loader):.4f}",
              f"Val Acc: {val_acc:.4f}")


In [22]:
train_loop(model, train_loader, val_loader, epochs=20)

  X_batch = torch.tensor(X_batch).to(device)  # Move to device
  y_batch = torch.tensor(y_batch).to(device)  # Move to device
  X_batch = torch.tensor(X_batch).to(device)  # Move to device
  y_batch = torch.tensor(y_batch).to(device)  # Move to device


Epoch 1/20 Train Loss: 1.5650 Train Acc: 0.2787 Val Loss: 1.5505 Val Acc: 0.3236
Epoch 2/20 Train Loss: 1.4828 Train Acc: 0.3472 Val Loss: 1.5178 Val Acc: 0.3461
Epoch 3/20 Train Loss: 1.3625 Train Acc: 0.4331 Val Loss: 1.4526 Val Acc: 0.3101
Epoch 4/20 Train Loss: 1.1376 Train Acc: 0.5264 Val Loss: 1.2028 Val Acc: 0.5056
Epoch 5/20 Train Loss: 0.8359 Train Acc: 0.6680 Val Loss: 0.9496 Val Acc: 0.6360
Epoch 6/20 Train Loss: 0.5378 Train Acc: 0.8129 Val Loss: 0.6829 Val Acc: 0.7843
Epoch 7/20 Train Loss: 0.3276 Train Acc: 0.8955 Val Loss: 0.7670 Val Acc: 0.7483
Epoch 8/20 Train Loss: 0.1825 Train Acc: 0.9444 Val Loss: 0.5842 Val Acc: 0.8270
Epoch 9/20 Train Loss: 0.1149 Train Acc: 0.9674 Val Loss: 0.6338 Val Acc: 0.8360
Epoch 10/20 Train Loss: 0.0754 Train Acc: 0.9820 Val Loss: 0.6149 Val Acc: 0.8449
Epoch 11/20 Train Loss: 0.0298 Train Acc: 0.9944 Val Loss: 0.6536 Val Acc: 0.8494
Epoch 12/20 Train Loss: 0.0189 Train Acc: 0.9961 Val Loss: 0.6709 Val Acc: 0.8584
Epoch 13/20 Train Loss: 0

In [23]:
def prediction(model, text, vocab, max_len=200):
	model.eval()
	tokens = preprocess_text(text)
	encoded = encode_text(tokens, max_len)
	input_tensor = torch.tensor(encoded, dtype=torch.long).unsqueeze(0).to(device)  # Add batch dimension and move to device
	
	with torch.no_grad():
		output = model(input_tensor)
		predicted_label = output.argmax(1).item()
	
	return le.inverse_transform([predicted_label])[0]

In [24]:
prediction(model, "The government is planning to introduce new economic reforms to boost growth.", vocab)

'politics'

In [25]:
prediction(model, "Stock markets saw a sharp decline as global oil prices dropped.", vocab)

'business'

In [26]:
prediction(model, "The company announced record profits for the second quarter.", vocab)

'sport'

In [27]:
prediction(model, "Music industry revenues are rising due to online streaming services.", vocab)

'tech'