In [1]:
import re
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import spacy
from tqdm import tqdm  # Import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from sklearn.metrics import accuracy_score, classification_report

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Load your dataset (replace 'your_dataset.csv' with the actual file path)
df = pd.read_csv(r'E:\SNU Chennai\projects\NLP project\NEWS classification\news article\news-article-categories.csv')

# Combine 'title' and 'body' columns into a single 'text' column
df['text'] = df['title'] + " " + df['body']

# Drop rows where 'text' is missing
df = df.dropna(subset=['text'])

# Preprocessing function using spaCy
def preprocess_text(text):
    # Process text with spaCy
    doc = nlp(text.lower())
    
    # Remove stopwords and non-alphabetic tokens, then lemmatize
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    
    # Join back into a string
    return " ".join(tokens)

# Apply preprocessing to the 'text' column with tqdm
print("Preprocessing text...")
tqdm.pandas()  # Enable tqdm progress_apply
df['clean_text'] = df['text'].progress_apply(preprocess_text)  # Use progress_apply instead of apply

# Encode labels
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Load pre-trained BERT tokenizer and model
print("Loading pre-trained BERT model...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

  from .autonotebook import tqdm as notebook_tqdm



Using device: cuda
Preprocessing text...


100%|██████████| 6872/6872 [05:48<00:00, 19.72it/s]


Loading pre-trained BERT model...


In [2]:
# Function to count tokens in a single row of clean_text
def count_tokens(text):
    return len(tokenizer.tokenize(text))

# Apply to your preprocessed text
print("Calculating token lengths...")
df['token_count'] = df['clean_text'].progress_apply(count_tokens)

# Show statistics
print(f"\n📊 Max token length: {df['token_count'].max()}")
print(f"📈 95th percentile token length: {df['token_count'].quantile(0.95)}")
print(f"📉 Average token length: {df['token_count'].mean():.2f}")

Calculating token lengths...


100%|██████████| 6872/6872 [00:19<00:00, 360.63it/s]


📊 Max token length: 5515
📈 95th percentile token length: 881.4499999999998
📉 Average token length: 363.28





In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import spacy
from transformers import BertTokenizer, BertModel

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
df = pd.read_csv(r'E:\SNU Chennai\projects\NLP project\NEWS classification\news article\news-article-categories.csv')
df['text'] = df['title'] + " " + df['body']
df = df.dropna(subset=['text'])

# Text preprocessing
nlp = spacy.load("en_core_web_sm")
def preprocess_text(text):
    doc = nlp(text.lower())
    return " ".join([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])

print("Preprocessing text...")
tqdm.pandas()
df['clean_text'] = df['text'].progress_apply(preprocess_text)

# Label encoding
label_encoder = LabelEncoder()
df['category_encoded'] = label_encoder.fit_transform(df['category'])

# Split data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['clean_text'], df['category_encoded'], test_size=0.2, random_state=42
)

# Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

def encode_text(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

train_encodings = encode_text(train_texts)
val_encodings = encode_text(val_texts)

train_labels = torch.tensor(train_labels.values, dtype=torch.long)
val_labels = torch.tensor(val_labels.values, dtype=torch.long)

# Dataset
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model
class BERTBiLSTM(nn.Module):
    def __init__(self, bert_model, num_classes):
        super(BERTBiLSTM, self).__init__()
        self.bert = bert_model
        self.bilstm = nn.LSTM(768, 64, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(128, num_classes)
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_output = outputs.last_hidden_state
        lstm_out, _ = self.bilstm(bert_output)
        final_feature = lstm_out[:, -1, :]
        out = self.dropout(final_feature)
        return self.fc(out)

num_classes = len(label_encoder.classes_)
model = BERTBiLSTM(bert_model, num_classes).to(device)

# Unfreeze final BERT layer and pooler
for name, param in model.bert.named_parameters():
    
    if 'layer.11' in name or 'pooler' in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Training and evaluation
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)

def train_model(model, loader, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

def evaluate_model(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    acc = accuracy_score(all_labels, all_preds)
    report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)
    return acc, report

# Run training and evaluation
train_model(model, train_loader, epochs=5)
acc, report = evaluate_model(model, val_loader)
print(f"\nValidation Accuracy: {acc:.4f}")
print("\nClassification Report:\n", report)

Using device: cuda
Preprocessing text...


100%|██████████| 6872/6872 [06:17<00:00, 18.20it/s]
Epoch 1/5: 100%|██████████| 344/344 [02:56<00:00,  1.95it/s]


Epoch 1, Loss: 2.4524


Epoch 2/5: 100%|██████████| 344/344 [03:14<00:00,  1.77it/s]


Epoch 2, Loss: 1.6357


Epoch 3/5: 100%|██████████| 344/344 [03:07<00:00,  1.83it/s]


Epoch 3, Loss: 1.1939


Epoch 4/5: 100%|██████████| 344/344 [02:59<00:00,  1.91it/s]


Epoch 4, Loss: 1.0257


Epoch 5/5: 100%|██████████| 344/344 [05:09<00:00,  1.11it/s]


Epoch 5, Loss: 0.9274


Evaluating: 100%|██████████| 86/86 [01:03<00:00,  1.34it/s]


Validation Accuracy: 0.7687

Classification Report:
                 precision    recall  f1-score   support

ARTS & CULTURE       0.86      0.87      0.86       205
      BUSINESS       0.70      0.65      0.67       114
        COMEDY       0.76      0.80      0.78        74
         CRIME       0.76      0.82      0.79        57
     EDUCATION       0.88      0.79      0.83       108
 ENTERTAINMENT       0.81      0.75      0.78       100
   ENVIRONMENT       0.72      0.75      0.73        97
         MEDIA       0.61      0.70      0.65        66
      POLITICS       0.80      0.63      0.71       103
      RELIGION       0.83      0.90      0.86       101
       SCIENCE       0.74      0.73      0.73        55
        SPORTS       0.83      0.95      0.88       101
          TECH       0.67      0.79      0.72        84
         WOMEN       0.67      0.56      0.61       110

      accuracy                           0.77      1375
     macro avg       0.76      0.76      0.76   




In [6]:
# Convert back encoded labels to original category names
inv_label_map = {i: label for i, label in enumerate(label_encoder.classes_)}

# Add this function
def print_misclassifications(model, loader, texts, true_labels):
    model.eval()
    misclassified = []
    index = 0
    with torch.no_grad():
        for batch in tqdm(loader, desc="Checking Misclassifications"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask)
            preds = torch.argmax(logits, dim=1)

            for i in range(len(preds)):
                true_label = labels[i].item()
                pred_label = preds[i].item()
                if true_label != pred_label:
                    misclassified.append({
                        'text': texts.iloc[index],
                        'actual': inv_label_map[true_label],
                        'predicted': inv_label_map[pred_label]
                    })
                index += 1

    # Print first few misclassified examples
    print(f"\nTotal misclassifications: {len(misclassified)}\n")
    for i, sample in enumerate(misclassified[:10]):
        print(f"Sample {i+1}:")
        print(f"Text       : {sample['text'][:200]}...")
        print(f"Actual     : {sample['actual']}")
        print(f"Predicted  : {sample['predicted']}\n")

# Call this after evaluation
print_misclassifications(model, val_loader, val_texts.reset_index(drop=True), val_labels)


Checking Misclassifications: 100%|██████████| 86/86 [01:09<00:00,  1.24it/s]


Total misclassifications: 318

Sample 1:
Text       : johns chair board woman catalyst president ceo deborah gillis arrive london year ago astonish find have animate conversation male custom officer woman board unlike united states deborah delighted issu...
Actual     : WOMEN
Predicted  : BUSINESS

Sample 2:
Text       : late paycheck mess life sharisse tracey know deep level family live paycheck paycheck check come pay bill saving speak easy ignore reality check post regularly fully understand live paycheck paycheck ...
Actual     : WOMEN
Predicted  : EDUCATION

Sample 3:
Text       : asian team participate global robotic challenge xinxin zhang research intern east west center washington graduate student public policy university chicago note article originally appear east west cent...
Actual     : WOMEN
Predicted  : EDUCATION

Sample 4:
Text       : charge assumption race gender close colleague observe college head student display behavior disrespectful likely different dare leader d


