In [1]:
import torch
from torch import nn, optim
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch.nn.functional as F
from tqdm import tqdm
from transformers import XLNetTokenizer, XLNetForSequenceClassification

In [2]:
# Read the preprocessed corpus from disk; fields in this file are separated by ';'
DATA_PATH = 'C:/Users/q/Desktop/jupyter/preprocessed_text_and_tag.csv'
df = pd.read_csv(DATA_PATH, sep=';')


In [3]:
# Display the loaded frame (pandas abbreviates the middle rows in the rendered output)
df

Unnamed: 0,processed_text,most_popular_tag
0,глава аджария аслан абашидзе принять решение у...,Мир
1,июль российский железный дорога начать продава...,Общество
2,использование биткойнов оплата товар услуга да...,Финансы
3,число официально подтвердить случай заражение ...,Общество
4,президент россия владимир путин телефон обсуди...,Политика
...,...,...
1271186,облачная прояснениями погода ожидаться петербу...,Культура
1271187,медиапросторам разойтись видео задержание нача...,Россия
1271188,польский сторона граница украина застрять выех...,Украина
1271189,президент россия владимир путин следующий неде...,Митинг


In [4]:
# Keep the original string tags in a dedicated column so this cell can be re-run safely.
# Without it, a second run would fit the encoder on the already-encoded integers and
# label_encoder.classes_ would no longer hold the tag names.
if 'most_popular_tag_original' not in df.columns:
    df['most_popular_tag_original'] = df['most_popular_tag']

# Independent copy of the string tags for later comparison (name kept as 'most_popular_tag')
original_most_popular_tag = df['most_popular_tag_original'].rename('most_popular_tag').copy()

# Encode categorical labels to integer ids; label_encoder.classes_[i] is the tag for id i
label_encoder = LabelEncoder()
df['most_popular_tag'] = label_encoder.fit_transform(original_most_popular_tag)

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'],
    df['most_popular_tag'],
    test_size=0.2,
    random_state=42,
)


In [5]:
# Initialize the XLNet tokenizer from the 'xlnet-base-cased' checkpoint.
# NOTE(review): the processed_text preview above is Cyrillic, while this checkpoint is
# presumably English-centric. Confirm its vocabulary covers the corpus (e.g. run
# tokenizer.tokenize() on a few rows and look for unknown tokens) -- TODO confirm.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')



In [6]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of input texts. It is materialized to a list so that
            items are always looked up by position.
        labels: Iterable of integer class ids, aligned one-to-one with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            padding/truncation keyword arguments used in ``__getitem__`` and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materializing to lists makes self.texts[idx] positional even when a
        # pandas Series with a shuffled index (e.g. after a split) is passed in.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(self.texts)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at position ``idx``.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar ``'labels'`` tensor of dtype ``torch.long``.
        """
        text = self.texts[idx]
        if not isinstance(text, str):
            # Empty CSV cells arrive as float NaN (NaN is the only float that is
            # not equal to itself); treat missing text as an empty string instead
            # of letting the tokenizer fail on a non-string input.
            is_missing = text is None or (isinstance(text, float) and text != text)
            text = '' if is_missing else str(text)

        # Calling the tokenizer directly replaces the deprecated encode_plus();
        # the keyword arguments are unchanged.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() yields 1-D tensors so the DataLoader can stack them into a batch
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }

In [7]:
# Batching / tokenization settings
MAX_LEN = 128      # every example is padded or truncated to this many tokens by TextDataset
BATCH_SIZE = 16
SEED = 42          # same value as the random_state used for the train/test split

# Seed torch's global RNG so the shuffle order of train_loader (and any later random
# initialisation drawn from the same RNG) repeats when the notebook is re-run from the top.
torch.manual_seed(SEED)

# Wrap the splits in datasets; .tolist() hands TextDataset plain positional lists
train_dataset = TextDataset(X_train.tolist(), y_train.tolist(), tokenizer, MAX_LEN)
test_dataset = TextDataset(X_test.tolist(), y_test.tolist(), tokenizer, MAX_LEN)

# Shuffle only the training batches; evaluation order stays fixed
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Set device (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Show which device was selected
device

device(type='cuda')

In [9]:
# Build an XLNet classifier with one output per encoded tag, then move it to the chosen device
num_classes = len(label_encoder.classes_)
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=num_classes,
).to(device)


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
class FocalLoss(nn.Module):
    """Multi-class focal loss built on softmax cross-entropy.

    The per-sample loss is ``alpha * (1 - p_t) ** gamma * CE``, where ``CE`` is the
    cross-entropy of the sample and ``p_t = exp(-CE)`` is the probability the
    model assigns to the true class.

    Args:
        alpha: Scalar multiplier applied to every sample's loss.
        gamma: Focusing exponent; ``gamma=0`` reduces the loss to ``alpha * CE``.
        reduction: ``'mean'`` (default, the previous behaviour), ``'sum'`` or
            ``'none'`` to get the unreduced per-sample losses.

    Raises:
        ValueError: If ``reduction`` is not one of the supported values.
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        if reduction not in ('mean', 'sum', 'none'):
            raise ValueError(
                f"reduction must be 'mean', 'sum' or 'none', got {reduction!r}"
            )
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Compute the focal loss.

        Args:
            inputs: Raw class logits, passed unchanged to ``F.cross_entropy``.
            targets: Class-index targets, passed unchanged to ``F.cross_entropy``.

        Returns:
            A scalar tensor for ``'mean'``/``'sum'``, otherwise the per-sample losses.
        """
        # Per-sample cross-entropy; reduction is deferred until after the focal weighting
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        # CE = -log(p_t), so exp(-CE) recovers the true-class probability
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [11]:
# Training objective (focal loss with its default settings) and AdamW over all model parameters
LEARNING_RATE = 2e-5
criterion = FocalLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)


In [12]:
num_epochs = 5


def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one full pass over ``loader`` and collect loss and predictions.

    Args:
        model: Classifier whose output exposes ``.logits``.
        loader: Iterable of batches with 'input_ids', 'attention_mask' and 'labels'.
        criterion: Callable ``criterion(logits, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        optimizer: When given, the pass trains the model (backward + step);
            when ``None``, the pass is evaluation-only with gradients disabled.
        desc: Optional label for the progress bar.

    Returns:
        Tuple ``(mean_loss, preds, labels)``: the loss averaged over the number of
        batches, and flat lists of predicted and true class ids.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()

    total_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.set_grad_enabled(training):
        for batch in tqdm(loader, desc=desc):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            if training:
                optimizer.zero_grad()

            # Labels are deliberately not passed to the model: the loss comes from
            # `criterion`, so the model's built-in loss would be computed and discarded.
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            loss = criterion(logits, labels)

            if training:
                loss.backward()
                optimizer.step()

            total_loss += loss.item()
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            all_labels.extend(labels.tolist())

    return total_loss / len(loader), all_preds, all_labels


for epoch in range(num_epochs):
    # Training pass
    train_loss, train_preds, train_labels = _run_epoch(
        model, train_loader, criterion, device,
        optimizer=optimizer, desc=f'train {epoch + 1}/{num_epochs}',
    )
    train_accuracy = accuracy_score(train_labels, train_preds)
    train_f1 = f1_score(train_labels, train_preds, average='weighted')

    # Evaluation pass on the held-out split (no optimizer -> no gradient updates)
    val_loss, val_preds, val_labels = _run_epoch(
        model, test_loader, criterion, device,
        desc=f'valid {epoch + 1}/{num_epochs}',
    )
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds, average='weighted')

    print(f'Epoch {epoch+1}/{num_epochs}')
    print(f'Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train F1 Score: {train_f1:.4f}')
    print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {val_f1:.4f}')


100%|██████████████████████████████████████████████████████████████████████████| 63560/63560 [4:25:45<00:00,  3.99it/s]


Epoch 1/5
Train Loss: 2.7221, Train Accuracy: 0.1973, Train F1 Score: 0.0774
Validation Loss: 2.7197, Validation Accuracy: 0.1995, Validation F1 Score: 0.0664


100%|██████████████████████████████████████████████████████████████████████████| 63560/63560 [4:21:48<00:00,  4.05it/s]


Epoch 2/5
Train Loss: 2.7162, Train Accuracy: 0.2002, Train F1 Score: 0.0674
Validation Loss: 2.7159, Validation Accuracy: 0.1995, Validation F1 Score: 0.0664


  7%|█████▏                                                                     | 4348/63560 [18:21<4:09:59,  3.95it/s]


KeyboardInterrupt: 