In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

https://discuss.pytorch.org/t/logistic-regression-implemented-using-pytorch-performs-worse-than-sklearns-logistic-regression/52447/5


In [3]:
data_file= 'data/dataset.pkl'

In [4]:
#dictionary to map parties to num for tensors
party_dict={'SPÖ':0,'ÖVP':1,'FPÖ':2,'Grüne':3,'LIF':4,'BZÖ':5,'NEOS':6,'STRONACH':7,'PILZ':8,'independent':9}

In [5]:
'''
takes data set and filters out independet speeches to a seperate list,
takes other speeches as list and
according party-labels are mapped to number as a list
'''
def load_data(data_file):
    df = pd.read_pickle(data_file)
    independent_df= df[df['party']=='independent']
    df = df[~(df['party']=='independent')]
    #for testing I am using only part of data
    df= df[2000:3000]
    #print(df['party'])
    texts = df['text'].tolist()
    df['party'] = df['party'].map(party_dict)
    labels = df['party'].tolist()
    test_texts= independent_df['text'].tolist()
    mps = df['speaker'].tolist()
    print(df.value_counts('party'))
    #print(labels)
    return texts, labels, test_texts,mps

In [6]:
texts,labels,test_texts,mps = load_data(data_file)

party
1    329
0    281
2    194
3    188
4      8
dtype: int64


In [33]:
# read lemmatized texts
stanza_texts= pd.read_pickle('data\lemmatized.texts.pkl')
lemmatized_texts= stanza_texts['lemmatized_texts']


In [16]:
class TfidfDataset(Dataset):
    '''
    Takes a list or array of TF-IDF vectors and a list of labels,
    returns tensors of features and labels.
    '''
    def __init__(self, tfidf_vectors, labels):
        self.tfidf_vectors = tfidf_vectors
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        features = self.tfidf_vectors[idx]
        label = self.labels[idx]
        return {
            'features': torch.tensor(features, dtype=torch.float32),  # TF-IDF vector as float tensor
            'label': torch.tensor(label, dtype=torch.long)            # label as long tensor
        }

In [17]:
class TfidfVectorizerWrapper:
    def __init__(self, max_features=5000):
        self.vectorizer = TfidfVectorizer(max_features=max_features)

    def fit_transform(self, texts):
        return self.vectorizer.fit_transform(texts).toarray()

    def transform(self, texts):
        return self.vectorizer.transform(texts).toarray()

In [18]:
class LogRegClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LogRegClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, num_classes)

    def forward(self, features):
        features = features.float()
        # TF-IDF tensor of shape [batch_size, input_dim]
        return self.fc(features)

In [19]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    total_loss = 0.0

    loss_fn = nn.CrossEntropyLoss()

    for batch in data_loader:
        # Move data to the appropriate device
        features = batch['features'].to(device).float()  # Dense input vectors
        labels = batch['label'].to(device).long()        # Class indices (not one-hot)

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(features)  # Should output [batch_size, num_classes]

        # Compute loss (raw logits, no softmax)
        loss = loss_fn(outputs, labels)

        # Backward pass and optimizer step
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Track total loss
        total_loss += loss.item()

    # Average loss over the entire data loader
    avg_loss = total_loss / len(data_loader)
    return avg_loss


In [20]:
from sklearn.metrics import accuracy_score, classification_report
import torch.nn.functional as F  # for softmax

def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    all_probs = []
    
    with torch.no_grad():
        for batch in data_loader:
            features = batch['features'].to(device)  # Use TF-IDF features
            labels = batch['label'].to(device)
            
            outputs = model(features)  # No input_ids or attention_mask here
            
            _, preds = torch.max(outputs, dim=1)
            probs = F.softmax(outputs, dim=1)  # Softmax over class logits
            
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
            all_probs.extend(probs.cpu().tolist())
    
    accuracy = accuracy_score(actual_labels, predictions)
    report = classification_report(actual_labels, predictions)
    return accuracy, report, all_probs


In [34]:
train_texts, val_texts, train_labels, val_labels = train_test_split(lemmatized_texts, labels, test_size=0.2, random_state=42)

In [35]:
print(lemmatized_texts)

0      Dame und Herr ! wir sichern mit der Konjunktur...
1      drei BMWMotor kommen aus Österreich , der Rund...
2      auch in der Zukunft zu sichern . wenn ich ein ...
3      aber ein bemerkenswert Aussage machen , als er...
4      und Herr , so ein dringlich Anfrage lassen ja ...
                             ...                        
995    kaufen , müssen sie mit er schimpfen , weil si...
996    mit Demut vor mein Funktion und in der Bewußts...
997                                                     
998    Faktum . Kollege Cap , sie haben meinen , wir ...
999    sie geben ja der selber in Ihr huschen - pfusc...
Name: lemmatized_texts, Length: 1000, dtype: object


In [26]:
num_epochs= 2
max_length= 128
num_classes = 5
batch_size=2
learning_rate = 2e-5


In [38]:
# Vectorize texts

vectorizer = TfidfVectorizerWrapper(max_features=max_length)
X = vectorizer.fit_transform(lemmatized_texts)
input_dim=X.shape[1]
print(input_dim)

128


In [39]:
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)

In [40]:
# Create datasets & loaders
train_dataset = TfidfDataset(X_train, train_labels)
val_dataset = TfidfDataset(X_val, val_labels)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LogRegClassifier(input_dim=input_dim, num_classes=num_classes)

In [42]:
optimizer = AdamW(model.parameters(), lr=learning_rate)

total_steps = len(train_texts) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [44]:
logreg_loss_graph=[]


# Training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    loss = train(model, train_loader, optimizer, scheduler, device)
    accuracy, report, probs = evaluate(model, val_loader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)
    print(f"Loss: {loss:.4f}")
    logreg_loss_graph.append(loss)



Epoch 1/2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy: 0.1400
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        64
           1       0.00      0.00      0.00        63
           2       0.19      0.46      0.27        41
           3       0.14      0.25      0.18        28
           4       0.04      0.50      0.07         4

    accuracy                           0.14       200
   macro avg       0.07      0.24      0.10       200
weighted avg       0.06      0.14      0.08       200

Loss: 1.6323
Epoch 2/2
Validation Accuracy: 0.1550
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        64
           1       0.00      0.00      0.00        63
           2       0.19      0.54      0.28        41
           3       0.14      0.25      0.18        28
           4       0.06      0.50      0.11         4

    accuracy                           0.15       200
   macro avg       0.08      0.26      0.11       20

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
