## Pre-trained model

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

In [2]:
# Base URL of the raw CSV files published in the project's GitHub repository.
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Download the four tables in a fixed order and bind them to their names.
df, y_liberal, y_data, X_text = (
    pd.read_csv(url_data + csv_name)
    for csv_name in ('new_df.csv', 'y_liberal.csv', 'y_data.csv', 'X_text.csv')
)

# Plain Python list of the values in the 'Processed' column of X_text.
X_text_list = X_text['Processed'].tolist()

In [3]:
# Pick the compute device first: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The tokenizer and the sequence-classification model are loaded from the
# same checkpoint; the model is moved onto the selected device right away.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')
model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli').to(device)

# Combine X_text and y_data column-wise for processing; only the 'Processed'
# column is read by the loop below.
combined_df = pd.concat([X_text, y_data], axis=1)

# One dict per input text: {'text', 'label', 'probability'} for the winning label.
results_list = []

# Candidate labels; each one is turned into an NLI hypothesis sentence.
# On a tie the label listed first wins.
labels = ['Conservative', 'Liberal']

# Process batches of data
batch_size = 8  # Adjust based on your GPU memory
for i in tqdm(range(0, len(combined_df), batch_size), desc="Processing batches"):
    batch = combined_df.iloc[i:i+batch_size]
    texts = batch['Processed'].tolist()

    # probs_per_label[k][j] is the entailment probability of labels[k] for
    # texts[j]. Keeping one list per label, aligned with `texts`, guarantees
    # that the comparison further down is between the labels of ONE text and
    # never between two different texts.
    probs_per_label = []
    for label in labels:
        # Same hypothesis repeated once per text (premise) in the batch.
        hypotheses = [f"This example is {label}." for _ in texts]
        inputs = tokenizer(texts, hypotheses, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = logits.softmax(dim=1)
            entailment_probs = probs[:, 2]  # Index 2 corresponds to "entailment"

        # Single conversion to Python floats for the whole batch.
        probs_per_label.append(entailment_probs.tolist())

    # For each text, keep the label with the highest entailment probability.
    for j, text in enumerate(texts):
        best = max(range(len(labels)), key=lambda k: probs_per_label[k][j])
        results_list.append({
            'text': text,
            'label': labels[best],
            'probability': probs_per_label[best][j]
        })

# Write the collected predictions to disk as CSV, without the row index.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will fail on any machine where this directory does not exist.
results_csv_path = r'C:\Users\bugat\Prosjekter\Tekstanalyse\git_NLP\Tekstanalyse\git_NLP_data\Bart_results.csv'
results_df = pd.DataFrame(data=results_list)
results_df.to_csv(path_or_buf=results_csv_path, index=False)

Processing batches:   6%|▌         | 100/1607 [1:25:03<35:49:45, 85.59s/it] 