In [16]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Load your CSV file
# A forward-slash separator is accepted on Windows as well as Linux/macOS,
# whereas a backslash separator only resolves on Windows.
file_path = 'Data/combined_prepros.csv'
df = pd.read_csv(file_path)

# Check the first few rows of the dataset
print("Initial Data Sample:\n", df.head())

# Select the data columns for features and labels
# pandas parses empty CSV cells as float NaN; replace them with '' (rather than
# dropping rows) so every feature is a string and X stays aligned with y.
X = df['tweet'].fillna('').astype(str).tolist()
y = df['Label']

Initial Data Sample:
              id                                              tweet  \
0  1.190000e+18  @mention HOPEFULLY NONE ENTITY LOSE SEAT ST GE...   
1  1.190000e+18  @mention Jacob baby way forward go alliance Br...   
2  1.200000e+18  @mention care climate change amp poverty causi...   
3  1.190000e+18  @mention @mention @mention dogs matter rspca p...   
4  1.200000e+18  moved canvassing Bomere Heath Fighting ever la...   

      annotations.supernarrative_1  Label  
0  Political hate and polarisation      9  
1                          Anti-EU      1  
2         Distrust in institutions      4  
3  Political hate and polarisation      9  
4                              NaN      0  


In [4]:
# Hold out part of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test split
    random_state=42,  # fixed seed so the shuffle is repeatable
)

In [5]:

# Peek at the start of the training split to confirm the entries are strings
print("\nFirst 5 Training Texts:\n", X_train[0:5])


First 5 Training Texts:
 ['politics differently Come join PeoplesAssembly @mention tomorrow night party politics talked open equal conversation makes strong community Everybody welcome https', 'Cheers signed PledgeforPubs celebrate promote great beer Support action help pubs thrive represent interests pub goers beer cider drinkers @mention ge2019 info visit https', 'Every Christmas day young volunteers St Joseph church Rammy organise meal Bury Parish Church anyone isolated well vulnerable families homeless place go Christmas Day always great day', 'BBC Demand Jo Swinson included TV debates run General Election Sign Petition https via @mention', '@mention said commit @mention deal know']


In [6]:
# Fetch the tokenizer belonging to the pre-trained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased'
)

In [7]:
# Load the pre-trained DistilBERT encoder used to produce the embeddings
distilbert_model = DistilBertModel.from_pretrained(
    'distilbert-base-uncased'
)

model.safetensors:  70%|#######   | 189M/268M [00:00<?, ?B/s]

In [10]:
# Function to get DistilBERT embeddings for a list of texts
def get_distilbert_embeddings(texts, batch_size=32, max_length=144,
                              model=None, text_tokenizer=None):
    """Return one first-token ([CLS]) embedding per input text.

    Parameters
    ----------
    texts : str or iterable of str
        Texts to embed. A single string is treated as one text; any other
        iterable (list, tuple, pandas Series, ...) is materialised as a list.
    batch_size : int, default 32
        Number of texts tokenized and encoded per forward pass.
    max_length : int, default 144
        Token limit handed to the tokenizer; longer inputs are truncated.
    model : optional
        Encoder to call. Defaults to the notebook-level ``distilbert_model``.
    text_tokenizer : optional
        Tokenizer to call. Defaults to the notebook-level ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per text, in input order. For empty input the
        array has zero rows and ``model.config.hidden_size`` columns (0 columns
        when the model exposes no such attribute).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or an element of ``texts`` is not
        a string (e.g. a NaN left over from a CSV with empty cells).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # Fall back to the notebook globals only when nothing was injected.
    encoder = distilbert_model if model is None else model
    tok = tokenizer if text_tokenizer is None else text_tokenizer

    # Slicing a bare string would cut it into character chunks, and slicing a
    # tuple/Series would not give the tokenizer a list of strings, so
    # normalise to a plain list up front.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    for position, text in enumerate(texts):
        if not isinstance(text, str):
            raise ValueError(
                f"texts[{position}] must be a str, got {type(text).__name__}"
            )

    # np.concatenate rejects an empty list, so answer empty input directly.
    if not texts:
        hidden_size = getattr(getattr(encoder, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Run on whatever device the encoder's weights live on.
    try:
        device = next(encoder.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device("cpu")

    embeddings_list = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize the batch; padding makes every row the same length
        encoded_inputs = tok(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        encoded_inputs = {
            name: tensor.to(device) for name, tensor in encoded_inputs.items()
        }

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = encoder(**encoded_inputs)

        # Keep the hidden state at sequence position 0 (the [CLS] token) and
        # bring it back to host memory before converting to NumPy.
        cls_state = outputs.last_hidden_state[:, 0, :]
        embeddings_list.append(cls_state.detach().cpu().numpy())

    return np.concatenate(embeddings_list, axis=0)

In [11]:
# Embed each split with the helper: training texts first, then test texts
X_train_embeddings = get_distilbert_embeddings(texts=X_train)
X_test_embeddings = get_distilbert_embeddings(texts=X_test)

In [12]:
# Standardize embeddings: fit the scaler on the training split only, then
# apply that same fitted scaler to both splits
scaler = StandardScaler().fit(X_train_embeddings)
X_train_embeddings = scaler.transform(X_train_embeddings)
X_test_embeddings = scaler.transform(X_test_embeddings)

In [13]:
# Fit a logistic-regression classifier on top of the DistilBERT embeddings
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train_embeddings, y=y_train)

In [15]:

# Predict a label for every held-out embedding
y_pred = classifier.predict(X=X_test_embeddings)

In [17]:
# Report the fraction of test labels predicted correctly
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.5375
