In [1]:
import pandas as pd

In [2]:
pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load the dataset.

In [5]:
df = pd.read_csv("/content/drive/MyDrive/CA4023/ParlVote+.csv")

In [6]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

BERT only accepts integer labels, so the parties are mapped to integers.

In [7]:
# Give every party an integer class id, numbered in the order that
# `unique()` returns them.
unique_parties = df['party'].unique()
label_dict = dict(zip(unique_parties, range(len(unique_parties))))

# Show the party -> id mapping
print(label_dict)

{'labour': 0, 'labourco-operative': 1, 'scottish-national-party': 2, 'conservative': 3, 'liberal-democrat': 4, 'plaid-cymru': 5, 'uup': 6, 'social-democratic-and-labour-party': 7, 'independent': 8, 'dup': 9, 'independent-conservative': 10, 'independent-ulster-unionist': 11, 'respect': 12, 'ukip': 13, 'green': 14, 'alliance': 15}


Create the training, testing and validation sets.

In [8]:
# Only the speech text (feature) and the party (label) are needed.
ml_df = df[['speech', 'party']]
X, y = ml_df['speech'], ml_df['party']

# Step 1: hold out 20% of all rows as the test set, stratified by party.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=62,
    stratify=y,
)

# Step 2: carve 20% of the remaining rows off as the validation set,
# again stratified. Overall this gives roughly 64% train / 16% val / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=62,
    stratify=y_train,
)


The pipeline below expects the feature column to be called "text" and the label column to be called "label".

In [9]:
def _as_text_label_frame(texts, labels):
    """Pair speeches with their party labels under the column names "text" and "label"."""
    return pd.DataFrame({"text": texts, "label": labels})


train_df = _as_text_label_frame(X_train, y_train)
test_df = _as_text_label_frame(X_test, y_test)
val_df = _as_text_label_frame(X_val, y_val)

Convert the dataframes to HuggingFace datasets.

In [10]:
# Wrap each split's DataFrame in a HuggingFace Dataset.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, val_df)
)

In [11]:
# Drop the "__index_level_0__" column from every split.
train_dataset, test_dataset, val_dataset = (
    split.remove_columns("__index_level_0__")
    for split in (train_dataset, test_dataset, val_dataset)
)

In [12]:
print(len(train_dataset), len(test_dataset), len(val_dataset))

21318 6663 5330


In [13]:
from transformers import AutoModelForSequenceClassification

# Size the classification head from the label mapping instead of a hard-coded
# 16, so the model always has exactly one output per party in `label_dict`.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_dict),
)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenize the speeches.

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(dataset, tokenizer, label_map=None):
  """Tokenize every row of `dataset` and convert its label to an integer id.

  Args:
    dataset: iterable of mapping-like rows, each with a "text" and a "label" key.
    tokenizer: callable invoked as
      `tokenizer(text, padding="max_length", truncation=True)`; it must return
      a mapping whose entries are merged into the output row.
    label_map: mapping from the raw label to its integer id. When omitted, the
      notebook-level `label_dict` is used.

  Returns:
    A list with one new dict per input row: the original fields, the
    tokenizer's fields, and "label" replaced by its integer id.

  Raises:
    KeyError: if a row's label is not present in the label mapping.
  """
  if label_map is None:
    label_map = label_dict
  tokenized_dataset = []
  for item in dataset:
    encoded = tokenizer(item["text"], padding="max_length", truncation=True)
    # Build a fresh row instead of mutating `item`, so the caller's data is
    # left untouched and the function can safely be called again on it.
    row = dict(item)
    row.update(encoded)
    row['label'] = label_map[item['label']]
    tokenized_dataset.append(row)
  return tokenized_dataset

# Tokenize the three splits in the order train, validation, test.
tokenized_train, tokenized_val, tokenized_test = (
    tokenize_dataset(split, tokenizer)
    for split in (train_dataset, val_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [15]:
from transformers import TrainingArguments

# Write checkpoints to "test_trainer", evaluate after every epoch, train for 3 epochs.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    num_train_epochs=3,
)

In [16]:
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Return the classification accuracy for one evaluation pass.

    Args:
        eval_pred: a pair `(logits, labels)`; the predicted class of each row
            is the argmax of `logits` over the last axis.

    Returns:
        dict with a single key "accuracy" holding the fraction of rows whose
        predicted class equals its label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with NumPy so evaluation does not rely on
    # the metric object created by the deprecated `datasets.load_metric`.
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

My attempt to use class weights.


In [47]:
# Balanced class weights, using the same formula as scikit-learn:
#     weight(party) = n_samples / (n_classes * n_samples_of_party)
n_train = len(y_train)
n_parties = len(np.unique(y_train))

class_weights = {}
for party in np.unique(train_dataset['label']):
    party_count = np.count_nonzero(y_train == party)
    class_weights[party] = float(n_train / (n_parties * party_count))

# Re-key the weights by integer class id instead of party name.
updated_class_weights = {label_dict[name]: weight for name, weight in class_weights.items()}
print(updated_class_weights)

# Flatten to a list ordered by class id, so position i holds the weight of class i.
weights_list = [weight for _, weight in sorted(updated_class_weights.items())]
print(weights_list)

{15: 148.04166666666666, 3: 0.15422791989813636, 9: 3.6010135135135135, 14: 18.00506756756757, 8: 9.125856164383562, 10: 444.125, 11: 222.0625, 0: 0.15897565922920892, 1: 2.659431137724551, 4: 0.7284718425369054, 5: 6.197093023255814, 12: 333.09375, 2: 1.4593373493975903, 7: 11.011363636363637, 13: 148.04166666666666, 6: 13.595663265306122}
[0.15897565922920892, 2.659431137724551, 1.4593373493975903, 0.15422791989813636, 0.7284718425369054, 6.197093023255814, 13.595663265306122, 11.011363636363637, 9.125856164383562, 3.6010135135135135, 444.125, 222.0625, 333.09375, 148.04166666666666, 18.00506756756757, 148.04166666666666]


Create a CustomTrainer with a weighted loss function.

In [48]:
import torch
from transformers import Trainer
from torch import nn

class CustomTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy.

    The per-class weights are read from the notebook-level `weights_list`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict; its "labels" entry is popped and the rest is
                passed to the model as keyword arguments.
            return_outputs: when true, return `(loss, outputs)` instead of `loss`.
            num_items_in_batch: accepted and ignored. Newer `Trainer` versions
                pass this keyword, so the override must tolerate it.

        Returns:
            The loss tensor, or `(loss, outputs)` if `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weights on the logits' device and dtype so the loss also
        # works when the model is wrapped (no `.device` attribute) or runs in
        # reduced precision.
        weight_tensor = torch.tensor(weights_list, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weight_tensor)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

Set the model parameters.

Choose between the unweighted and weighted trainer.

In [49]:
from transformers import Trainer

# Unweighted baseline: the stock Trainer, using the model's built-in loss.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [50]:
# from transformers import Trainer

# trainer = CustomTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_val,
#     compute_metrics = compute_metrics
# )

SyntaxError: ignored

Train the model.

In [51]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.315,1.255088,0.512008
2,1.1525,1.165745,0.55666
3,0.9308,1.220331,0.565291


TrainOutput(global_step=7995, training_loss=1.162077277149537, metrics={'train_runtime': 2230.9923, 'train_samples_per_second': 28.666, 'train_steps_per_second': 3.584, 'total_flos': 1.6829119594856448e+16, 'train_loss': 1.162077277149537, 'epoch': 3.0})

In [83]:
trainer.save_model("/content/drive/MyDrive/CA4023/bert")

In [53]:
trainer.evaluate(tokenized_test)

{'eval_loss': 1.196746826171875,
 'eval_accuracy': 0.579468707789284,
 'eval_runtime': 67.9908,
 'eval_samples_per_second': 97.999,
 'eval_steps_per_second': 12.252,
 'epoch': 3.0}

In [54]:
predictions = trainer.predict(tokenized_test)

In [55]:
print(predictions)

PredictionOutput(predictions=array([[ 4.9581633 ,  2.0783215 , -0.14955413, ..., -3.9742467 ,
        -1.971783  , -4.3814197 ],
       [ 3.15072   ,  0.7886949 , -0.03045169, ..., -3.596786  ,
        -1.6494777 , -4.144554  ],
       [ 3.9917543 ,  0.63038516, -1.3032236 , ..., -3.797861  ,
        -1.8224882 , -3.7930496 ],
       ...,
       [ 1.6123435 , -1.2843546 , -1.1176488 , ..., -3.315163  ,
        -1.491772  , -3.1000147 ],
       [ 5.788758  ,  2.4049222 ,  0.19824277, ..., -3.8444047 ,
        -1.5294923 , -3.827639  ],
       [ 3.937896  ,  1.1568533 , -0.868874  , ..., -4.0261736 ,
        -2.1152086 , -4.6193185 ]], dtype=float32), label_ids=array([0, 2, 0, ..., 3, 0, 0]), metrics={'test_loss': 1.196746826171875, 'test_accuracy': 0.579468707789284, 'test_runtime': 70.7997, 'test_samples_per_second': 94.111, 'test_steps_per_second': 11.766})


Retrieve the predictions.

In [56]:
def get_predictions_by_type(dataset, prediction_output=None):
  """Pair each row's true label with the class predicted for it.

  Args:
    dataset: iterable of rows, each with an integer "label" entry, in the same
      order as the rows that were passed to the predictor.
    prediction_output: object whose `.predictions` attribute holds one row of
      logits per dataset row. When omitted, the notebook-level `predictions`
      variable is used.

  Returns:
    `(y_pred, y_test)`: the list of predicted class ids (argmax of each logits
    row) and the list of true labels.

  Raises:
    ValueError: if the number of logits rows differs from the number of
      dataset rows, which means the predictions belong to another dataset.
  """
  if prediction_output is None:
    # Backward-compatible default: fall back to the notebook's global.
    prediction_output = predictions
  logits = np.asarray(prediction_output.predictions)
  y_test = [item['label'] for item in dataset]
  if len(y_test) != len(logits):
    raise ValueError(
        f"dataset has {len(y_test)} rows but predictions has {len(logits)}; "
        "re-run trainer.predict on this dataset first"
    )
  if not y_test:
    return [], []
  y_pred = [int(class_id) for class_id in np.argmax(logits, axis=-1)]
  return y_pred, y_test

y_pred, y_test = get_predictions_by_type(tokenized_test)

In [57]:
print(y_test)

[0, 2, 0, 3, 3, 4, 0, 0, 0, 3, 3, 0, 0, 0, 2, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 2, 3, 4, 0, 3, 3, 0, 3, 3, 3, 0, 0, 0, 3, 3, 4, 0, 0, 4, 3, 0, 0, 0, 0, 3, 3, 1, 3, 3, 0, 5, 0, 3, 0, 0, 3, 0, 3, 4, 0, 3, 4, 0, 0, 1, 0, 3, 3, 0, 3, 0, 0, 4, 0, 3, 3, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 3, 3, 0, 4, 4, 0, 4, 3, 3, 1, 0, 3, 0, 0, 0, 3, 4, 3, 3, 0, 0, 4, 3, 0, 0, 0, 5, 3, 3, 0, 3, 3, 3, 0, 3, 3, 3, 4, 3, 3, 0, 3, 3, 3, 0, 0, 3, 3, 0, 1, 3, 4, 3, 0, 3, 4, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 3, 0, 4, 5, 0, 0, 3, 9, 4, 2, 0, 4, 3, 0, 4, 0, 3, 3, 0, 3, 0, 0, 0, 4, 3, 0, 4, 0, 3, 0, 7, 3, 3, 0, 0, 3, 0, 0, 2, 3, 3, 3, 4, 3, 0, 4, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 2, 3, 3, 4, 4, 3, 3, 3, 3, 0, 0, 3, 0, 3, 0, 3, 1, 2, 3, 0, 4, 2, 3, 0, 0, 3, 3, 3, 3, 3, 4, 3, 0, 4, 3, 3, 3, 0, 9, 0, 2, 9, 0, 5, 3, 0, 0, 3, 1, 0, 0, 0, 0, 3, 2, 0, 3, 3, 0, 3, 0, 0, 0, 3, 0, 3, 3, 3, 8, 0, 0, 3, 0, 3, 3, 4, 0, 8, 3, 1, 3, 0, 0, 3, 0, 2, 0, 9, 0, 2, 0, 3, 0, 0, 3, 0, 3, 0, 3, 3, 0, 0, 3, 0, 3, 0, 4, 4, 0, 0, 4, 0, 0, 3, 0, 0, 0, 3, 3, 0, 

In [58]:
from sklearn.metrics import accuracy_score, classification_report

Get the party names back from the integers.

In [59]:
int_dict = {v: k for k, v in label_dict.items()}

In [60]:
print(int_dict)

{0: 'labour', 1: 'labourco-operative', 2: 'scottish-national-party', 3: 'conservative', 4: 'liberal-democrat', 5: 'plaid-cymru', 6: 'uup', 7: 'social-democratic-and-labour-party', 8: 'independent', 9: 'dup', 10: 'independent-conservative', 11: 'independent-ulster-unionist', 12: 'respect', 13: 'ukip', 14: 'green', 15: 'alliance'}


In [71]:
# Map integer class ids back to party names. Entries that are already strings
# are kept as they are, so re-running this cell is a no-op rather than a
# KeyError.
y_test = [label if isinstance(label, str) else int_dict[label] for label in y_test]
y_pred = [label if isinstance(label, str) else int_dict[label] for label in y_pred]

In [62]:
print(y_test)

['labour', 'scottish-national-party', 'labour', 'conservative', 'conservative', 'liberal-democrat', 'labour', 'labour', 'labour', 'conservative', 'conservative', 'labour', 'labour', 'labour', 'scottish-national-party', 'conservative', 'conservative', 'conservative', 'labour', 'labour', 'labour', 'conservative', 'conservative', 'conservative', 'conservative', 'scottish-national-party', 'conservative', 'liberal-democrat', 'labour', 'conservative', 'conservative', 'labour', 'conservative', 'conservative', 'conservative', 'labour', 'labour', 'labour', 'conservative', 'conservative', 'liberal-democrat', 'labour', 'labour', 'liberal-democrat', 'conservative', 'labour', 'labour', 'labour', 'labour', 'conservative', 'conservative', 'labourco-operative', 'conservative', 'conservative', 'labour', 'plaid-cymru', 'labour', 'conservative', 'labour', 'labour', 'conservative', 'labour', 'conservative', 'liberal-democrat', 'labour', 'conservative', 'liberal-democrat', 'labour', 'labour', 'labourco-ope

In [63]:
# Calculate overall accuracy
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {overall_accuracy}\n")

# Calculate precision, recall, and F1-score for each class
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Overall Accuracy: 0.579468707789284


Classification Report:
                                    precision    recall  f1-score   support

                          alliance       0.00      0.00      0.00         2
                      conservative       0.62      0.66      0.64      2700
                               dup       0.33      0.41      0.36       116
                             green       0.00      0.00      0.00        23
                       independent       0.00      0.00      0.00        46
          independent-conservative       0.00      0.00      0.00         1
       independent-ulster-unionist       0.00      0.00      0.00         2
                            labour       0.59      0.70      0.64      2619
                labourco-operative       0.00      0.00      0.00       156
                  liberal-democrat       0.33      0.16      0.21       572
                       plaid-cymru       0.31      0.22      0.26        67
                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### ChatGPT Comparison

In [64]:
chatgpt_samples = pd.read_csv("/content/drive/MyDrive/CA4023/chatgpt_samples.csv")

Tokenize the ChatGPT samples.

In [65]:
# Rename the columns to the "text"/"label" names used for the other splits,
# then convert and tokenize exactly as for the ParlVote+ splits.
chatgpt_df = chatgpt_samples[['speech', 'party']].rename(
    columns={'speech': 'text', 'party': 'label'}
)
chatgpt_dataset = Dataset.from_pandas(chatgpt_df)
tokenized_chatgpt = tokenize_dataset(chatgpt_dataset, tokenizer)

In [66]:
from transformers import BertConfig, BertModel
# model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/CA4023/bert")

In [68]:
predictions = trainer.predict(tokenized_chatgpt)

In [77]:
y_pred, y_test = get_predictions_by_type(tokenized_chatgpt)

In [78]:
print(y_test)

[0, 1, 0, 0, 0, 0, 4, 1, 0, 0, 3, 2, 4, 1, 0, 3, 4, 3, 9, 3, 5, 4, 4, 0, 3, 0, 0, 3, 3, 4]


In [79]:
# Invert the party -> id mapping, then translate ids back to party names.
# Entries that are already strings are kept as they are, so re-running this
# cell is a no-op rather than a KeyError.
int_dict = {v: k for k, v in label_dict.items()}
y_test = [label if isinstance(label, str) else int_dict[label] for label in y_test]
y_pred = [label if isinstance(label, str) else int_dict[label] for label in y_pred]

In [81]:
# Calculate overall accuracy
overall_accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {overall_accuracy}\n")

# Calculate precision, recall, and F1-score for each class
report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report)

Overall Accuracy: 0.43333333333333335


Classification Report:
                         precision    recall  f1-score   support

           conservative       0.43      0.86      0.57         7
                    dup       1.00      1.00      1.00         1
                 labour       0.45      0.45      0.45        11
     labourco-operative       0.00      0.00      0.00         3
       liberal-democrat       0.50      0.17      0.25         6
            plaid-cymru       0.00      0.00      0.00         1
scottish-national-party       0.00      0.00      0.00         1

               accuracy                           0.43        30
              macro avg       0.34      0.35      0.33        30
           weighted avg       0.40      0.43      0.38        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
