### **Part: 3 Supervised Machine Learning**

This notebook showcases my attempts at building a supervised machine learning model that can predict the parties from their speeches.

In [1]:
import pandas as pd

Mount Google Drive to Read in the files.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Choose the local or google file path depending on how you are running the notebook.

In [3]:
# df = pd.read_csv("/content/drive/MyDrive/CA4023/ParlVote+.csv")
df = pd.read_csv("../data/ParlVote+.csv")

Import the model type.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

Clean and stem the speeches.

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\FX
[nltk_data]     8320\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\FX
[nltk_data]     8320\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\FX
[nltk_data]     8320\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
stop_words = set(stopwords.words('english'))

# Remove common parliamentary terms
parliamentay_stop_words = {"hon", "would", "people", "member", "right", "friend", "bill", "house", "government", "minister", "gentleman", "lady", "mr", "speaker", "one", "members", "said", "many", "made", "time", "want", "us", "“", "”", "’"}
#lemmatizer = WordNetLemmatizer()
updated_stop_words = stop_words.union(parliamentay_stop_words) # Can slightly degrade machine learning performance
punctuation_translation = str.maketrans('', '', string.punctuation)
stemmer = PorterStemmer()

def clean_text(text):
    # Lowercasing
    text = text.lower()
    
    # Removing Punctuation
    text = text.translate(punctuation_translation)

    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing Stopwords and lemmatization
    #tokens = [lemmatizer.lemmatize(word)) for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]


    return " ".join(tokens)

In [7]:
df['ml_speech'] = df['speech'].apply(lambda x: clean_text(x))

In [8]:
print(df['ml_speech'])

0        right hon gentleman recit catalogu two third c...
1        sure whether occur right hon gentleman late wi...
2        right hon gentleman leav subject prison tell u...
3        thank right hon member penrith border generos ...
4        thank right hon friend give way congratul appo...
                               ...                        
33306    point hon gentleman eu nation given vote scott...
33307    point order madam deputi speaker hope move man...
33308    point order mr speaker today ’ vote lay preced...
33309    point order mr speaker know sometim uncomfort ...
33310    point order mr speaker three half year liber d...
Name: ml_speech, Length: 33311, dtype: object


In [9]:
# Stratified Samplings
y = df['party']

# Use stratified sampling to split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=62, stratify=y)

In [10]:
X_train, X_test, y_train, y_test = train_df['ml_speech'], test_df['ml_speech'], train_df['party'], test_df['party']

In [11]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

### **Logistic Regression**

Use TfidfVectorizer to turn the words into numerical features

In [12]:
tfidf_vectorizer = TfidfVectorizer(max_features=250000, ngram_range=(1,4), stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Train the model and calculate the metrics

250,000 features
ngram range  1-4
remove stop words
stem
max iter 1000
C = 10
is the best so far

In [13]:
#model = LogisticRegression(multi_class="multinomial", max_iter=500, class_weights="balanced")

model = LogisticRegression(multi_class="multinomial", max_iter=1000, C=10)
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.6462554404922708


Classification Report:
                                    precision    recall  f1-score   support

                          alliance       0.00      0.00      0.00         2
                      conservative       0.66      0.77      0.71      2700
                               dup       0.51      0.26      0.34       116
                             green       1.00      0.04      0.08        23
                       independent       0.67      0.09      0.15        46
          independent-conservative       0.00      0.00      0.00         1
       independent-ulster-unionist       0.00      0.00      0.00         2
                            labour       0.64      0.75      0.69      2619
                labourco-operative       0.20      0.01      0.01       156
                  liberal-democrat       0.60      0.24      0.34       572
                       plaid-cymru       0.57      0.12      0.20        67
                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Use Cross-Validation to optimise the model's parameters.

I have commented this out as it takes a long time to run.

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [None]:
# # Create the pipeline with TfidfVectorizer and LogisticRegression
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(max_features=100000, stop_words="english")),
#     ('model', LogisticRegression(multi_class='multinomial'))
# ])

# # Set up the parameter grid
# param_grid = {
#     'tfidf__ngram_range': [(1, 3), (1, 4)],
#     'model__C': [0.1, 1, 10]
# }

# # Use accuracy as the scoring metric
# scorer = make_scorer(accuracy_score)

Perform grid search with 4-fold cross-validation.

In [None]:
# # n_jobs will determine how much of your processor is used
# grid_search = GridSearchCV(pipeline, param_grid, cv=4, scoring=scorer, n_jobs=4)
# grid_search.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:

# # Print the best parameters and corresponding accuracy
# print("Best Parameters: ", grid_search.best_params_)
# print("Best Accuracy: {:.2f}".format(grid_search.best_score_))

# # Make predictions using the best model
# y_pred = grid_search.predict(X_test)

# # Calculate metrics on the test set
# calculate_metrics(y_test, y_pred)


Best Parameters:  {'model__C': 10, 'tfidf__ngram_range': (1, 4)}
Best Accuracy: 0.62
Overall Accuracy: 0.6426534594026715


Classification Report:
                                    precision    recall  f1-score   support

                          alliance       1.00      0.50      0.67         2
                      conservative       0.66      0.76      0.71      2700
                               dup       0.53      0.27      0.36       116
                             green       0.75      0.13      0.22        23
                       independent       0.67      0.13      0.22        46
          independent-conservative       0.00      0.00      0.00         1
       independent-ulster-unionist       0.00      0.00      0.00         2
                            labour       0.64      0.74      0.69      2619
                labourco-operative       0.36      0.03      0.06       156
                  liberal-democrat       0.52      0.27      0.35       572
                

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Class weights can help the model to identify underrepresented classes.

In this case they lower the accuracy, but increase the macro average, recall and f1_score.

In [15]:
weight_model = LogisticRegression(multi_class="multinomial", max_iter=250000, C=10, class_weight="balanced")
weight_model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = weight_model.predict(X_test_tfidf)

calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.625093801590875


Classification Report:
                                    precision    recall  f1-score   support

                          alliance       0.00      0.00      0.00         2
                      conservative       0.69      0.70      0.70      2700
                               dup       0.46      0.41      0.43       116
                             green       0.46      0.26      0.33        23
                       independent       0.29      0.15      0.20        46
          independent-conservative       0.00      0.00      0.00         1
       independent-ulster-unionist       0.00      0.00      0.00         2
                            labour       0.68      0.67      0.67      2619
                labourco-operative       0.20      0.12      0.15       156
                  liberal-democrat       0.38      0.45      0.41       572
                       plaid-cymru       0.40      0.31      0.35        67
                          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### **BILSTM**

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from keras.layers import Dropout
import numpy as np
from keras.metrics import Precision, Recall




Show the nth percentile of speech length to calculate word length of BILSTM input.

In [18]:
# Calculate speech lengths
speech_lengths = df['speech'].apply(lambda x: len(x.split()))

# Calculate the percentile
print(np.percentile(speech_lengths, 60))

581.0


Show the average speech length.

In [19]:
mean_length = df['ml_speech'].apply(lambda x: len(x.split())).mean()
print(mean_length)

330.2887634715259


Do the stratified sampling

In [20]:
# Stratified Sampling
X = df['ml_speech']  # Features
y = df['party']  # Target variable

# Use stratified sampling to split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=62, stratify=y)

Create a validation set.

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=62, stratify = y_train)

Show the size of the vocabulary.

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Vocabulary size is the length of the word index + 1
vocab_size = len(tokenizer.word_index) + 1

print("Vocabulary Size:", vocab_size)

Vocabulary Size: 129589


Tokenize and pad the speeches.

In [23]:
max_words = 25000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

In [24]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [25]:
max_length = 300
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post', truncating='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, padding='post', truncating='post')

Convert party labels to one hot vectors.

In [26]:
label_binarizer = LabelBinarizer()
y_train_one_hot = label_binarizer.fit_transform(y_train)
y_test_one_hot = label_binarizer.transform(y_test)
y_val_one_hot = label_binarizer.transform(y_val)

Replicate Sklearn's balanced class weights

In [27]:
class_weights = {}

for party in np.unique(y_train):
    occurrence = np.bincount(y_train == party)[1]
    class_weights[party] = len(y_train) / (len(np.unique(y_train)) * occurrence)

# Classes have to be represented by their index.
class_indices = {class_name: index for index, class_name in enumerate(label_binarizer.classes_)}
index_class_weights = {}
for party, weight in class_weights.items():
    index_class_weights[class_indices[party]] = weight

class_weights
print(index_class_weights)

{0: 149.89375, 1: 0.1542275439860068, 2: 3.603215144230769, 3: 18.059487951807228, 4: 9.084469696969697, 5: 374.734375, 6: 249.82291666666666, 7: 0.15897099374270868, 8: 2.662411190053286, 9: 0.7287007778317939, 10: 6.193956611570248, 11: 374.734375, 12: 1.4581104085603114, 13: 11.021599264705882, 14: 149.89375, 15: 13.503941441441441}


In [28]:
class_indices = {class_name: index for index, class_name in enumerate(label_binarizer.classes_)}
index_class_weights = {}
for party, weight in class_weights.items():
    index_class_weights[class_indices[party]] = weight

class_weights
print(index_class_weights)

{0: 149.89375, 1: 0.1542275439860068, 2: 3.603215144230769, 3: 18.059487951807228, 4: 9.084469696969697, 5: 374.734375, 6: 249.82291666666666, 7: 0.15897099374270868, 8: 2.662411190053286, 9: 0.7287007778317939, 10: 6.193956611570248, 11: 374.734375, 12: 1.4581104085603114, 13: 11.021599264705882, 14: 149.89375, 15: 13.503941441441441}


Build the BiLSTM

In [29]:
embedding_dim = 200
bilstm_model = Sequential()
bilstm_model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length))
bilstm_model.add(Bidirectional(LSTM(64)))  # return_sequences are needed to use multiple layers
bilstm_model.add(Dropout(0.2))
# bilstm_model.add(LSTM(32))  # Add another LSTM layer
bilstm_model.add(Dense(units=len(label_binarizer.classes_), activation='softmax'))
bilstm_model.compile('adam',loss='categorical_crossentropy', metrics=['accuracy'])





Train the model

In [30]:
#bilstm_model.fit(X_train_pad, y_train_one_hot, epochs=10, batch_size=32, validation_split=0.1, class_weight=index_class_weights)
bilstm_model.fit(X_train_pad, y_train_one_hot, epochs=5, batch_size=32, validation_data=(X_val_pad, y_val_one_hot))

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1b795d6ce10>

Evaluate the model on the test set

In [31]:
# Evaluate the classes on the test set
y_pred_one_hot = bilstm_model.predict(X_test_pad)



In [32]:
# Convert one-hot encoded predictions back to class labels

y_pred = label_binarizer.inverse_transform(y_pred_one_hot)

# Convert true labels to class labels
y_true = label_binarizer.inverse_transform(y_test_one_hot)

# Compute accuracy
calculate_metrics(y_true, y_pred)

Overall Accuracy: 0.4915203361849017


Classification Report:
                                    precision    recall  f1-score   support

                          alliance       0.00      0.00      0.00         2
                      conservative       0.55      0.54      0.55      2700
                               dup       0.27      0.27      0.27       116
                             green       0.00      0.00      0.00        23
                       independent       0.00      0.00      0.00        46
          independent-conservative       0.00      0.00      0.00         1
       independent-ulster-unionist       0.00      0.00      0.00         2
                            labour       0.51      0.62      0.56      2619
                labourco-operative       0.06      0.01      0.01       156
                  liberal-democrat       0.22      0.14      0.17       572
                       plaid-cymru       0.08      0.01      0.02        67
                         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### ChatGPT Comparison

Test the models on the sample used with chat_gpt.

Clean the samples

In [34]:
chatgpt_samples = pd.read_csv("../data/chatgpt_samples.csv")

In [35]:
chatgpt_samples['ml_speech'] = chatgpt_samples['speech'].apply(lambda x: clean_text(x))

Transform the sample using the tfidf_vectorizer

Make sure to use (max_features=250000, ngram_range=(1,4), stop_words="english")

In [36]:
X_chatgpt_tfidf = tfidf_vectorizer.transform(chatgpt_samples['ml_speech'])

Choose one of the logistic regression models.

In [37]:
y_pred = model.predict(X_chatgpt_tfidf)
calculate_metrics(chatgpt_samples['party'], y_pred)

Overall Accuracy: 0.5666666666666667


Classification Report:
                         precision    recall  f1-score   support

           conservative       0.50      1.00      0.67         7
                    dup       1.00      1.00      1.00         1
                 labour       0.54      0.64      0.58        11
     labourco-operative       0.00      0.00      0.00         3
       liberal-democrat       1.00      0.33      0.50         6
            plaid-cymru       0.00      0.00      0.00         1
scottish-national-party       0.00      0.00      0.00         1

               accuracy                           0.57        30
              macro avg       0.43      0.42      0.39        30
           weighted avg       0.55      0.57      0.50        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
y_pred = weight_model.predict(X_chatgpt_tfidf)
calculate_metrics(chatgpt_samples['party'], y_pred)

Overall Accuracy: 0.43333333333333335


Classification Report:
                                    precision    recall  f1-score   support

                      conservative       0.50      0.71      0.59         7
                               dup       0.00      0.00      0.00         1
                            labour       0.67      0.36      0.47        11
                labourco-operative       0.00      0.00      0.00         3
                  liberal-democrat       0.44      0.67      0.53         6
                       plaid-cymru       0.00      0.00      0.00         1
           scottish-national-party       0.00      0.00      0.00         1
social-democratic-and-labour-party       0.00      0.00      0.00         0

                          accuracy                           0.43        30
                         macro avg       0.20      0.22      0.20        30
                      weighted avg       0.45      0.43      0.42        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Convert to BILSTM Input

In [45]:
X_chatgpt_seq = tokenizer.texts_to_sequences(chatgpt_samples['ml_speech'])
y_chatgpt_one_hot = label_binarizer.transform(chatgpt_samples['party'])
X_chatgpt_pad = pad_sequences(X_chatgpt_seq, maxlen=max_length, padding='post', truncating='post')

In [46]:
y_pred_one_hot = bilstm_model.predict(X_chatgpt_pad)

y_pred = label_binarizer.inverse_transform(y_pred_one_hot)

# Convert true labels to class labels
y_true = label_binarizer.inverse_transform(y_chat_one_hot)

# Compute accuracy
calculate_metrics(y_true, y_pred)

Overall Accuracy: 0.4


Classification Report:
                         precision    recall  f1-score   support

           conservative       0.42      0.71      0.53         7
                    dup       1.00      1.00      1.00         1
                 labour       0.36      0.36      0.36        11
     labourco-operative       0.00      0.00      0.00         3
       liberal-democrat       0.25      0.17      0.20         6
            plaid-cymru       0.00      0.00      0.00         1
scottish-national-party       0.50      1.00      0.67         1

               accuracy                           0.40        30
              macro avg       0.36      0.46      0.39        30
           weighted avg       0.33      0.40      0.35        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
