In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import re

nltk.download('stopwords')

# Read the dataset
df = pd.read_csv("C:/Users/wadev/OneDrive/Desktop/viplavi/BBK Folder/Semester-3/Natural Language Processing/NLP Project/NLP Project/wade-viplavi-13922741-NLP-cw24/wade-viplavi-13922741-NLP-cw24\p2-texts\hansard40000.csv")
# Print the column names to verify
print(df.columns)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wadev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['speech', 'party', 'constituency', 'date', 'speech_class',
       'major_heading', 'year', 'speakername'],
      dtype='object')


In [2]:
# Preprocess the dataframe
df['party'] = df['party'].replace('Labour (Co-op)', 'Labour')
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39995  I totally agree with everything that the right...       Speaker   
39996  Message to attend the Lords Commissioners deli...           NaN   
39997  I have to acquaint the House that the House ha...       Speaker   
39998  I have further to acquaint the House that the ...       Speaker   
39999  The Commission was also for proroguing this pr...       Speaker   

                    constituency        date speech_class  \
0               Portsmouth South  2020-09-14      

In [3]:
# Filter to keep only the four most common parties (excluding 'Speaker')
top_parties = df[df['party'] != 'Speaker']['party'].value_counts().nlargest(4).index
df = df[df['party'].isin(top_parties)]
print(df['party'].unique()
     )

['Labour' 'Conservative' 'Scottish National Party' 'Liberal Democrat']


In [4]:
# Keep only rows where 'speech_class' is 'Speech'
df = df[df['speech_class'] == 'Speech']
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39985  I will answer my hon. Friend. East West Rail, ...  Conservative   
39990  The hon. Gentleman is absolutely right to poin...  Conservative   
39991  Cutting-edge maritime projects such as the Hol...  Conservative   
39992  My hon. Friend is a brilliant champion of conn...  Conservative   
39994  On a point of order, Mr Speaker. As a further ...  Conservative   

                    constituency        date speech_class      major_heading  \
0               Portsmouth Sout

In [5]:
# Remove speeches with less than 1500 characters
df = df[df['speech'].str.len() >= 1500]
print(df.head())

                                                speech  \
99   I am delighted to announce that last Friday we...   
100  I thank the Secretary of State for advance sig...   
101  After the right hon. Lady’s congratulations to...   
104  I congratulate the Secretary of State. I recog...   
188  I beg to move, That the Bill be now read a Sec...   

                       party                  constituency        date  \
99              Conservative            South West Norfolk  2020-09-14   
100                   Labour  Islington South and Finsbury  2020-09-14   
101             Conservative            South West Norfolk  2020-09-14   
104  Scottish National Party                   Dundee East  2020-09-14   
188             Conservative    Uxbridge and South Ruislip  2020-09-14   

    speech_class                        major_heading  year       speakername  
99        Speech           Japan Free Trade Agreement  2020   Elizabeth Truss  
100       Speech           Japan Free Trade Ag

In [6]:
# Check for NaN values in the DataFrame
print(df.isna().sum())

speech           0
party            0
constituency     2
date             0
speech_class     0
major_heading    0
year             0
speakername      0
dtype: int64


In [7]:
# Remove rows with NaN values
df = df.dropna()
df

Unnamed: 0,speech,party,constituency,date,speech_class,major_heading,year,speakername
99,I am delighted to announce that last Friday we...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
100,I thank the Secretary of State for advance sig...,Labour,Islington South and Finsbury,2020-09-14,Speech,Japan Free Trade Agreement,2020,Emily Thornberry
101,After the right hon. Lady’s congratulations to...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
104,I congratulate the Secretary of State. I recog...,Scottish National Party,Dundee East,2020-09-14,Speech,Japan Free Trade Agreement,2020,Stewart Hosie
188,"I beg to move, That the Bill be now read a Sec...",Conservative,Uxbridge and South Ruislip,2020-09-14,Speech,United Kingdom Internal Market Bill,2020,Boris Johnson
...,...,...,...,...,...,...,...,...
39826,My right hon. Friend raises a question of cons...,Conservative,North East Somerset,2021-04-28,Speech,Amendments to the Independent Complaints and G...,2021,Jacob Rees-Mogg
39827,"Before we move on to the other motions, I too ...",Conservative,Ribble Valley,2021-04-28,Speech,Amendments to the Independent Complaints and G...,2021,Nigel Evans
39834,"Thank you, Mr Deputy Speaker, and I am very gr...",Conservative,South West Bedfordshire,2021-04-28,Speech,National Minimum Wage Enforcement,2021,Andrew Selous
39835,I congratulate my hon. Friend the Member for S...,Conservative,Sutton and Cheam,2021-04-28,Speech,National Minimum Wage Enforcement,2021,Paul Scully


In [8]:
# Verify that there are no NaN values
print(df.isna().sum())

speech           0
party            0
constituency     0
date             0
speech_class     0
major_heading    0
year             0
speakername      0
dtype: int64


In [9]:
# Print the dimensions of the resulting dataframe
print(df.shape)

(6472, 8)


In [10]:
# Vectorize the speeches using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=4000)
X = vectorizer.fit_transform(df['speech'])
y = df['party']
print(y)

99                  Conservative
100                       Labour
101                 Conservative
104      Scottish National Party
188                 Conservative
                  ...           
39826               Conservative
39827               Conservative
39834               Conservative
39835               Conservative
39837               Conservative
Name: party, Length: 6472, dtype: object


In [11]:
# Split the data into a train and test set using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=99)

In [12]:
# Train a RandomForest classifier
rf_clf = RandomForestClassifier(n_estimators=400, random_state=99)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print("RandomForest Classifier")
print("F1 Score:", f1_score(y_test, y_pred_rf, average='macro'))
print(classification_report(y_test, y_pred_rf))

RandomForest Classifier
F1 Score: 0.43868697020125846
                         precision    recall  f1-score   support

           Conservative       0.71      0.99      0.83       731
                 Labour       0.81      0.51      0.63       402
       Liberal Democrat       0.00      0.00      0.00        50
Scottish National Party       0.91      0.18      0.30       112

               accuracy                           0.73      1295
              macro avg       0.61      0.42      0.44      1295
           weighted avg       0.73      0.73      0.69      1295



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Train an SVM classifier
svm_clf = SVC(kernel='linear', random_state=99)
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
print("SVM Classifier")
print("F1 Score:", f1_score(y_test, y_pred_svm, average='macro'))
print(classification_report(y_test, y_pred_svm))

SVM Classifier
F1 Score: 0.6021347544436162
                         precision    recall  f1-score   support

           Conservative       0.82      0.93      0.88       731
                 Labour       0.77      0.76      0.76       402
       Liberal Democrat       0.80      0.08      0.15        50
Scottish National Party       0.84      0.50      0.63       112

               accuracy                           0.81      1295
              macro avg       0.81      0.57      0.60      1295
           weighted avg       0.81      0.81      0.79      1295



In [16]:
# Adjust the Tfidfvectorizer parameters to include unigrams, bi-grams, and tri-grams
vectorizer = TfidfVectorizer(stop_words='english', max_features=4000, ngram_range=(1, 3))
X = vectorizer.fit_transform(df['speech'])
y = df['party']

In [17]:
# Split the data into a train and test set using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=99)

In [18]:
# Train and evaluate classifiers again with the new vectorizer settings
# RandomForest Classifier
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print("RandomForest Classifier with ngrams")
print("F1 Score:", f1_score(y_test, y_pred_rf, average='macro'))
print(classification_report(y_test, y_pred_rf))

RandomForest Classifier with ngrams
F1 Score: 0.4939674638381497
                         precision    recall  f1-score   support

           Conservative       0.73      0.97      0.83       731
                 Labour       0.81      0.54      0.65       402
       Liberal Democrat       0.00      0.00      0.00        50
Scottish National Party       0.85      0.35      0.49       112

               accuracy                           0.75      1295
              macro avg       0.60      0.47      0.49      1295
           weighted avg       0.73      0.75      0.71      1295



In [19]:
# SVM Classifier
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
print("SVM Classifier with ngrams")
print("F1 Score:", f1_score(y_test, y_pred_svm, average='macro'))
print(classification_report(y_test, y_pred_svm))

SVM Classifier with ngrams
F1 Score: 0.6049185227419375
                         precision    recall  f1-score   support

           Conservative       0.84      0.94      0.89       731
                 Labour       0.78      0.78      0.78       402
       Liberal Democrat       1.00      0.04      0.08        50
Scottish National Party       0.86      0.55      0.67       112

               accuracy                           0.82      1295
              macro avg       0.87      0.58      0.60      1295
           weighted avg       0.83      0.82      0.80      1295



In [20]:
# Custom tokenizer
stop_words = set(stopwords.words('english'))

def custom_tokenizer(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove punctuation and stopwords
    tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    # Perform additional custom processing if needed
    return tokens

In [21]:
# Use the custom tokenizer in TfidfVectorizer
vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=4000, ngram_range=(1, 3))
X = vectorizer.fit_transform(df['speech'])
y = df['party']



In [22]:
# Split the data into a train and test set using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=99)

In [23]:
# Train and evaluate classifiers again with the custom tokenizer
# RandomForest Classifier
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print("RandomForest Classifier with custom tokenizer")
print("F1 Score:", f1_score(y_test, y_pred_rf, average='macro'))
print(classification_report(y_test, y_pred_rf))

RandomForest Classifier with custom tokenizer
F1 Score: 0.4965019587413562
                         precision    recall  f1-score   support

           Conservative       0.73      0.97      0.83       731
                 Labour       0.81      0.55      0.66       402
       Liberal Democrat       0.50      0.02      0.04        50
Scottish National Party       0.83      0.31      0.45       112

               accuracy                           0.75      1295
              macro avg       0.72      0.47      0.50      1295
           weighted avg       0.75      0.75      0.72      1295



In [24]:
# SVM Classifier
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)
print("SVM Classifier with custom tokenizer")
print("F1 Score:", f1_score(y_test, y_pred_svm, average='macro'))
print(classification_report(y_test, y_pred_svm))

SVM Classifier with custom tokenizer
F1 Score: 0.6039486885350873
                         precision    recall  f1-score   support

           Conservative       0.84      0.95      0.89       731
                 Labour       0.79      0.77      0.78       402
       Liberal Democrat       1.00      0.04      0.08        50
Scottish National Party       0.85      0.55      0.67       112

               accuracy                           0.82      1295
              macro avg       0.87      0.58      0.60      1295
           weighted avg       0.83      0.82      0.80      1295



In [14]:
print("SVM Classifier with custom tokenizer")
print("F1 Score:", f1_score(y_test, y_pred_svm, average='macro'))
print(classification_report(y_test, y_pred_svm))

SVM Classifier with custom tokenizer
F1 Score: 0.6021347544436162
                         precision    recall  f1-score   support

           Conservative       0.82      0.93      0.88       731
                 Labour       0.77      0.76      0.76       402
       Liberal Democrat       0.80      0.08      0.15        50
Scottish National Party       0.84      0.50      0.63       112

               accuracy                           0.81      1295
              macro avg       0.81      0.57      0.60      1295
           weighted avg       0.81      0.81      0.79      1295

