In [40]:
import pandas as pd
import plotly.express as px
import numpy as np
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tobyb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Dataset Overview

In [31]:
#Output key overview statistics of the dataframes
parliament_reports_df = pd.read_csv('../data/combined_parliament_reports.csv')
print("Parliament Dataframe shape:")
print(parliament_reports_df.shape)
print("---------------------------------------------------")
print("Parliament Dataframe Column Types:")
print(parliament_reports_df.dtypes)
print("---------------------------------------------------")
print("Parliament Dataframe Describe Outputs:")
print(parliament_reports_df.describe())
print("---------------------------------------------------")
print("Parliament Dataframe Missing Values Count:")
print(parliament_reports_df.isnull().sum())

Parliament Dataframe shape:
(1091, 4)
---------------------------------------------------
Parliament Dataframe Column Types:
Name      object
Speech    object
Time      object
Date      object
dtype: object
---------------------------------------------------
Parliament Dataframe Describe Outputs:
                Name   Speech   Time      Date
count           1091     1091     27      1091
unique            59     1083     14         4
top     The Convener  Indeed.  11:45  07/01/25
freq             208        3      3       422
---------------------------------------------------
Parliament Dataframe Missing Values Count:
Name         0
Speech       0
Time      1064
Date         0
dtype: int64


In [9]:
# Barchat showing dates and amount of speeches within each day
fig = px.histogram(parliament_reports_df, x='Date', title='Date Distribution')
fig.update_layout(yaxis_title='Count of Speeches')
fig.show()

In [18]:
#Showing the top ten names of the people who have given speeches
top_names = parliament_reports_df['Name'].value_counts().reset_index().head(10)
top_names.columns = ['Name', 'count']

fig = px.bar(top_names, x='count', y='Name', title='Top Ten Names Distribution')
fig.show()

In [10]:
#Box diagram showing disbribution of potential outliers and more importantly a better shape idea of the data
fig = px.box(parliament_reports_df, x='Name', title='Age Distribution with Outliers')
fig.show()

In [23]:
#Z-score- measuring how far a data point is away from the mean
name_counts = parliament_reports_df['Name'].value_counts()
names_based_on_count = name_counts.index.tolist()
count = name_counts.values.tolist()
z_score = np.abs(stats.zscore(name_counts))

outliers = pd.DataFrame({
    'Name': names_based_on_count,
    'count': count,
    'z_score': z_score
})

outliers = outliers[outliers['z_score'] > 1]
outliers


Unnamed: 0,Name,count,z_score
0,The Convener,208,5.918969
1,Kate Forbes,104,2.670709
2,Stephen Boyle,77,1.827411


In [39]:
# Word frequency analysis
# Text cleaning and preprocessing
stop_words = set(stopwords.words('english'))
parliament_reports_df['Cleaned_Text'] = parliament_reports_df['Speech'].str.lower().str.replace(r'[^\w\s]', '').apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(parliament_reports_df['Cleaned_Text'])
word_counts_df = pd.DataFrame(word_counts.toarray(), columns=vectorizer.get_feature_names_out())

# Display the most common words
word_freq = word_counts_df.sum().sort_values(ascending=False)
word_freq.head(10)


government    448
would         301
scottish      270
scotland      256
that          247
public        220
one           179
budget        176
people        174
point         145
dtype: int64

In [43]:
# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(parliament_reports_df['Cleaned_Text']))
# Convert the word cloud to an image array
wordcloud_image = wordcloud.to_array()

# Create a Plotly figure
fig = px.imshow(wordcloud_image, title='Word Cloud')

# Update layout to remove axis labels and ticks
fig.update_layout(
    xaxis=dict(showticklabels=False),
    yaxis=dict(showticklabels=False)
)

# Show the plot
fig.show()