## <font color="blue">Interactivity</font>

<font color="blue">All cells are alterable but some cells that are easy to work with interactively are highlighted during the notebook. These are highlighted with the phrase **Alterable Cell** above them in blue and have instructions.</font>

<font color="blue">There is also a **DIY Section** towards the end of the Notebook that will allow you to modify parameters and use the code for your own purposes.</font>

<font color="blue">**Running Cells**</font>

- <font color="blue">To increase performance on the interactive graphics, click **Runtime** in the menu at the top of the Notebook, select **Change runtime type** and then click **T4 GPU**.</font>
- <font color="blue">To run all the cells, click **Runtime** in the menu at the top of the Notebook and select **Run All**.</font>
- <font color="blue">To run a single cell, click in the cell and press **Ctrl + Enter (PC)** or **Cmd + Enter (Mac)**.</font>
- <font color="blue">If the Notebook is not running correctly, try running all the cells again, as cells being run in the wrong order can sometimes cause issues.</font>

## Import Libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd /content/drive/MyDrive/parnell_files

In [None]:
from IPython.display import HTML, display

def set_css():
  """
  Displays an HTML <style> block that sets "white-space: pre-wrap" on <pre> elements.
  """
  wrap_style = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_style)
#register the function so it is called on the 'pre_run_cell' event
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
!pip install bertopic

In [None]:
!pip install gensim

In [None]:
!pip install hvplot

In [None]:
!pip install pandas

In [None]:
#disable unnecessary warnings
import warnings
warnings.filterwarnings('ignore')
#enable interactive visualisations
%matplotlib inline

In [None]:
#libraries for working with files
import os
import glob
import requests
from pathlib import Path
from natsort import natsorted, os_sorted

#libraries for data extraction and parsing
from bs4 import BeautifulSoup

#libraries for data analysis and manipulation
import re
import string
import numpy as np
import pandas as pd
import geopandas
import collections
import networkx as nx
from datetime import datetime
from collections import Counter

#NLP Libraries
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from bertopic import BERTopic

#libraries for visualization
import matplotlib.pyplot as plt
import holoviews as hv
import hvplot.networkx as hvnx
import plotly.express as px
from shapely.geometry import box

# Optimize notebook and Spacy settings
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set DPI for Matplotlib figures
plt.rcParams['figure.dpi'] = 75

In [None]:
def _render(self, **kw):
    """
    Rendering method for HoloViews objects in Jupyter.
    Loads the bokeh extension and returns the result of hv.Store.render
    for the object. Extra keyword arguments are accepted but not used.
    """
    hv.extension('bokeh')
    rendered = hv.Store.render(self)
    return rendered
#install the function above as the _repr_mimebundle_ of HoloViews Dimensioned objects
hv.core.Dimensioned._repr_mimebundle_ = _render

### Data Extraction Functions

Functions which perform the various aspects of getting from a list of file paths through to extracting specific parts of each file.

In [None]:
def soup_objects(file_paths):
    '''
    Parses XML files into beautiful soup objects using the "lxml-xml" parser.
    Takes either a collection (list or tuple) of file paths or a single
    file path. Paths may be pathlib.Path objects or plain strings.
    Returns a list of beautiful soup objects, in input order, for a
    collection, or a single beautiful soup object for a single path.
    '''
    def parse(path):
        #Path() leaves Path objects as they are and lets string paths work too
        with Path(path).open("r", encoding="utf-8") as xml:
            return BeautifulSoup(xml, "lxml-xml")

    #isinstance also accepts list subclasses and tuples, unlike an exact type comparison
    if isinstance(file_paths, (list, tuple)):
        return [parse(path) for path in file_paths]
    return parse(file_paths)

In [None]:
def tei_extractor(soup_obj, element, attributes=False):
    '''
    Searches one or more beautiful soup objects for an element by name.
    If a list of attribute names is given, each name is mapped to True
    and the mapping is passed to the search as an attribute filter.
    For a list of objects, returns a list with the result of find()
    for each object. For a single object, returns the result of find_all().
    '''
    #filter dictionary mapping every requested attribute name to True
    required_attrs = dict.fromkeys(attributes, True) if attributes else {}

    if type(soup_obj) is not list:
        return soup_obj.find_all(element, required_attrs)
    return [soup.find(element, required_attrs) for soup in soup_obj]

In [None]:
def tei_values(object_list, attribute=False):
    '''
    Pulls one value out of every beautiful soup element in a list.
    With an attribute name, returns the value stored under that attribute
    for each element. Without one, returns the text of each element.
    '''
    if not attribute:
        return [element.get_text() for element in object_list]
    return [element[attribute] for element in object_list]

### Data Cleaning and Dataframe Functions

Functions to perform text cleaning, remove stopwords, convert results into dataframe format and clean dataframe-format data.

In [None]:
def text_cleaning(text):
    '''
    Takes as input a string and tidies it up, keeping punctuation in place:
    non-breaking spaces, pipes and newlines become spaces, "&" becomes "and",
    curly apostrophes become straight ones, a possessive "'s" followed by a
    space is dropped, hyphens, dashes and underscores become spaces,
    standalone numbers are removed and runs of whitespace are collapsed.
    Returns the cleaned string without leading or trailing whitespace.
    '''
    #single characters replaced before the possessive "'s " is handled
    first_swaps = str.maketrans({"\xa0": " ", "&": "and", "|": " ", "\n": " ", "’": "'"})
    #hyphens, dashes and underscores are replaced only afterwards, so "'s-" is left alone
    second_swaps = str.maketrans({"-": " ", "–": " ", "_": " ", "—": " "})

    cleaned = text.translate(first_swaps)
    cleaned = cleaned.replace("'s ", ' ')
    cleaned = cleaned.translate(second_swaps)
    #remove numbers that stand alone as words, then collapse whitespace runs
    cleaned = re.sub(r"\b\d+\b", "", cleaned)
    cleaned = re.sub(r"\s\s+", " ", cleaned)
    return cleaned.strip()

In [None]:
def punct_removal(text):
    '''
    Takes as input a string and strips its punctuation, keeping
    apostrophes that sit between two word characters.
    Returns the string with whitespace collapsed to single spaces
    and no leading or trailing whitespace.
    '''
    #apostrophes without a word character on both sides become spaces
    no_loose_apostrophes = re.sub(r"(?<!\w)'|'(?!\w)", ' ', text)
    #everything that is not a word character, whitespace or apostrophe becomes a space
    no_punctuation = re.sub(r"[^\w\s\']", ' ', no_loose_apostrophes)
    collapsed = re.sub(r'\s+', ' ', no_punctuation)
    return collapsed.strip()

In [None]:
def remove_stopwords(text, stopwords):
    '''
    Takes as input a string and a collection of stopwords.
    Splits the string on whitespace and drops every word
    that is found in the stopwords collection.
    Returns the remaining words joined by single spaces.
    '''
    kept_words = filter(lambda word: word not in stopwords, text.split())
    return ' '.join(kept_words)

In [None]:
def create_dataframe(data, columns):
    '''
    Takes as input a list of lists, where each inner list holds
    the values of one column, and a list of column names.
    Returns a dataframe with one column per inner list.
    '''
    #each inner list starts out as a row, so transpose to turn the lists into columns
    frame = pd.DataFrame(data).T
    frame.columns = columns
    return frame

In [None]:
def dataframe_cleaning(dataframe, clean_column=None):
    '''
    Takes as input a dataframe of strings. Makes every string lowercase,
    standardises curly apostrophes to straight ones and strips leading and
    trailing spaces. Applies text_cleaning function to a column if it is
    identified by the clean_column parameter (before the final strip).
    Cells that are not strings (e.g. None/NaN) are left unchanged.
    Returns a new lowercase/cleaned dataframe; the input is not modified.
    '''
    def on_strings(func):
        #wrap func so that non-string cells pass through untouched instead of raising
        return lambda cell: func(cell) if isinstance(cell, str) else cell

    def apply_cells(frame, func):
        #DataFrame.applymap is deprecated in favour of DataFrame.map in newer pandas,
        #so use map when this pandas version has it and fall back to applymap otherwise
        if hasattr(pd.DataFrame, "map"):
            return frame.map(func)
        return frame.applymap(func)

    lower_dataframe = apply_cells(dataframe, on_strings(lambda cell: cell.lower().replace("’", "'")))
    if clean_column:
        lower_dataframe[clean_column] = lower_dataframe[clean_column].apply(on_strings(text_cleaning))
    #strip last, so the order of operations matches lower -> apostrophes -> text_cleaning -> strip
    return apply_cells(lower_dataframe, on_strings(str.strip))

In [None]:
def dataframe_sentence_tokenize(dataframe, column, tokenizer):
    """
    Takes as input a dataframe containing a column of strings, name of a column
    to tokenize and a sentence tokenizer (an object with a tokenize method).
    Applies tokenizer to the column, creating a list of sentences in each row,
    uses explode to expand sentences, so each one has its own row.
    Changes name of column to 'sentence' and applies punctuation removal function
    now that column has been divided into sentences.
    The input dataframe is not modified; the work is done on a copy.
    Returns dataframe where each row contains a single sentence with punctuation removed.
    """
    #work on a copy so the caller's column is not overwritten with lists of sentences
    tokenized = dataframe.copy()
    tokenized[column] = tokenized[column].apply(tokenizer.tokenize)
    sents_df = tokenized.explode(column).rename(columns={column: 'sentence'})
    sents_df['sentence'] = sents_df['sentence'].apply(punct_removal)
    return sents_df

### Data Filtering Functions

Functions to filter dataframe using different parameters.

In [None]:
def dataframe_cooccurrance_count(dataframe, column, word):
    """
    Takes as input a dataframe containing a column of strings, name of column,
    and a word to exclude from the counts.
    Joins the column into one string, splits it on whitespace and counts
    how often each word occurs, leaving out the excluded word.
    Returns a list of (word, count) tuples sorted from most to least frequent,
    or an empty list if the input dataframe is empty.
    """
    if dataframe.empty:
        return []
    tally = Counter(' '.join(dataframe[column]).split())
    #drop the excluded word if it was counted at all
    tally.pop(word, None)
    return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
def count_keywords_by_year(dataframe, column, keywords, word_boundary):
    """
    Takes as input a dataframe with 'date' and text columns, the name of the
    text column, a list of keywords or phrases and a word boundary setting.
    Raises ValueError if the column is not in the dataframe. Prints a message
    (and carries on) if keywords is not a list or is empty.
    Note that the input dataframe is changed in place: its 'date' column is
    converted to datetime and a 'year' column is added.
    Groups the rows by year and counts each keyword within every group using
    dataframe_count_strings, passing word_boundary through to it.
    Returns a dataframe with a 'year' column and one column per keyword.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    if not (isinstance(keywords, list) and keywords):
        print("Keywords list is empty")

    dataframe['date'] = pd.to_datetime(dataframe['date'])
    dataframe['year'] = dataframe['date'].dt.year

    def count_for_year(year_rows):
        #dictionary of keyword counts for the rows of a single year
        return dataframe_count_strings(year_rows, column, keywords, word_boundary)

    yearly = dataframe.groupby('year').apply(count_for_year)
    yearly = yearly.reset_index(name='keyword_counts')
    #expand the count dictionaries so that each keyword becomes its own column
    expanded = pd.json_normalize(yearly["keyword_counts"])
    return yearly.join(expanded).drop(columns=["keyword_counts"])

In [None]:
def dataframe_count_strings(dataframe, column, keywords, word_boundary):
    """
    Takes as inputs dataframe, column name, keywords list and word boundary parameter.
    Raises ValueError if the column is not in the dataframe. Prints a message
    (and carries on) if keywords is not a list or is empty.
    Counts case-insensitive occurrences of each keyword across the column.
    If word_boundary is True only whole word matches are counted,
    if it is False substring matches are counted as well.
    Returns a dictionary where the keys are the keywords and values their counts for the column.
    """
    if column not in dataframe.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame.")
    if not (isinstance(keywords, list) and keywords):
        print("Keywords list is empty")

    def as_pattern(word):
        #escape the keyword so it is matched literally, optionally as a whole word
        escaped = re.escape(word)
        return fr"\b{escaped}\b" if word_boundary else escaped

    patterns = {word: as_pattern(word) for word in keywords}
    return {
        word: dataframe[column].str.findall(pattern, flags=re.IGNORECASE).str.len().sum()
        for word, pattern in patterns.items()
    }

In [None]:
def filter_dataframe_by_keywords(dataframe, column, keywords, word_boundary=True, match_all=True):
    """
    Filters a DataFrame based on keywords list, with options for matching whole words or substrings,
    and for requiring all or any keywords to appear in each row.
    Takes dataframe and column name, searches column case-insensitively depending on parameters below.
    - word_boundary: if True, only matches whole words. If False, matches substrings.
    - match_all: if True, only rows where all keywords are present are returned.
    If False, rows with any keyword are returned.
    Rows whose text is missing (NaN/None) never match, in either mode.
    Returns filtered DataFrame, or the input dataframe itself if keywords is empty.
    """
    if not keywords:
        return dataframe
    processed_keywords = [fr"\b{re.escape(word)}\b" if word_boundary else re.escape(word) for word in keywords]
    #one combined pattern keeps rows containing at least one keyword;
    #na=False makes missing text count as "no match" instead of breaking the boolean mask
    any_pattern = "|".join(processed_keywords)
    filtered_df = dataframe[dataframe[column].str.contains(any_pattern, case=False, regex=True, na=False)]
    if match_all:
        #the rows left all contain text, so each keyword can be searched for individually
        compiled = [re.compile(keyword, flags=re.IGNORECASE) for keyword in processed_keywords]
        has_all = filtered_df[column].apply(lambda text: all(rx.search(text) for rx in compiled))
        #astype(bool) keeps the mask boolean even when no rows are left
        filtered_df = filtered_df[has_all.astype(bool)]
    return filtered_df

In [None]:
def filter_dataframe_by_keywords_exclude(dataframe, column, keywords, word_boundary=True, match_all=False):
    """
    Filters a DataFrame to exclude rows based on keywords list,
    with options for matching whole words or substrings,
    and for requiring all or any keywords to appear in each row match.
    Takes dataframe and column name, searches column case-insensitively depending on parameters below.
    - word_boundary: if True, only matches whole words. If False, matches substrings.
    - match_all: if True, only rows where all keywords are present are removed.
    If False, rows with any keyword are removed.
    Rows whose text is missing (NaN/None) never match, so they are kept.
    Returns filtered DataFrame, or the input dataframe itself if keywords is empty.
    """
    if not keywords:
        return dataframe
    processed_keywords = [fr"\b{re.escape(word)}\b" if word_boundary else re.escape(word) for word in keywords]
    if match_all:
        compiled = [re.compile(keyword, flags=re.IGNORECASE) for keyword in processed_keywords]

        def has_all(text):
            #non-string cells cannot contain any keyword
            return isinstance(text, str) and all(rx.search(text) for rx in compiled)

        #built from the input dataframe; astype(bool) keeps the mask boolean for empty input
        drop_mask = dataframe[column].apply(has_all).astype(bool)
    else:
        pattern = "|".join(processed_keywords)
        drop_mask = dataframe[column].str.contains(pattern, case=False, regex=True, na=False)
    return dataframe[~drop_mask]

In [None]:
def dataframe_date_window(dataframe, column, include_range=None, exclude_range=None):
    """
    Takes as input a dataframe with a datetime column, the column name,
    a date range to keep and a date range to leave out.
    Each range is a pair of start and end dates, both ends inclusive,
    e.g. ['1880-01-01', '1885-01-01'].
    Rows are kept if they fall inside include_range (when given)
    and outside exclude_range (when given).
    Returns filtered dataframe.
    """
    dates = dataframe[column]
    keep = pd.Series(True, index=dataframe.index)
    if include_range:
        window_start, window_end = include_range[0], include_range[1]
        keep = keep & dates.between(window_start, window_end)
    if exclude_range:
        gap_start, gap_end = exclude_range[0], exclude_range[1]
        keep = keep & ~dates.between(gap_start, gap_end)
    return dataframe.loc[keep]

In [None]:
def dataframe_column_filter(dataframe, column, include_values=None, exclude_values=None):
    """
    Takes as input a dataframe with a string column, column name,
    a list of values to include during filtering process,
    and a list of values to exclude during filtering process.
    Creates masks for include_values and exclude_values if applicable,
    and applies them as dataframe filters.
    Filters dataframe based on case-insensitive substring matching in the
    given column. Values are matched literally, so characters such as
    "(", ")" or "." in a value have no special meaning.
    Rows whose value is missing (NaN/None) never match.
    Returns filtered dataframe.
    """
    def literal_pattern(values):
        #escape each value so it is matched as plain text, then allow any of them
        return '|'.join(re.escape(value) for value in values)

    mask = pd.Series(True, index=dataframe.index)
    if include_values:
        mask &= dataframe[column].str.contains(literal_pattern(include_values), case=False, na=False)
    if exclude_values:
        mask &= ~dataframe[column].str.contains(literal_pattern(exclude_values), case=False, na=False)
    return dataframe.loc[mask]

In [None]:
def dataframe_apply_filters(dataframe, text_column, filters_dictionary, word_boundary, match_all,
                            exclude_word_boundary, exclude_match_all, text_filtering=True):
    """
    Apply multiple filters to a dataframe, taking as input a text column name,
    filters dictionary, and text filtering parameters.
    Recognised filters dictionary keys: "keywords", "not_keywords", "places",
    "not_places", "publications", "not_publications", "include_range" and
    "exclude_range". A key that is missing is treated like an empty value.
    If a key has a non-empty value, the dataframe is filtered with the
    filtering function for the relevant category. Keyword filters are
    skipped when text_filtering is False.
    Filtering is both for the inclusion and exclusion of rows.
    Return filtered dataframe (a copy, the input is not modified).
    """
    def chosen(key):
        #.get returns None for absent keys, so callers need not supply every key
        return filters_dictionary.get(key)

    filtered_df = dataframe.copy()
    if chosen("keywords") and text_filtering:
        filtered_df = filter_dataframe_by_keywords(dataframe=filtered_df, column=text_column,
                                                   keywords=chosen("keywords"),
                                                   word_boundary=word_boundary, match_all=match_all)

    if chosen("not_keywords") and text_filtering:
        filtered_df = filter_dataframe_by_keywords_exclude(dataframe=filtered_df, column=text_column,
                                                           keywords=chosen("not_keywords"),
                                                           word_boundary=exclude_word_boundary,
                                                           match_all=exclude_match_all)

    if chosen("places") or chosen("not_places"):
        filtered_df = dataframe_column_filter(dataframe=filtered_df, column="place",
                                              include_values=chosen("places"),
                                              exclude_values=chosen("not_places"))

    if chosen("publications") or chosen("not_publications"):
        filtered_df = dataframe_column_filter(dataframe=filtered_df, column="publication",
                                              include_values=chosen("publications"),
                                              exclude_values=chosen("not_publications"))

    if chosen("include_range") or chosen("exclude_range"):
        filtered_df = dataframe_date_window(dataframe=filtered_df, column="date",
                                            include_range=chosen("include_range"),
                                            exclude_range=chosen("exclude_range"))

    return filtered_df

In [None]:
def year_counts(dataframe):
    """
    Takes as input a dataframe with a datetime 'date' column.
    Adds a 'year' column to the dataframe (in place) holding the year of
    each date, then counts how many rows fall in each year.
    Returns a list of (year, row count) tuples.
    """
    dataframe["year"] = dataframe["date"].dt.year
    rows_per_year = dataframe.groupby(["year"]).size()
    return [(year, total) for year, total in rows_per_year.items()]

### Visualisation Functions

Functions to produce visualisations.

In [None]:
def display_active_filters(filters_dict, **kwargs):
    """
    Generates a summary of active filters and parameters for user output.
    filters_dict: Dictionary of filtering criteria (e.g., places, publications);
    entries with empty values are left out of the summary.
    kwargs: Additional keyword arguments for filtering parameters (e.g., booleans);
    every parameter is listed, with falsey values shown as None.
    Returns a formatted string listing the active filters and parameters,
    one per line.
    """
    def label(key):
        #"not_places" -> "Not Places"
        return key.replace('_', ' ').title()

    def show(value):
        #a plain string is shown whole rather than joined character by character
        if isinstance(value, str):
            return value
        try:
            return ', '.join(map(str, value))
        except TypeError:
            #not iterable (e.g. a number), so show it as it is
            return str(value)

    #remove empty lists
    active_filters = {key: value for key, value in filters_dict.items() if value}
    #if parameter has falsey value, like empty list, string etc, make None
    active_parameters = {key: (value if value else None) for key, value in kwargs.items()}

    parts = ["\nActive Filters & Parameters:\n"]
    if active_filters:
        #heading ends with a newline so the first filter starts on its own line
        parts.append("\nFilters Applied:\n")
        parts.extend(f"  - {label(key)}: {show(value)}\n" for key, value in active_filters.items())
    else:
        parts.append("\nNo active filters applied.\n")
    if active_parameters:
        parts.append("\nParameters:\n")
        parts.extend(f"  - {label(key)}: {value}\n" for key, value in active_parameters.items())
    return "".join(parts)

In [None]:
def bertopic_topics(dataframe, topic_model):
    """
    Takes as input a dataframe with a 'sentence' column and a Bertopic topic model.
    Fits the list of sentences to the topic model with fit_transform and
    groups the sentences by the topic assigned to each of them.
    Returns a tuple of (topics, topic/sentence dictionary, topic model).
    If anything raises an exception along the way (for example because there
    are too few topics), an error message is printed and None is returned instead.
    """
    sentences = dataframe["sentence"].to_list()
    try:
        #second value returned by fit_transform is not needed here
        topics, _ = topic_model.fit_transform(sentences)
        sentences_by_topic = {label: [] for label in set(topics)}
        for label, sentence in zip(topics, sentences):
            sentences_by_topic[label].append(sentence)
        return (topics, sentences_by_topic, topic_model)
    except Exception:
        print("Error in topic model, could be too few topics to process. Try again with different filters and/or parameters.")
        return None

In [None]:
def bertopic_time(dataframe, topic_model):
    """
    Takes as input a dataframe with 'sentence' and 'date' columns
    and a Bertopic topic model.
    Passes the sentences and dates to the model's topics_over_time method
    and turns the result into a figure with visualize_topics_over_time.
    Returns the figure, with its y axis titled "Count".
    """
    sentences = dataframe["sentence"].to_list()
    dates = dataframe["date"].to_list()
    #parameters can be adjusted for visualisation
    over_time = topic_model.topics_over_time(docs=sentences, timestamps=dates, nr_bins=30)
    figure = topic_model.visualize_topics_over_time(
        over_time,
        top_n_topics=15,
        height=500,
        width=1000,
    )
    figure.update_layout(yaxis_title="Count")
    return figure

In [None]:
def frequency_visualisation(count_object, title):
    """
    Takes as input a dictionary of items and their frequency scores,
    or a list of (item, frequency) pairs which is converted into a dictionary.
    Shows a Matplotlib bar chart with one bar per item, the item names as
    rotated x axis labels and the title string as the chart title.
    Returns a message string if the count object is empty, otherwise nothing.
    """
    if not count_object:
        return "Count object is empty, no visualisation returned."
    counts = dict(count_object) if isinstance(count_object, list) else count_object
    labels = tuple(counts)
    heights = tuple(counts.values())
    bar_positions = np.arange(len(labels))
    fig, ax = plt.subplots(figsize=(7, 5))
    fig.tight_layout(pad=2.0)
    ax.bar(bar_positions, heights)
    ax.set_xticks(bar_positions)
    #labels are turned on their side so that long names do not overlap
    ax.set_xticklabels(labels, rotation=90)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    plt.show()

In [None]:
def network_graph(edges, edge_weights=False, node_size_mult=15):
    """
    Takes a list of lists of publication/location edges and returns a network graph.
    Node size grows with the number of times the node appears in the edges,
    multiplied by node_size_mult. Nodes whose name contains a comma (occurring
    in locations) are coloured red, all other nodes blue.
    If edge_weights is True, edge thickness is half the number of times the
    edge appears; otherwise every edge has a thickness of 0.5.
    NOTE(review): edges are counted as ordered pairs, so (a, b) and (b, a) are
    counted separately and, the graph being undirected, the count added last
    is the one kept - confirm edges are always given in the same order.
    """
    #initialise network graph object
    G = nx.Graph()
    #count how often each node appears across all edges
    appearances = collections.Counter(node for edge in edges for node in edge)
    #add the nodes in order of first appearance, sizes and colours follow the same order
    G.add_nodes_from(appearances)
    size_list = [count * node_size_mult for count in appearances.values()]
    col_map = ['red' if ',' in node else 'blue' for node in appearances]
    if edge_weights:
        #count identical edges and store the count as a 'weight' attribute
        pair_counts = collections.Counter(tuple(edge) for edge in edges)
        weighted_edges = [(pair[0], pair[1], {'weight': count}) for pair, count in pair_counts.items()]
        G.add_edges_from(weighted_edges)
        #read the weights back from the graph and halve them to get edge widths
        weights = nx.get_edge_attributes(G, 'weight')
        edge_widths = [weights.get((u, v), 1) / 2 for u, v in G.edges]
    else:
        #no weighting, so add the edges as they are and use one width for all
        G.add_edges_from(edges)
        edge_widths = 0.5
    #establish graph style and node positions
    pos = nx.spring_layout(G, k=0.1, iterations=20)
    return hvnx.draw(
        G,
        pos,
        edge_cmap='viridis',
        node_size=size_list,
        node_color=col_map,
        edge_width=edge_widths,
        width=1000,
        height=750,
    )

## Extracting Data from Files

We begin by extracting the data we need from the speech report files and the speech register file.

In [None]:
#directory holding the speech report files
dir_path = Path('sources/')
#get filepaths to all the files in the directory whose name ends in ".xml" (upper or lower case)
xml_files = (file for file in dir_path.iterdir() if file.is_file() and file.name.lower().endswith('.xml'))
#sort xml file paths numerically using os_sorted from the natsort library
xml_files = os_sorted(xml_files)

#get path for speech register file
speech_file = Path('speeches/parnell_speeches.xml')

In [None]:
#basename returns filename removing directory path.
#split on "." and keep the first part, which removes the ".xml" extension
#(and anything else after the first "."), so that we can use it later as identifier
filenames = []
for path in xml_files:
    filename = os.path.basename(path)
    filename = filename.split(".")[0]
    filenames.append(filename)

In [None]:
#extract speech reports and speech register as beautiful soup objects using function
#a list of paths gives a list of objects, the single register path gives a single object
report_objects = soup_objects(xml_files)
speech_object = soup_objects(speech_file)

In [None]:
#speech register beautiful soup elements extracted as lists using function
#speech id, speech place (elements with a 'key' attribute) and date (elements with a 'when' attribute)
speech_id_objs = tei_extractor(speech_object, element='speech_id')
place_objs = tei_extractor(speech_object, element='place', attributes=['key'])
date_objs = tei_extractor(speech_object, element='date', attributes=['when'])

#speech reports beautiful soup elements extracted as lists using function, one element per report
#speech id term, publication title, text body
speech_rep_objs = tei_extractor(report_objects, element='term', attributes=['key'])
publication_objs = tei_extractor(report_objects, element='title', attributes=['key', 'level'])
text_objs = tei_extractor(report_objects, element='body')

In [None]:
#use function to extract values from tei elements extracted above
#speech ids and places are taken from element text, dates from the 'when' attribute
speech_ids = tei_values(speech_id_objs)
speech_places = tei_values(place_objs)
#if speech place returns blank convert to "unknown"
speech_places = ["unknown" if item == "" else item for item in speech_places]
speech_dates = tei_values(date_objs, attribute='when')

#report speech ids are taken from the 'key' attribute, publications and texts from element text
speech_rep_ids = tei_values(speech_rep_objs, attribute='key')
publications = tei_values(publication_objs)
#if publication returns blank convert to "unknown"
publications = ["unknown" if item == "" else item for item in publications]
texts = tei_values(text_objs)

#for speech ids, if more than one id present, shown by inclusion of comma, convert into a list
#some reports refer to more than one speech, so we need to capture them all
#ids without a comma are left as plain strings
speech_rep_ids = [item.split(',') if ',' in item else item for item in speech_rep_ids]

## Data Preparation

The next stage is to convert the speech data into a dataframe format where we can easily manipulate it and get different subsets prior to analysis/visualisation.

### Dataframes

Below we create different types of dataframe that will be used for different forms of analysis.

### Main Dataframe

Merges together speech report and speech register dataframes. The same report text will sometimes be repeated. This is the result of giving each speech id its own line in cases where a report covers more than one speech. This is necessary for forms of analysis where the speech id is important, but not for text-based analysis. Therefore, we create other dataframes below for text-based analysis.

In [None]:
#prepare data for speech report dataframe, make list of lists of data and list of column names
speech_rep_data = [filenames, speech_rep_ids, publications, texts]
speech_rep_columns = ['filename', 'speech_id', 'publication', 'text']
#use function to turn the above lists into dataframe
speech_rep_df = create_dataframe(speech_rep_data, speech_rep_columns)
#if there is more than one speech id for a report, the report data will appear as a row for each id
speech_rep_df = speech_rep_df.explode('speech_id')
#use function to lowercase and standardise the dataframe, applying text cleaning to the 'text' column
speech_rep_df = dataframe_cleaning(speech_rep_df, clean_column='text')
#show the dataframe as the cell output
speech_rep_df

In [None]:
#prepare data for speech register dataframe, make list of lists of data and list of column names
speech_data = [speech_ids, speech_places, speech_dates]
speech_columns = ['speech_id', 'place', 'date']
#use function to turn the above lists into dataframe
speech_df = create_dataframe(speech_data, speech_columns)
#use function to lowercase and standardise the dataframe (no column is given for text cleaning)
speech_df = dataframe_cleaning(speech_df)
#make speech id index so we can use it when we merge dataframes below
speech_df.set_index('speech_id', inplace=True)
#convert date column to datetime format, enables us to manipulate dataframe using dates
#dates are expected in year-month-day form, e.g. 1880-01-31
speech_df['date'] = pd.to_datetime(speech_df['date'], format='%Y-%m-%d')
#drop rows with missing values
speech_df = speech_df.dropna(axis=0)
#show the dataframe as the cell output
speech_df

In [None]:
#join the speech register and speech report dataframes, matching the 'speech_id'
#column of the reports against the speech_id index of the register
df_all = speech_rep_df.merge(speech_df, left_on='speech_id', right_index=True)
#show the dataframe as the cell output
df_all

### Deduplicated Dataframe

This dataframe removes duplicate rows for the same report. These are created in the main dataframe because sometimes a report will cover more than one speech, making it need more than one row. Removing rows means not all speeches are covered, but report text is not duplicated. This is needed for text analysis.

In [None]:
#create deduplicated dataframe for text-based analysis
#df_all will have multiple rows for same report if it covers multiple speeches
#as each repeated report row will also have the accompanying text/metadata
#keep only the first row for each index label
#.copy() makes df_dedup an independent dataframe rather than a slice of df_all,
#so that columns can safely be assigned to it later
df_dedup = df_all.loc[~df_all.index.duplicated(keep='first')].copy()
df_dedup

### Sentence Dataframe

The main dataframe divided so that each sentence has a row with accompanying metadata. Again, the same report text will sometimes be repeated. Therefore, we create a deduplicated sentence dataframe below for text-based analysis.

In [None]:
#parameters for the nltk sentence tokenizer
punkt_param = PunktParameters()
#abbreviations (listed without their trailing full stop) that the tokenizer should
#not read as sentence-enders, e.g. "hon." and "mr." frequently have full stops in the reports
#the original list repeated 'j.r'; a set holds it once either way
punkt_param.abbrev_types = {
    'hon', 'mr', 'rev', 'dr', 'm.p', 'c.s', 'c.v', 'c.e', 't.l', 'j.r', 'j.j', 'a.j',
    'r.b', 'j.g', 'j.l', 'j.f', 'n.b', 'p.j', 'c.j', 't.d', 'r', 'p.p', 'l.p', 'c.c', 'wm',
    'capt', 'messrs', 'patk', '1d', '2d', '3d', '4d', '5d', '6d', '7d', '8d', '9d', '10d', '11d',
    '1/2d', '3/4d', 'prof', 'per cent', 'adm', '2s', '1,400,000/', '400,000/',
}

#sentence detector used to divide report text into sentences
sentence_tokenizer = PunktSentenceTokenizer(punkt_param)

In [None]:
#tokenize a copy of the main dataframe (df_all itself is left untouched) so that
#each row holds one speech report sentence with its accompanying metadata
speech_sents_df = dataframe_sentence_tokenize(
    dataframe=df_all.copy(),
    column="text",
    tokenizer=sentence_tokenizer,
)
speech_sents_df

### Deduplicated Sentence Dataframe

This dataframe removes duplicate sentence rows for the same report. Removing rows means not all speeches are covered, but report text is not duplicated. This is needed for text analysis.

In [None]:
#same sentence split as above, but starting from a copy of the deduplicated
#dataframe so that report text is not repeated
speech_sents_df_dedup = dataframe_sentence_tokenize(
    dataframe=df_dedup.copy(),
    column="text",
    tokenizer=sentence_tokenizer,
)
speech_sents_df_dedup

In [None]:
#the sentence dataframes have been built, so punctuation can now also be
#removed from the text of the main (not sentence-divided) dataframe
df_all['text'] = df_all['text'].apply(punct_removal)
df_all

In [None]:
#the sentence dataframes have been built, so punctuation can now also be
#removed from the text of the deduplicated (not sentence-divided) dataframe
df_dedup['text'] = df_dedup['text'].apply(punct_removal)
df_dedup

## <font color="blue">DIY Section</font>

<font color="blue">This section is for setting your own parameters for analysis across all of the methodologies we have covered together. Change the parameters in the cells below as per the instructions to get different results.</font>

### <font color="blue">Alterable Cells</font>

### <font color="blue">Global Parameters</font>

<font color="blue">Changing the contents of the cells below will set global parameters for Frequency, Network, and Topic Modelling sections. The frequency section also has its own alterable parameters, so please be aware that they will be affected by the global parameters.</font>

#### <font color="blue">Run Code from this Cell</font>

#### <font color="blue">True/False Parameters</font>

<ul style="color: blue;">
<li>word_boundary - whether or not to match substrings in text-based analysis, such as "labo" for "labour", "labor".</li>
<li>match_all - whether to match all keywords in text-based analysis or match any from list.</li>
<li>exclude_word_boundary - when excluding items in a text-based analysis, whether or not to match substrings.</li>
<li>exclude_match_all - when excluding items in a text-based analysis, whether to match all keywords or any from list.</li>
<li>stopwords_removal - whether to remove stopwords during some forms of text-based analysis.</li>
</ul>

In [None]:
#True/False switches passed to the filtering and frequency cells below
#(see the list above for what each one controls)
#substring vs whole-word matching for keyword searches
word_boundary = True
#match all keywords (True) or any keyword from the list (False)
match_all = True
#same two switches, applied when excluding rows by keyword
exclude_word_boundary = True
exclude_match_all = False
#whether stopwords are removed in the co-frequency and topic modelling cells
stopwords_removal = True

In [None]:
#stopwords are common words we can omit from our corpus if they are not useful for analysis
#start from nltk's English list, add our own corpus-specific words, and lowercase everything
stopwords_ls = [
    word.lower()
    for word in stopwords.words('english') + [
        "every", "would", "cheers", "hisses", "applause", "could", "upon", "may", "go",
        "said", "say", "know", "far", "come", "put", "us",
    ]
]
#NOTE(review): the set is bound to "stopword_ls" (no second "s"), so "stopwords_ls" stays a list;
#kept unchanged in case other cells rely on either name - confirm which was intended
stopword_ls = set(stopwords_ls)

#### <font color="blue">Global Filters</font>

<font color="blue">These are filters that will be applied across all of the types of analysis below. Entering items into any of the fields in the global_filters section applies to the input dataset by default.</font>

<ul style="color: blue;">
<li>keywords - text-based analysis, filter to get rows with keywords e.g. "keywords": ["tenant", "farmer"]</li>
<li>not_keywords - text-based analysis, filter to remove rows with keywords e.g. "not_keywords": ["tenant", "farmer"]</li>
<li>places - filter by whether substring in list matches location in row e.g. "places": ["dublin", "london"] would match rows where any of those items are contained in the "place" column.</li>
<li>not_places - filter by whether substring in list matches location in row e.g. "not_places": ["dublin", "london"], but remove those rows instead.</li>
<li>publications - filter by whether substring in list matches publication in row e.g. "publications": ["freeman", "times"] would match rows where any of those items are contained in the "publication" column.</li>
<li>not_publications - filter by whether substring in list matches publication in row e.g. "not_publications": ["freeman", "times"], but remove those rows instead.</li>
<li>include_range - filter by whether speech date falls into a range of dates e.g. "include_range": ["1880-01-01", "1885-12-31"]</li>
<li>exclude_range - filter by whether speech date falls into a range of dates e.g. "exclude_range": ["1884-01-01", "1884-12-31"], but remove those rows instead.</li>
</ul>

In [None]:
#user-editable filters; every value is a list of strings (an empty list means no filter)
global_filters = {
    "keywords": [],
    "not_keywords": [],
    "places": [],
    "not_places": [],
    "publications": [],
    "not_publications": [],
    "include_range": [],
    "exclude_range": []
}
#lowercase every entry so user input is in the same case as the dataframes
global_filters = {
    filter_name: [entry.lower() for entry in entries]
    for filter_name, entries in global_filters.items()
}

### <font color="blue">Frequency Parameters</font>

<font color="blue">The cells below contain alterable parameters just for the frequency section, but bear in mind that they will be affected by non-text-based parameters or filters from the Global section above. For instance, if there is a global filter setting an inclusive date range of 1880 to 1885, the frequency parameters and section will only be applied to that date range.</font>

<font color="blue">Amendable list of words to count for word frequency across dataset e.g. count_keywords = ["parnell", "reform"]</font>

In [None]:
#words whose occurrences are counted in the frequency section
count_keywords = ["parnell", "reform"]

<font color="blue">Word to analyse further for the frequency of other words that appear in the same sentence as it. Also used for visualising the count of a specific keyword by year.</font>

<font color="blue">Must be contained in the count_keywords list above, e.g. context_visual_keyword = "parnell".</font>

In [None]:
#single keyword used for the per-year plot and the co-frequency plots
context_visual_keyword = "parnell"

<font color="blue">Words that will be used to extract the sentences in which they are all contained, can be used to see where the words appear together in their original sentence context e.g. context_keywords = ["parnell", "question"]</font>

In [None]:
#a sentence/report must contain all of these words to be printed in the context sections
context_keywords = ["parnell", "question"]

### <font color="blue">Running the Section</font>

<font color="blue">To run this DIY section code **click in the cell with "keywords"** in it, then click **Runtime** in the menu at the top of the Jupyter Notebook, then click **Run cell and below**.</font>

<font color="blue">To see your results, scroll down and view the sections below.</font>

### <font color="blue">Currently Active Filters</font>

<font color="blue">This section shows the currently active parameters for the frequency section.</font>

In [None]:
#build a summary of the global filters plus every parameter set above, then print it
active_filters_parameters = display_active_filters(
    global_filters,
    #global True/False parameters
    word_boundary=word_boundary,
    match_all=match_all,
    exclude_word_boundary=exclude_word_boundary,
    exclude_match_all=exclude_match_all,
    stopwords_removal=stopwords_removal,
    #frequency section parameters
    count_keywords=count_keywords,
    context_visual_keyword=context_visual_keyword,
    context_keywords=context_keywords,
)
print(active_filters_parameters)

### Apply Filters to Dataframes

In [None]:
#apply the global filters to the main dataframe (df_all, one row per speech/report pair)
#these previously filtered df_dedup, which made them duplicates of the df_dedup_* results
#built in the next cell and dropped the repeated-report rows that df_all exists to keep
#first pass: text_filtering left at its default
df_all_text_filt = dataframe_apply_filters(dataframe=df_all, text_column="text", filters_dictionary=global_filters,
                               word_boundary=word_boundary, match_all=match_all,
                               exclude_word_boundary=exclude_word_boundary, exclude_match_all=exclude_match_all)
#second pass: same filters with text_filtering switched off
df_all_non_text_filt = dataframe_apply_filters(dataframe=df_all, text_column="text", filters_dictionary=global_filters,
                               word_boundary=word_boundary, match_all=match_all,
                               exclude_word_boundary=exclude_word_boundary, exclude_match_all=exclude_match_all,
                               text_filtering=False)

In [None]:
#arguments shared by both filter passes over the deduplicated dataframe
_dedup_filter_args = dict(
    dataframe=df_dedup,
    text_column="text",
    filters_dictionary=global_filters,
    word_boundary=word_boundary,
    match_all=match_all,
    exclude_word_boundary=exclude_word_boundary,
    exclude_match_all=exclude_match_all,
)
#first pass: text_filtering left at its default
df_dedup_text_filt = dataframe_apply_filters(**_dedup_filter_args)
#second pass: same filters with text_filtering switched off
df_dedup_non_text_filt = dataframe_apply_filters(**_dedup_filter_args, text_filtering=False)

In [None]:
#arguments shared by both filter passes over the sentence dataframe
_sents_filter_args = dict(
    dataframe=speech_sents_df,
    text_column="sentence",
    filters_dictionary=global_filters,
    word_boundary=word_boundary,
    match_all=match_all,
    exclude_word_boundary=exclude_word_boundary,
    exclude_match_all=exclude_match_all,
)
#first pass: text_filtering left at its default
sents_df_text_filt = dataframe_apply_filters(**_sents_filter_args)
#second pass: same filters with text_filtering switched off
sents_df_non_text = dataframe_apply_filters(**_sents_filter_args, text_filtering=False)

In [None]:
#arguments shared by both filter passes over the deduplicated sentence dataframe
_sents_dedup_filter_args = dict(
    dataframe=speech_sents_df_dedup,
    text_column="sentence",
    filters_dictionary=global_filters,
    word_boundary=word_boundary,
    match_all=match_all,
    exclude_word_boundary=exclude_word_boundary,
    exclude_match_all=exclude_match_all,
)
#first pass: text_filtering left at its default
sents_df_dedup_text_filt = dataframe_apply_filters(**_sents_dedup_filter_args)
#second pass: same filters with text_filtering switched off
sents_df_dedup_non_text = dataframe_apply_filters(**_sents_dedup_filter_args, text_filtering=False)

## Frequency

In [None]:
#lowercase the user-supplied frequency parameters
#single keyword first, then the two keyword lists
context_visual_keyword = context_visual_keyword.lower()
count_keywords = [keyword.lower() for keyword in count_keywords]
context_keywords = [keyword.lower() for keyword in context_keywords]

### Speech, Report and Sentence Counts

#### Not affected by global filters

In [None]:
#count the rows of the speech register with year_counts and plot the result
speech_year_counts = year_counts(speech_df)
frequency_visualisation(count_object=speech_year_counts, title="Number of Speeches per Year")

In [None]:
#same count over the deduplicated dataframe, so each report is counted once
report_year_counts = year_counts(df_dedup)
frequency_visualisation(count_object=report_year_counts, title="Number of Speech Reports per Year")

In [None]:
#same count over the deduplicated sentence dataframe (one row per sentence)
sentence_year_counts = year_counts(speech_sents_df_dedup)
frequency_visualisation(count_object=sentence_year_counts, title="Number of Sentences per Year")

### Keyword Counts

#### Not affected by global text filters, but affected by others

In [None]:
#count the count_keywords in the report text, using the deduplicated dataframe
#that was filtered with text_filtering=False
df_count = dataframe_count_strings(dataframe=df_dedup_non_text_filt, column="text", 
                                   keywords=count_keywords, word_boundary=word_boundary)
df_count

In [None]:
#plot the keyword counts computed in the previous cell
frequency_visualisation(count_object=df_count, title="Word Frequency")

### Keyword Counts by Year

#### Not affected by global text filters, but affected by others

In [None]:
#count the count_keywords by year over the same dataframe as the overall keyword counts
year_word_counts_df = count_keywords_by_year(df_dedup_non_text_filt, column="text", 
                                            keywords=count_keywords, word_boundary=word_boundary)
year_word_counts_df

In [None]:
#plot the yearly total for the chosen keyword
#selecting a missing column after groupby raises a KeyError, so check first and report the
#problem the same way the co-frequency cells below do
#(presumably year_word_counts_df has one column per counted keyword - confirm)
if context_visual_keyword in year_word_counts_df.columns:
    keyword_count_year = year_word_counts_df.groupby("year")[context_visual_keyword].sum()
    #turn the summed series into (year, count) pairs for the plotting helper
    keyword_count_year = list(keyword_count_year.items())
    frequency_visualisation(count_object=keyword_count_year, title=f'Frequency of "{context_visual_keyword}" per Year')
else:
    print("Context visual keyword is not in the count keywords list")

### Sentence and Report Co-Frequency Counts

#### Not affected by global text filters, but affected by others

In [None]:
#co-occurrence counts per keyword: words sharing a report / a sentence with it
co_freq_count = {}
sent_co_freq_count = {}
#build the stopword lookup once; testing membership against the list would rescan it
#for every counted word of every keyword
stopwords_lookup = set(stopwords_ls)

for count_word in count_keywords:
    #keep only the rows of sents_df_dedup_non_text whose sentence contains the keyword,
    #then count the words that occur in those sentences
    sent_word_df = filter_dataframe_by_keywords(dataframe=sents_df_dedup_non_text, column="sentence",
                                                keywords=[count_word], word_boundary=word_boundary,
                                                match_all=True)
    sent_word_count = dataframe_cooccurrance_count(dataframe=sent_word_df, column="sentence",
                                                   word=count_word)

    #same again at report level, over the full report text
    word_df = filter_dataframe_by_keywords(dataframe=df_dedup_non_text_filt, column="text",
                                           keywords=[count_word], word_boundary=word_boundary,
                                           match_all=True)
    word_count = dataframe_cooccurrance_count(dataframe=word_df, column="text",
                                              word=count_word)

    if stopwords_removal:
        sent_word_count = [(k, v) for k, v in sent_word_count if k not in stopwords_lookup]
        word_count = [(k, v) for k, v in word_count if k not in stopwords_lookup]
    #keep the first 15 entries of each count
    #NOTE(review): assumes the counts arrive sorted by frequency - confirm
    sent_co_freq_count[count_word] = sent_word_count[:15]
    co_freq_count[count_word] = word_count[:15]

In [None]:
#show the report-level co-frequency counts for every keyword
co_freq_count

In [None]:
#plot the report-level co-frequency counts for the chosen keyword,
#which only exist when it is one of the count keywords
if context_visual_keyword not in co_freq_count:
    print("Context visual keyword is not in the count keywords list")
else:
    report_visual_words = co_freq_count[context_visual_keyword]
    frequency_visualisation(
        count_object=report_visual_words,
        title=f'Same Report as "{context_visual_keyword}"',
    )

In [None]:
#show the sentence-level co-frequency counts for every keyword
sent_co_freq_count

In [None]:
#plot the sentence-level co-frequency counts for the chosen keyword,
#which only exist when it is one of the count keywords
if context_visual_keyword not in sent_co_freq_count:
    print("Context visual keyword is not in the count keywords list")
else:
    sentence_visual_words = sent_co_freq_count[context_visual_keyword]
    frequency_visualisation(
        count_object=sentence_visual_words,
        title=f'Same Sentence as "{context_visual_keyword}"',
    )

### Sentence Keywords Context

In [None]:
#print every sentence that contains all of the context keywords, with its metadata
if not context_keywords:
    print("Context keywords list is empty")
else:
    sentence_keywords_df = filter_dataframe_by_keywords(
        dataframe=sents_df_non_text,
        column="sentence",
        keywords=context_keywords,
        word_boundary=word_boundary,
        match_all=True,
    )

    if sentence_keywords_df.empty:
        print("Sentence context search returned no results")
    else:
        #pull the columns to print out as parallel lists
        speech_list, location_list, report_list, publication_list, date_list, sentence_list = (
            sentence_keywords_df[column_name].to_list()
            for column_name in ('speech_id', 'place', 'filename', 'publication', 'date', 'sentence')
        )

        #one metadata line, then the sentence, then blank lines as a separator
        rows = zip(speech_list, location_list, report_list, publication_list, date_list, sentence_list)
        for speech, place, report, pub, date, sent in rows:
            print(report, pub, speech, place, date)
            print(sent)
            print('\n')

### Report Keywords Context

In [None]:
#print every report whose text contains all of the context keywords, with its metadata
if not context_keywords:
    print("Context keywords list is empty")
else:
    report_keywords_df = filter_dataframe_by_keywords(
        dataframe=df_all_non_text_filt,
        column="text",
        keywords=context_keywords,
        word_boundary=word_boundary,
        match_all=True,
    )

    if report_keywords_df.empty:
        print("Report context search returned no results")
    else:
        #pull the columns to print out as parallel lists
        speech_list, location_list, report_list, publication_list, date_list, text_list = (
            report_keywords_df[column_name].to_list()
            for column_name in ('speech_id', 'place', 'filename', 'publication', 'date', 'text')
        )

        #one metadata line, then the report text, then blank lines as a separator
        rows = zip(speech_list, location_list, report_list, publication_list, date_list, text_list)
        for speech, place, report, pub, date, text in rows:
            print(report, pub, speech, place, date)
            print(text)
            print('\n')

## Network Analysis

In [None]:
#each row of the filtered dataframe becomes one [publication, place] edge
edges = df_all_text_filt[['publication', 'place']].values.tolist()
network_output = None
if edges:
    network_output = network_graph(edges, edge_weights=True)
else:
    #nothing to draw: say so instead of also passing an empty edge list to network_graph
    print("Network is empty, try again with different filters and/or parameters")
#kept as the last expression so the notebook still displays whatever network_graph
#returned; a None value displays nothing
network_output

## Topic Modelling

In [None]:
#set up the BERTopic topic model, which uses a sentence transformers model to calculate topics
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    nr_topics="auto",
    top_n_words=15,
    min_topic_size=15,
)
#message shown by the cells below when no topic model is available
topic_error_msg = "The topics list has too few topics to process. Try again with different filters and/or parameters."

In [None]:
#work on a copy so the filtered sentence dataframe keeps its original text
topic_model_df = sents_df_dedup_text_filt.copy()
#optionally strip stopwords from every sentence; nothing to do for an empty dataframe
if stopwords_removal and not topic_model_df.empty:
    topic_model_df['sentence'] = topic_model_df['sentence'].apply(remove_stopwords, args=(stopwords_ls,))

In [None]:
#fit the topic model; model_data stays None whenever no usable model is produced
model_data = None
if not topic_model_df.empty:
    topics_data_all = bertopic_topics(topic_model_df, topic_model)
    if topics_data_all:
        #the third item of the result is what the cells below use as the model
        model_data = topics_data_all[2]
#report the failure in every case, not only for an empty dataframe:
#previously an empty result from bertopic_topics was silent
if not model_data:
    print(topic_error_msg)

### Topics List

In [None]:
#show the topics, or the error message when no model is available
output = model_data.get_topics() if model_data else topic_error_msg
output

### Clustering Groups

In [None]:
#show the topic hierarchy visualisation, or the error message when no model is available
output = model_data.visualize_hierarchy() if model_data else topic_error_msg
output

### Topics over Time

In [None]:
#show the topics over time, or the error message when no model is available
output = bertopic_time(topic_model_df, model_data) if model_data else topic_error_msg
output