# BERTopic and Parnell

Import additional Python Libraries

In [47]:
#libraries for getting file paths and data extraction from files
import glob
import os
from pathlib import Path
from natsort import natsorted
from natsort import os_sorted
#libraries for data analysis and manipulation
import pandas as pd
import string
import re
import cufflinks as cf
from bs4 import BeautifulSoup
from datetime import datetime
import cufflinks as cf
#nlp libraries
import spacy
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from bertopic import BERTopic
#libraries for visualisations
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

#optimise notebook and spacy settings
#load the small English spaCy pipeline with its "parser" and "ner" components disabled
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
#switch cufflinks to offline plotting
cf.go_offline()
#NOTE(review): offline=False is written to the cufflinks config straight after go_offline() - confirm which is intended
cf.set_config_file(offline=False, world_readable=True)
#presumably set to silence the tokenizers parallelism warning - TODO confirm
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#initialise plotly for use inside the notebook
init_notebook_mode(connected=True) 

## Functions

Functions to extract data from TEI files using Beautiful Soup.

In [48]:
def soup_objects(file_paths):
    '''Takes either a single file path or a list/tuple of file paths.
    Each path may be a pathlib.Path or a plain string.
    Returns a list of beautiful soup objects (one per path, in input
    order) for a list/tuple, or a single beautiful soup object for a
    single path.
    '''
    def parse(path):
        # Path() leaves Path objects as they are and lets string paths work too
        with Path(path).open("r", encoding="utf-8") as xml:
            return BeautifulSoup(xml, "lxml-xml")

    # isinstance also accepts tuples and list subclasses, which the previous
    # type(...) == list check pushed down the single-path branch
    if isinstance(file_paths, (list, tuple)):
        return [parse(path) for path in file_paths]
    return parse(file_paths)

In [49]:
def tei_extractor(soup_obj, element, attributes=False):
    '''Takes a Beautiful soup object or a list of objects, an element
    name and, where necessary, a list of attribute names the element
    must carry.
    For a list of objects, returns a list holding the first matching
    element of each object. For a single object, returns all matching
    elements of that object.
    '''
    # each required attribute name is mapped to True in the filter dictionary
    required = {name: True for name in attributes} if attributes else {}

    if type(soup_obj) is not list:
        return soup_obj.find_all(element, required)
    return [soup.find(element, required) for soup in soup_obj]

In [50]:
def tei_values(object_list, attribute=False):
    '''Takes a list of beautiful soup elements and, optionally, the
    name of an attribute.
    Returns the value of that attribute for each element when a name
    is given, otherwise the text of each element.
    '''
    if not attribute:
        return [element.get_text() for element in object_list]
    return [element[attribute] for element in object_list]

Functions to perform text cleaning and convert results into dataframe format.

In [51]:
def text_cleaning(text):
    '''Takes as input a string. Swaps non-breaking spaces, pipes, newlines,
    hyphens, dashes and underscores for spaces, writes "&" as "and",
    straightens curly apostrophes, drops possessive "'s", deletes
    standalone numbers and collapses repeated whitespace.
    Returns the clean, stripped string.
    '''
    # applied in order: apostrophes are straightened before "'s " is dropped
    substitutions = (
        (u"\xa0", u" "), ("&", "and"), ("|", " "),
        ("\n", " "), ("’", "'"), ("'s ", ' '),
        ("-", " "), ("–", " "), ("_", " "), ("—", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    # remove whole-number tokens, then squeeze the whitespace left behind
    without_numbers = re.sub(r"\b\d+\b", "", text)
    return re.sub(r"\s\s+", " ", without_numbers).strip()

In [52]:
def create_dataframe(data, columns):
    '''Takes as input a list of lists of data (one inner list per
    column) and a list of column names in the same order.
    Returns a dataframe.
    '''
    # each inner list starts out as a row, so flip rows and columns
    frame = pd.DataFrame(data).T
    frame.columns = columns
    return frame

In [53]:
def dataframe_cleaning(dataframe, clean_column=None):
    '''Takes as input dataframe and makes every string cell lowercase,
    standardises curly apostrophes and strips leading and trailing spaces.
    Applies the text cleaning function to one column if it is named in the
    clean_column parameter.
    Cells that are not strings (e.g. missing values) are left as they are.
    Returns a new lowercase/cleaned dataframe; the input is not modified.
    '''
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
    # name is deprecated, so use whichever one this pandas version provides
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap

    def normalise(value):
        # lowercase and straighten apostrophes in a single pass
        return value.lower().replace("’", "'") if isinstance(value, str) else value

    def strip(value):
        return value.strip() if isinstance(value, str) else value

    cleaned = elementwise(dataframe, normalise)
    if clean_column:
        cleaned[clean_column] = cleaned[clean_column].apply(
            lambda value: text_cleaning(value) if isinstance(value, str) else value)
    # stripping comes last, as before, so it also tidies the cleaned column
    return elementwise(cleaned, strip)

In [54]:
def punct_removal(text):
    '''Takes as input a string, deletes every character found in
    string.punctuation, collapses repeated whitespace and trims both ends.
    Returns string without punctuation.
    '''
    # the third maketrans argument lists the characters to delete
    delete_punct = str.maketrans(" ", " ", string.punctuation)
    stripped = text.translate(delete_punct)
    return re.sub(r"\s\s+", " ", stripped).strip()

Functions to perform topic modelling and sentence/year counts then visualise the results.

In [55]:
def bertopic_time(dataframe, topic_model):
    '''Takes as input a dataframe and Bertopic topic model tool.
    Reads the "sentence" and "date" columns as lists and fits the topic
    model to the sentences, then groups the sentences by the topic
    assigned to each of them.
    Sends the sentences/dates to topics_over_time and draws the result
    for the top 15 topics.
    Returns a tuple of (visualisation, topic/sentence dictionary).
    '''
    sentences = dataframe["sentence"].to_list()
    dates = dataframe["date"].to_list()
    # the second value returned by fit_transform is not used here
    assigned_topics, _ = topic_model.fit_transform(sentences)

    # one empty bucket per distinct topic, then filled in sentence order
    topic_docs = dict((topic, []) for topic in set(assigned_topics))
    for topic, sentence in zip(assigned_topics, sentences):
        topic_docs[topic].append(sentence)

    over_time = topic_model.topics_over_time(docs=sentences, timestamps=dates)
    fig = topic_model.visualize_topics_over_time(over_time, top_n_topics=15, height=500, width=1000)
    fig.update_layout(yaxis_title = "Count")
    return (fig, topic_docs)

In [56]:
def year_counts(dataframe):
    '''Takes as input a dataframe with a datetime "date" column.
    The input dataframe is left untouched: the year is worked out on the
    fly instead of being stored as a new "year" column on it.
    Returns a series, indexed by "year", with number of sentences per year.
    '''
    # group by a plain array of years so nothing is written to the input
    # and no index alignment is involved
    years = dataframe["date"].dt.year.to_numpy()
    return dataframe.groupby(years).size().rename_axis("year")

In [57]:
def year_counts_plot(year_counts):
    '''Takes as input a year count series.
    Returns a plotly express bar chart of number of sentences per year,
    with the legend hidden.
    '''
    axis_labels = {"value": "Count", "year": "Year"}
    chart = px.bar(year_counts, title="Sentence Count by Year", labels=axis_labels)
    chart.update_layout(showlegend=False)
    return chart

Functions to alter the data prior to input in topic modelling: lemmatization, removal of stopwords, narrow dataframe by date window or keyword search.

In [58]:
def lemmatization(text):
    '''Takes string, runs it through the spaCy pipeline and returns
    the lemmas of its tokens joined by single spaces'''
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemmas)

In [59]:
def remove_stopwords(text, stopwords):
    ''' Take as input string and a list (or set) of stopwords, tokenizes
    string on whitespace and removes words contained in stopwords.
    Matching is exact and case-sensitive.
    Returns re-joined string without stopwords.
    '''
    # a set gives constant-time membership checks, whereas a list is scanned
    # again for every token; a set passed in by the caller is used as it is
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    return ' '.join(token for token in text.split() if token not in stopwords)

In [60]:
def dataframe_date_window(dataframe, start_date, end_date):
    '''Takes as input dataframe, start date and end date.
    Returns dataframe with just the rows where the date is on/between
    the start date and end date (both ends included).
    '''
    dates = dataframe['date']
    # both bounds are inclusive, as documented; the start bound used to be
    # strict (>), which dropped rows dated exactly on start_date
    in_window = (dates >= start_date) & (dates <= end_date)
    return dataframe.loc[in_window]

In [61]:
def dataframe_keyword(dataframe, keywords):
    '''Takes as input dataframe and a list of keywords.
    Keywords are matched literally as whole words (special regex
    characters in a keyword are escaped).
    Returns a new dataframe with just rows where sentence contains any
    of the words in keywords, keywords removed from sentences in
    which they appear. The input dataframe is not modified.
    '''
    # escape each keyword so characters such as "." only match themselves
    boundary_list = [fr'\b{re.escape(str(word))}\b' for word in keywords]
    if not boundary_list:
        # an empty pattern would match every sentence, so return no rows
        return dataframe.iloc[0:0].copy()

    pattern = '|'.join(boundary_list)
    has_keyword = dataframe['sentence'].str.contains(pattern, regex=True, na=False).astype(bool)
    # copy the selection so the assignment below writes to an independent
    # frame instead of a slice of the input (avoids SettingWithCopyWarning)
    keyword_df = dataframe.loc[has_keyword].copy()
    keyword_df['sentence'] = keyword_df['sentence'].str.replace(pattern, '', regex=True).str.strip()
    return keyword_df

## Data Extraction

We begin by extracting the data we need from the speech report files and the speech register file.

In [62]:
# directory holding the speech report files
dir_path = Path('sources/')
# keep only regular files whose name ends in ".xml" (case-insensitive)
xml_files = (entry for entry in dir_path.iterdir()
             if entry.is_file() and entry.name.lower().endswith('.xml'))
# sort the xml file paths using os_sorted from the natsort library
xml_files = os_sorted(xml_files)

# path of the speech register file
speech_file = Path('speeches/parnell_speeches.xml')

# strip the directory and cut each name at its first "." so the bare
# file name can be used later in the dataframe
filenames = []
for path in xml_files:
    filename = os.path.basename(path).split(".")[0]
    filenames.append(filename)

In [63]:
#extract speech reports and speech register as beautiful soup objects using function
#xml_files is a list of paths, so report_objects is a list with one soup object per report file
report_objects = soup_objects(xml_files)
#speech_file is a single path, so speech_object is a single soup object
speech_object = soup_objects(speech_file)

In [64]:
#speech register beautiful soup objects extracted as lists using function
#speech_object is a single soup object, so each call returns every matching element in the register
speech_objs = tei_extractor(speech_object, element='speech_id')
place_objs = tei_extractor(speech_object, element='place', attributes=['key'])
date_objs = tei_extractor(speech_object, element='date', attributes=['when'])

#speech report beautiful soup objects extracted as lists using function
#report_objects is a list, so each call returns the first matching element of each report file
speech_rep_objs = tei_extractor(report_objects, element='term', attributes=['key'])
publication_objs = tei_extractor(report_objects, element='title', attributes=['key', 'level'])
text_objs = tei_extractor(report_objects, element='body')

In [65]:
#use function to extract values from tei elements extracted above
#speech register: element text for ids and places, the 'when' attribute for dates
speech_ids = tei_values(speech_objs)
speech_places = tei_values(place_objs)
speech_dates = tei_values(date_objs, attribute='when')

#speech reports: the 'key' attribute for speech ids, element text for publications and report bodies
speech_rep_ids = tei_values(speech_rep_objs, attribute='key')
publications = tei_values(publication_objs)
texts = tei_values(text_objs)

#for speech ids, if more than one id present, shown by inclusion of comma, convert into a list
#entries without a comma stay as plain strings, so the result mixes strings and lists
speech_rep_ids = [item.split(',') if ',' in item else item for item in speech_rep_ids]

## Data Preparation

The next stage is to convert the speech data into a dataframe format where we can easily manipulate it and get different subsets prior to analysis/visualisation.

We begin by converting the speech report data into a dataframe and performing some data cleaning on the text to standardise it and make it more suitable for data analysis. This includes making the text lower case, removing extra spacing and some special characters.

In [66]:
#prepare data for speech report dataframe, make list of lists of data and list of column names
#the four lists are expected to line up position by position, one entry per report file
speech_rep_data = [filenames, speech_rep_ids, publications, texts]
speech_rep_columns = ['filename', 'speech_id', 'publication', 'sentence']
#use function to turn the above lists into dataframe
speech_rep_df = create_dataframe(speech_rep_data, speech_rep_columns)
#reports linked to several speech ids (held as a list) get one row per speech id
speech_rep_df = speech_rep_df.explode('speech_id')
#use function to clean the text in the dataframe and standardise it
speech_rep_df = dataframe_cleaning(speech_rep_df, clean_column='sentence')
speech_rep_df

Unnamed: 0,filename,speech_id,publication,sentence
0,parnell_source_00001,speech_00001,the nation,the home rule league great meeting in the rotu...
1,parnell_source_00002,speech_00001,the freeman's journal,the home rule league on saturday evening a pub...
2,parnell_source_00003,speech_00001,the nation,"the week ""though beaten, we are not vanquished..."
3,parnell_source_00004,speech_00001,the irish times,irish home rule league a public meeting of the...
4,parnell_source_00005,speech_00002,the freeman's journal,"mr. charles stewart parnell, in seconding the ..."
...,...,...,...,...
656,parnell_source_00660,speech_00396,the freeman's journal,"fellow countrymen and fellow citizens, it is n..."
657,parnell_source_00661,speech_00397,the freeman's journal,"mr. chairman, fellow citizens, and people of t..."
658,parnell_source_00662,speech_00398,the freeman's journal,"people of mallow, i certainly did not expect t..."
659,parnell_source_00663,speech_00399,the freeman's journal,"people of dungarvan, i will, through you, expr..."


We then use the sentence tokenizer to divide each report text into sentences before amending the dataframe so that each sentence has its own row with the appropriate report data for that sentence.

Having done this, we remove punctuation from all sentences in the sentence column and remove rows where the sentence has three words or fewer, as these tend not to be proper sentences (crowd reactions etc.).

In [67]:
#set up the nltk punkt parameters that will be handed to the sentence tokenizer
#abbreviations listed here stop the tokenizer from reading their full stops as sentence-enders
punkt_param = PunktParameters()
#our own abbreviation words, e.g. "hon." and "mr." frequently have full stops in the reports
punkt_param.abbrev_types = {
    'hon', 'mr', 'rev', 'dr', 'm.p', 'c.s', 'c.v', 'c.e', 't.l', 'j.r', 'j.j', 'a.j',
    'r.b', 'j.g', 'j.l', 'j.r', 'j.f', 'n.b', 'p.j', 'c.j', 't.d', 'r', 'p.p', 'l.p', 'c.c', 'wm',
    'capt', 'messrs', 'patk', '1d', '2d', '3d', '4d', '5d', '6d', '7d', '8d', '9d', '10d', '11d',
    '1/2d', '3/4d', 'prof', 'per cent', 'adm', '2s', '1,400,000/', '400,000/',
}

#nltk sentence detector used to divide each text into sentences
sentence_tokenizer = PunktSentenceTokenizer(punkt_param)

#turn each report text into a list of sentences
speech_rep_df['sentence'] = speech_rep_df['sentence'].apply(sentence_tokenizer.tokenize)
#explode the sentence column so that each sentence gets its own row
speech_rep_df = speech_rep_df.explode('sentence')
#with the text divided into sentences, punctuation can now be removed using function
speech_rep_df['sentence'] = speech_rep_df['sentence'].apply(punct_removal)
#keep only sentences of more than 3 words, shorter ones are likely to be crowd reactions etc
speech_rep_df = speech_rep_df.loc[
    speech_rep_df['sentence'].map(lambda sentence: len(sentence.split()) > 3)
]
speech_rep_df

Unnamed: 0,filename,speech_id,publication,sentence
0,parnell_source_00001,speech_00001,the nation,the home rule league great meeting in the rotu...
0,parnell_source_00001,speech_00001,the nation,there was an immense attendance the platform t...
0,parnell_source_00001,speech_00001,the nation,mr charles stewart parnell high sheriff of wic...
0,parnell_source_00001,speech_00001,the nation,the following report of the proceedings is tak...
0,parnell_source_00001,speech_00001,the nation,in view of the unwise course adopted by our op...
...,...,...,...,...
660,parnell_source_00664,speech_00400,the freeman's journal,the application is a perfectly disgraceful one...
660,parnell_source_00664,speech_00400,the freeman's journal,we have had a good legal opinion that all the ...
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman said they had already sent out fo...
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman you must settle that yourselves


We then convert the speech register into a dataframe before applying some data cleaning to make lowercase and remove spacing. We also convert dates into datetime format.

In [68]:
#column-wise data and column names for the speech register dataframe
speech_data = [speech_ids, speech_places, speech_dates]
speech_columns = ["speech_id", 'place', 'date']
#build the dataframe from the lists above using function
speech_df = create_dataframe(speech_data, speech_columns)
#standardise the text in the dataframe using function
speech_df = dataframe_cleaning(speech_df)
#speech id becomes the index so it can be used when the dataframes are merged below
speech_df = speech_df.set_index('speech_id')
#dates become datetime values, which lets us manipulate the dataframe using dates
speech_df = speech_df.assign(date=pd.to_datetime(speech_df['date'], format="%Y-%m-%d"))
speech_df

Unnamed: 0_level_0,place,date
speech_id,Unnamed: 1_level_1,Unnamed: 2_level_1
speech_00001,"dublin, ireland",1874-07-11
speech_00002,"dublin, ireland",1875-01-21
speech_00003,"dublin, ireland",1875-01-22
speech_00004,"navan, ireland",1875-04-12
speech_00005,"london, england",1875-04-26
...,...,...
speech_00396,"cork, ireland",1881-10-02
speech_00397,"cork, ireland",1881-10-02
speech_00398,"mallow, ireland",1881-10-03
speech_00399,"dungarvan, ireland",1881-10-05


Having converted our speech reports and register into dataframes, we then join the dataframes together using the speech id contained in both dataframes. We end up with each sentence row in the final dataframe containing the appropriate speech data as well.

In [69]:
#join the speech register and speech report dataframes indexing on speech_id
#the report's speech_id column is matched against the register's index
#no `how` is given, so pandas' default inner join applies: report rows whose speech_id is missing from the register are dropped
df_all = speech_rep_df.merge(speech_df, left_on='speech_id', right_index=True)
df_all

Unnamed: 0,filename,speech_id,publication,sentence,place,date
0,parnell_source_00001,speech_00001,the nation,the home rule league great meeting in the rotu...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,there was an immense attendance the platform t...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,mr charles stewart parnell high sheriff of wic...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,the following report of the proceedings is tak...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,in view of the unwise course adopted by our op...,"dublin, ireland",1874-07-11
...,...,...,...,...,...,...
660,parnell_source_00664,speech_00400,the freeman's journal,the application is a perfectly disgraceful one...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,we have had a good legal opinion that all the ...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman said they had already sent out fo...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman you must settle that yourselves,"waterford, ireland",1881-10-05


## Topic Modelling

Having prepared our main dataframe, we are now able to begin our topic modelling stage, which will involve manipulating the data in different ways before performing topic modelling on the resulting data subsets and getting visualisations of the results.

Before we do this, however, we first do a count of the number of sentences in each year of the dataset to use alongside our topic modelling process.

In [70]:
#make copy of main dataframe
#working on a copy means df_all itself is never modified by the counting step
df_year_counts = df_all.copy()
#use function to get a count of all sentences by year
year_counts_all = year_counts(df_year_counts)
#use function to get visualisation of year/sentence count
report_sent_nums = year_counts_plot(year_counts_all)
report_sent_nums

We first initialize our topic model, setting our embedding model, which is a Sentence Transformers model, alongside the minimum topic size, which is the number of times a topic needs to occur in order to be included. Other parameters are also available and can be seen on the BERTopic website.

Sentence Transformers creates embeddings for all the input sentences in our dataset and uses the similarity of these embeddings to calculate sentence similarity. In the context of BERTopic these embeddings are used to calculate topics.

BERTopic is stochastic by nature and this means that every time the algorithm is run we get different results, although the difference between runs should ideally be relatively small.

In [71]:
#initialize BERTopic topic model with parameters, uses a sentence transformers model to calculate topics
#this one model object is passed to every bertopic_time call below, and each call fits it again
#NOTE(review): no random seed is set, so topics can differ between runs - confirm whether reproducible output is needed
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", nr_topics="auto", min_topic_size=15)

Our first visualisation shows the topics over time for the whole dataset.

In [72]:
#use function to get visualisation of topics for the whole dataset
#bertopic_time returns (figure, topic/sentence dictionary)
all_vis_obj = bertopic_time(df_all, topic_model)
#index 0 is the figure
all_vis = all_vis_obj[0]
all_vis

Next we lemmatize all of the sentences, so that words are reduced to their root forms - e.g. 'running' becomes 'run'. This reduces unnecessary variation in the dataset as related words are treated as being the same. Having performed lemmatization we then visualize topics over time for the lemmatized dataset.

In [73]:
#work on a copy so the original dataframe keeps its unlemmatized sentences
df_lemma = df_all.copy()
#lemmatize every sentence in the dataframe using function
df_lemma['sentence'] = df_lemma['sentence'].apply(lemmatization)
df_lemma

Unnamed: 0,filename,speech_id,publication,sentence,place,date
0,parnell_source_00001,speech_00001,the nation,the home rule league great meeting in the rotu...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,there be an immense attendance the platform th...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,mr charles stewart parnell high sheriff of wic...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,the following report of the proceeding be take...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,in view of the unwise course adopt by our oppo...,"dublin, ireland",1874-07-11
...,...,...,...,...,...,...
660,parnell_source_00664,speech_00400,the freeman's journal,the application be a perfectly disgraceful one...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,we have have a good legal opinion that all the...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman say they have already send out fo...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,the chairman you must settle that yourself,"waterford, ireland",1881-10-05


In [74]:
#use function to get visualisation of topics for the whole dataset with lemmatized text
#bertopic_time returns (figure, topic/sentence dictionary)
lemma_vis_obj = bertopic_time(df_lemma, topic_model)
#index 0 is the figure
lemma_vis = lemma_vis_obj[0]
lemma_vis

Next we remove stopwords from the lemmatized sentence dataframe. Stopwords are words that tend to carry little specific meaning within a dataset because they appear so frequently. We can also extend the list with our own stopwords specific to a dataset, such as crowd reactions in this case. Having done this, we visualize topics over time with our new data.

In [75]:
#initialize stopwords list
stop_words = stopwords.words('english')
#extend stop words to include extra words
#every word needs its own comma: adjacent string literals are joined into a single
#string, so 'groan' 'could' previously added the one entry 'groancould' and neither
#'groan' nor 'could' was filtered out
stop_words.extend(['every', 'would', 'cheer', 'hiss', 'applause', 'groan', 'could', 'upon', 'may', 'go',
                   'said', 'say', 'know', 'far', 'come', 'put', 'us', 'parnell', 'ireland', 'irish', 'mp',
                   'mr', 'dr', 'laughter', 'laugh', 'italics', 'irishman'])

#copy from lemmatized dataframe
df_stop = df_lemma.copy()
#use function to remove stopwords from all sentences in dataframe
#NOTE(review): matching is case-sensitive and the displayed output still shows a capital "I" - confirm whether that is wanted
df_stop['sentence'] = df_stop['sentence'].apply(lambda x: remove_stopwords(x, stop_words))
df_stop

Unnamed: 0,filename,speech_id,publication,sentence,place,date
0,parnell_source_00001,speech_00001,the nation,home rule league great meeting rotundo great s...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,immense attendance platform gallery admission ...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,charles stewart high sheriff wicklow occupy ch...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,following report proceeding take somewhat abri...,"dublin, ireland",1874-07-11
0,parnell_source_00001,speech_00001,the nation,view unwise course adopt opponent notably engl...,"dublin, ireland",1874-07-11
...,...,...,...,...,...,...
660,parnell_source_00664,speech_00400,the freeman's journal,application perfectly disgraceful one one I sa...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,good legal opinion sale since passing land act...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,chairman already send form request particular ...,"waterford, ireland",1881-10-05
660,parnell_source_00664,speech_00400,the freeman's journal,chairman must settle,"waterford, ireland",1881-10-05


In [76]:
#use function to get visualisation of topics for the whole dataset with lemmatized text and stopwords removed
#bertopic_time returns (figure, topic/sentence dictionary)
stop_vis_obj = bertopic_time(df_stop, topic_model)
#index 0 is the figure
stop_vis = stop_vis_obj[0]
stop_vis

Sometimes reports for the same speech are more or less identical to other reports, due to them coming from the same original transcription. In order to prevent repetition from affecting the topic modelling process, we can remove instances of sentence repetition where the sentences are related to the same speech from our dataframe. We end up with a lemmatized dataframe with stopwords removed and repeated sentences related to the same speech removed. Having done this we visualize topics over time with our new data.

In [77]:
#drop duplicate sentences if they are related to same speech id to remove reports which are copies of others
#a row counts as a duplicate only when both its speech_id and its sentence match an earlier row
df_drop = df_stop.drop_duplicates(subset=['speech_id', 'sentence'])
#use function to get visualisation of topics for the dataset with repetition removed with lemmatized text, stopwords
#bertopic_time returns (figure, topic/sentence dictionary); index 0 is the figure
drop_vis_obj = bertopic_time(df_drop, topic_model)
drop_vis = drop_vis_obj[0]
drop_vis

Having performed various manipulations on the sentences, we can further narrow the data by time period to see how this affects the topics visualised over time.

In [78]:
#use function to restrict dataframe to a date window
#the bounds are passed as 'YYYY-MM-DD' strings and compared against the datetime 'date' column
df_80_81 = dataframe_date_window(df_drop, '1880-01-01', '1881-12-31')
#use function to get visualisation of topics for the date window with lemmatized text, stopwords, repetition removed
#bertopic_time returns (figure, topic/sentence dictionary); index 0 is the figure
df_80_81_vis_obj = bertopic_time(df_80_81, topic_model)
df_80_81_vis = df_80_81_vis_obj[0]
df_80_81_vis

Our final visualisation takes our dataframe with lemmatized text, stopwords and repetition related to same speech removed and narrows by sentences containing any of a list of keywords. We end up with a visualisation of topics over time for these keywords.

In [79]:
#keywords to search for, matched as whole words by dataframe_keyword
keywords = ['land', 'landlord']
#use function to restrict dataframe to rows where sentence contains any of keywords
#the matched keywords are also removed from the sentences that are kept
df_keyword = dataframe_keyword(df_drop, keywords)
#use function to get visualisation of topics for keyword(s) with lemmatized text, stopwords, repetition removed
#bertopic_time returns (figure, topic/sentence dictionary); index 0 is the figure
df_keyword_vis_obj = bertopic_time(df_keyword, topic_model)
df_keyword_vis = df_keyword_vis_obj[0]
df_keyword_vis