Import libraries to use for cleaning data and preliminary data analysis

In [1]:
import glob
import string
import spacy
import os
import re
import pickle
import pandas as pd
import cufflinks as cf

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from pathlib import Path
from natsort import os_sorted
from bs4 import BeautifulSoup
from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

init_notebook_mode(connected=True) 

Function to extract the content of the TEI files, each containing a letter and its metadata. Returns a list of Beautiful Soup objects suitable for parsing using the tags within the files.

In [2]:
def soup_objects(file_paths):
    soup_list = []
    for path in file_paths:
        with path.open("r", encoding="utf-8") as xml:
            source = BeautifulSoup(xml, "lxml-xml")
            soup_list.append(source)
    return soup_list

Function which takes a list of file paths from the letters directory, sends it to the function which creates Beautiful soup objects and then extracts the body text from each letter file. The body texts are then cleaned up to remove special characters, newlines, extra spaces, numerals and are also lower-cased. The object being to standardise the texts for better processing. Returns a list of clean body texts.

In [3]:
def text_cleaning(file_paths):
    soup_list = soup_objects(file_paths)
    clean_texts = []
    for obj in soup_list:
        body = obj.find("body")
        text = body.get_text()
        text = text.replace(u"\xa0", u" ").replace("&", "and").replace("|", " ").replace("\n", " ")
        text = text.replace("-", " "). replace("–", " ").replace("_", " ").replace("—", " ")
        text = text.lower()
        text = re.sub(r"\b\d+\b", "", text)
        text = re.sub(r"\s\s+", " ", text)
        text = text.strip()
        clean_texts.append(text)
    return clean_texts

Function which takes a list of file paths from the letters directory, sends it to the function which creates Beautiful soup objects and then extracts the date for each letter. Where there is a date range the function calculates an average. Returns a list of dates for each letter.

In [4]:
def date_extract(file_paths):
    soup_list = soup_objects(file_paths)
    corpus_dates = []
    for obj in soup_list:
        try:
            doc_date = obj.find("date")["when"].strip()
            doc_date = datetime.strptime(doc_date, "%Y-%m-%d")
            corpus_dates.append(doc_date)
        except KeyError:
            first_date = obj.find("date")["notBefore"].strip()
            last_date = obj.find("date")["notAfter"].strip()
            first_date = datetime.strptime(first_date, "%Y-%m-%d")
            last_date = datetime.strptime(last_date, "%Y-%m-%d")
            doc_date = first_date + (last_date - first_date)/2
            corpus_dates.append(doc_date)
    return corpus_dates

Function which takes a list of file paths from the letters directory, sends it to the function which creates Beautiful soup objects and then extracts the correspondents for each letter. Returns a list of sender/correspondent tuples.

In [5]:
def correspondents(file_paths):
    soup_list = soup_objects(file_paths)
    senders_recipients = []
    for obj in soup_list:
        sender = obj.find("correspAction", {"type": "sent"}).find("persName")
        sender_text = sender.get_text()
        recipient = obj.find("correspAction", {"type": "received"}).find("persName")
        recipient_text = recipient.get_text()
        sender_recipient = (sender_text, recipient_text)
        senders_recipients.append(sender_recipient)
    return senders_recipients

Function which takes a dataframe of letter data, extracts the word count of each document and returns a bar plot of document lengths.

In [6]:
def doc_length_plot(dataframe):
    doc_len = dataframe["text"].copy()
    doc_len = doc_len.apply(len).value_counts()
    doc_len.sort_index(inplace=True)
    return doc_len.iplot(
    kind="bar",
    xTitle="Word Count",
    linecolor="Black",
    yTitle="Document Count",
    title="Document Length Distribution"
    )

Function which takes a dataframe of letter data and counts the numbers of all words across the whole corpus. Returns a bar plot of the words which appear most often.

In [7]:
def word_frequency_plot(dataframe):
    word_freq = dataframe["text"].copy()
    word_freq = word_freq.explode().value_counts()
    return word_freq[:50].iplot(
    kind="bar",
    linecolor="Black",
    yTitle="Word Count",
    title="Most Regular Words"
    )

Function which takes a dataframe of letter data. Takes letter texts and converts into Spacy nlp object, which is then converted into parts-of-speech tags for each word in each text. The parts-of-speech across the whole corpus are then counted up. Returns a bar plot of different parts-of-speech and how often they appear in the corpus.

In [8]:
def pos_count_plot(dataframe):
    pos_count = dataframe["text"].copy()
    pos_count = pos_count.map(" ".join).map(nlp)
    pos_count = pos_count.map(lambda x: [token.pos_ for token in x])
    pos_count = pos_count.explode().dropna().value_counts()
    return pos_count.iplot(
    kind="bar",
    linecolor="Black",
    yTitle="Count",
    xTitle="Part-of-Speech",
    title="Part-of-Speech Tags by Number",
    ) 

Function which takes a string and removes all punctuation from it, then removes any extra spacing. Returns string without punctuation.

In [9]:
def punct_removal(text):
    text = text.replace(".", " ")
    text = text.translate(str.maketrans(" ", " ", string.punctuation))
    text = re.sub(r"\s\s+", " ", text)
    text = text.strip()
    return text

Function which takes NLTK sentence_tokenizer and string and returns a list of sentences.

In [10]:
def sent_tokens(tokenizer, text):
    sentences = tokenizer.tokenize(text)
    return sentences

Function which converts string into a list of tokens.

In [11]:
def tokenize_text(text):
    token_text = list(simple_preprocess(text, deacc=True))
    return token_text

Function which takes tokenized text and stopwords and returns tokenized text without stopwords.

In [12]:
def remove_stopwords(tokenized_text, stopwords):
    non_stop_text = [token for token in tokenized_text if token not in stopwords]
    return(non_stop_text)    

Function which takes tokenized texts and converts to bigrams where applicable. Returns text with bigrams.

In [13]:
def bigrams(text, tokenized_texts, corpus_bigrams):
    bigram_mod = Phraser(corpus_bigrams)
    bigram_text = bigram_mod[text]
    return bigram_text

Function which takes tokenized text, lemmatizes it and removes text not in allowed parts-of-speech. Returns tokenized word list.

In [14]:
def lemmatization(tokenized_text, allowed_postags=["PROPN", "NOUN"]):
    doc = nlp(" ".join(tokenized_text)) 
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

Initialize stopwords list, including French stopwords as there are letters in French. Extend stopwords list to include corpus-specific stopwords.

In [15]:
stopwords = list(set(stopwords.words("english")) | set(stopwords.words("french")))
stopwords.extend(["sir", "henslow", "dear", "letter", "mr", "mrs", "revd", "lord", "prof", "day", "yrs", "yr", "thank", "way", "friend",
                    "year", "week", "month", "time", "th", "st", "ld", "thank", "kind", "good", "make", "beg", "suppose",
                    "enclose", "small", "regard", "andc", "ditto", "loss", "mention", "regret", "account", "long", "obedient",
                    "appear", "examine", "loss", "take", "expect", "monday", "tuesday", "wednesday", "thursday", "friday", "servt",
                    "saturday", "sunday", "hear", "contain", "tell", "enough", "say", "arrive", "large", "get", "well", "servant",
                    "due", "give", "pay", "many", "name", "character", "refrain", "able", "open", "several", "believe", "oblige",
                    "leave", "introduce", "serv", "add", "date", "see", "compliment", "today", "write", "find", "tomorrow",
                    "want", "yesterday", "much", "spend", "part", "last", "first", "oblige", "arrival", "leaving", "hope", "send",
                    "think", "find", "see", "good", "make", "take", "know", "get", "write", "great", "come", "go", "wish", "last",
                    "much", "leave", "present", "work", "receive", "return", "many", "name", "hear", "feel", "little", "place",
                    "list", "thing", "part", "able", "tell", "oblige", "vote", "new", "man", "visit", "regard", "glad", "make",
                    "find", "give", "write", "see", "take", "receive", "go", "rs", "oblige", "good", "get", "leave", "state",
                    "remain", "pleasure", "look", "seem", "use", "meet", "do", "say", "bring", "put", "next", "show", "come", "name",
                    "make", "find", "give", "see", "take", "go", "get", "leave", "do", "say", "call", "let", "opportunity", "offer",
                    "faithful", "faithfully"
                 ])

Create file paths for letters files and sort numerically. Also get letter numbers without file paths for use as letter identifiers.

In [16]:
file_paths = "letters/*.xml"

xml_files = []
for path in glob.glob(file_paths):
    path = Path(path)    
    xml_files.append(path)
    
#sort file paths numerically
xml_files = os_sorted(xml_files)

#extract letter number from full file path
filenames = []
for path in xml_files:
    filename = os.path.basename(path)
    filenames.append(filename)

Takes file paths and sends to text_cleaning function to extract text and clean up the data, then return as list of texts for each letter.

In [17]:
clean_texts = text_cleaning(xml_files)
clean_texts[:5]

['mead place, lambeth april my dear sir, i return your fossils. i hope they will come safely. be so good as to make my best remembrances to prof. sedgwick and do me the favour to return the amm. sedgwickii with many thanks. i suppose you attended prof. clark’s lecture on satr y or know that i lent him some mot. c iron to show his pupils. i beg to trouble with the enclosed letter for him or rather small parcell, and also for prof. sedgwick any help in either of my catalogues or regarding fossils andc andc will be thankfully rec. d by dear sir. yours very faithfully j. sowerby isle of man . anomites punctatus. martin f. .f.. . do . spirifer sowerby m. c. lin tran v. pt page . and producti j.s.m.c. . same as . with a spirifer on the other side . and . productus scoticus m.c..f. . and . spirifer flattish . cardium new? see m.c. . productus . same as ? . productus finely striated . same as . productus stria thready with some intermediate ones . productus t . trilobite a . amm. henslowi m.c.

At this stage we begin processing each text in two different ways, as a list of sentences and as a string for each text. This is because the different topic modelling tools we will ultimately use require different inputs, some with each letter text as one string and others divided into a list of sentences.

For the sentence list version, we first initialize nltk PunktParamaters, which allows us to prevent abbreviations from ending sentences too early. We are able to add our own corpus-specific abbreviations to this tool. We then add this to the nltk sentence tokenizer tool and send the clean texts and the tokenizer to the sent_tokens function. This returns a list of texts, each divided into a list of sentences.

We then use the punct_removal function to remove any additional punctuation from each sentence.

In [18]:
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(["hon", "mr", "rev", "dr", "prof", "rec." "m.p", "yrs", "obed", "servt", "n.b",
                                "st", "sq", "j.s", "mons", "yr", "wh", "yrs", "geo", "socy", "ld", "abundt",
                                "shd", "cd", "sd", "p.s", "soc", "phil", "sufft", "esq", "recd", "wd", "necessy",
                                "dups", "nat", "hist", "br", "assoc", "p.m", "/", "£", "no", "edinb", "revd", "ch",
                                "esr", "arrangt", "cal", "hort", "m.s", "st", "organizn", "portd", "simy", "oxf",
                                "ed", "geol", "foss", "bot", "unnecessy", "brongts", "fig", "difft", "pecop", "pl",
                                "suffy", "nos", "messrs", "univy", "carb", "sep", "esqr", "jn", "secrety", "lieutt",
                                "lieut", "obd", "wm", "livl", "th.s", "e.i", "mus", "betn", "ult", "var", "doz",
                                "camb", "edit", "lin", "hyd", "hyb", "lath", "oxon", "cyp", "vars", "developt", "rt",
                                "bp", "c.s", "lithogs", "wch", "secry", "secy", "trans", "m.a", "j.c", "mec", "g.c",
                                "mem", "edinh", "coll", "figs", "figd", "birm", "bridgwr", "silurn", "str", "obt",
                                "1st", "2nd", "3rd", "4th", "5th", "6th", "7th", "8th", "9th", "10th", "profr", 
                                "5h", "rod", "s.w", "coll", "quere", "m.p", "lit", "capt", "bs", "pr", "viz", "lab",
                                "morn", "h.h.s"
                                ])
sentence_tokenizer = PunktSentenceTokenizer(punkt_param)

sentence_texts = [sent_tokens(sentence_tokenizer, text) for text in clean_texts]
no_punct_sent_texts = [[punct_removal(sentence) for sentence in sublist] for sublist in sentence_texts]
no_punct_sent_texts[:2]

[['mead place lambeth april my dear sir i return your fossils',
  'i hope they will come safely',
  'be so good as to make my best remembrances to prof sedgwick and do me the favour to return the amm',
  'sedgwickii with many thanks',
  'i suppose you attended prof clark’s lecture on satr y or know that i lent him some mot',
  'c iron to show his pupils',
  'i beg to trouble with the enclosed letter for him or rather small parcell and also for prof sedgwick any help in either of my catalogues or regarding fossils andc andc will be thankfully rec',
  'd by dear sir',
  'yours very faithfully j sowerby isle of man',
  'anomites punctatus',
  'martin f',
  'f',
  'do',
  'spirifer sowerby m c lin tran v pt page',
  'and producti j s m c',
  '',
  'same as',
  'with a spirifer on the other side',
  'and',
  'productus scoticus m c f',
  'and',
  'spirifer flattish',
  'cardium new',
  'see m c',
  '',
  'productus',
  'same as',
  '',
  'productus finely striated',
  'same as',
  'productu

For texts that are not divided into sentences it is only necessary to pass them to the punct_removal function to remove punctuation.

In [19]:
no_punct_texts = [punct_removal(text) for text in clean_texts]
no_punct_texts[:2]

['mead place lambeth april my dear sir i return your fossils i hope they will come safely be so good as to make my best remembrances to prof sedgwick and do me the favour to return the amm sedgwickii with many thanks i suppose you attended prof clark’s lecture on satr y or know that i lent him some mot c iron to show his pupils i beg to trouble with the enclosed letter for him or rather small parcell and also for prof sedgwick any help in either of my catalogues or regarding fossils andc andc will be thankfully rec d by dear sir yours very faithfully j sowerby isle of man anomites punctatus martin f f do spirifer sowerby m c lin tran v pt page and producti j s m c same as with a spirifer on the other side and productus scoticus m c f and spirifer flattish cardium new see m c productus same as productus finely striated same as productus stria thready with some intermediate ones productus t trilobite a amm henslowi m c n nautilus complanatus m c pe pentacrinitis c caryophyllea lamarck di

Use the date_extract function to extract the date for each letter, using the letter file paths as input

In [20]:
text_dates = date_extract(xml_files)
print(text_dates[:10])

[datetime.datetime(1820, 4, 24, 0, 0), datetime.datetime(1821, 11, 15, 0, 0), datetime.datetime(1821, 7, 2, 0, 0), datetime.datetime(1822, 12, 16, 0, 0), datetime.datetime(1822, 11, 11, 0, 0), datetime.datetime(1822, 7, 2, 0, 0), datetime.datetime(1823, 1, 7, 0, 0), datetime.datetime(1823, 4, 2, 0, 0), datetime.datetime(1823, 4, 8, 0, 0), datetime.datetime(1823, 4, 19, 0, 0)]


Use the correspondents function to extract the sender and recipient for each letter, using the letter file paths as input. Change output into 2 lists: one for senders, one for recipients.

In [21]:
senders_recipients = correspondents(xml_files)
senders, recipients = list(zip(*senders_recipients))

Create Pandas dataframe for non-sentence strings of letter text content, with letter id, date, sender and recipient. Using a dataframe enables easier extraction and manipulation of data.

In [22]:
df = pd.DataFrame(list(zip(filenames, text_dates, senders, recipients, no_punct_texts)), 
                  columns=["letter", "date", "sender", "recipient", "text"])

Create same type of dataframe as above for sentence divided versions of letter texts. Use explode method to give each sentence its own row in the dataframe with the letter metadata for each sentence included in the row.

In [23]:
sent_df = pd.DataFrame(list(zip(filenames, text_dates, senders, recipients, no_punct_sent_texts)),
                      columns=["letter", "date", "sender", "recipient", "text"])
sent_df = sent_df.explode("text", ignore_index=True)

Tokenize the texts in both dataframes into lists of words for use in preliminary data analysis and further processing below.

In [24]:
df["text"] = df["text"].map(tokenize_text)
sent_df["text"] = sent_df["text"].map(tokenize_text)

Send one of the dataframes to the word_frequency_plot function to get bar chart of top words in the corpus by frequency, prior to the removal of stopwords and some parts-of-speech.

In [25]:
word_frequency_plot(df)

Send one of the dataframes to the doc_length_plot function to get a bar plot showing the distribution of the corpus in terms of document length.

In [26]:
doc_length_plot(df)

Send one of the dataframes to the pos_count_plot function to get a bar plot showing the count of the different parts-of-speech across the corpus.

In [27]:
pos_count_plot(df)

Remove stop words from both dataframes using the remove_stopwords function, with text column and stopwords list as parameters. Returns column without stopwords.

In [28]:
df["text"] = df["text"].apply(remove_stopwords, stopwords=stopwords)
sent_df["text"] = sent_df["text"].apply(remove_stopwords, stopwords=stopwords)

Get text column from non-sentence dataframe and turn into list. Use list to find bigrams, returns bigrams for whole corpus. Parameters: min_count - words only included if above this number, threshold - higher number for fewer included phrases.

In [29]:
text_list = df["text"].tolist()
corpus_bigrams = Phrases(text_list, min_count=5, threshold=100)

Apply bigrams function to the text columns of both dataframes, with additional parameters of tokenized text list and corpus bigrams created above. Returns columns for both dataframes with words converted into bigrams where appropriate.

In [30]:
df["text"] = df["text"].apply(bigrams, tokenized_texts=text_list, corpus_bigrams=corpus_bigrams)
sent_df["text"] = sent_df["text"].apply(bigrams, tokenized_texts=text_list, corpus_bigrams=corpus_bigrams)

Apply lemmatization function to both dataframes, with tokenized text from each row as input. There is also an optional allowed_postags parameter which enables us to decide which parts-of-speech are included, this parameter has a default value which can be seen in the function but this can be changed. 

Returns lemmatized, tokenized text for each row with excluded parts-of-speech removed.

In [31]:
df["text"] = df["text"].apply(lemmatization)
sent_df["text"] = sent_df["text"].apply(lemmatization)
sent_text_list = sent_df["text"].tolist()
sent_text_list

[['mead', 'lambeth', 'april', 'fossil'],
 [],
 ['sedgwick', 'favour', 'amm'],
 ['thank'],
 ['clark', 'lecture', 'satr', 'mot'],
 ['iron', 'pupil'],
 ['trouble', 'parcell', 'help', 'fossil'],
 [],
 ['sowerby', 'isle'],
 ['punctatus'],
 ['martin'],
 [],
 [],
 ['sowerby', 'lin', 'tran', 'pt', 'page'],
 [],
 [],
 [],
 ['side'],
 [],
 ['scoticu'],
 [],
 ['flattish'],
 ['cardium'],
 [],
 [],
 ['productus'],
 [],
 [],
 ['productus'],
 [],
 ['productus', 'stria', 'thready', 'one'],
 ['productus'],
 [],
 ['amm'],
 [],
 [],
 ['nautilus', 'complanatus'],
 [],
 ['pentacrinitis'],
 ['caryophyllea'],
 ['lamarck'],
 [],
 [],
 ['entrochi', 'carypohyllea', 'scoria'],
 ['november',
  'analysis',
  'grain',
  'mineral',
  'anglesea',
  'form',
  'consist',
  'gr',
  'silica',
  'alumina',
  'soda',
  'lime',
  'water',
  'absorption',
  'zn',
  'iron',
  'grain'],
 ['ch', 'mineral', 'gelatinizes', 'acid', 'friction'],
 ['analcine', 'variety'],
 ['clarke'],
 ['evening',
  'result',
  'examination',
  'spe

We now apply the plots used above to our corpus to see what changes to the corpus have been effected by our data processing.

In [32]:
doc_length_plot(df)

In [33]:
word_frequency_plot(df)

In [34]:
pos_count_plot(df)

Finally we save our dataframes for use in the next notebooks which implement our different topic models.

In [35]:
df.to_pickle("pickle/henslow_texts.pkl")
sent_df.to_pickle("pickle/henslow_sentences.pkl")