Import the libraries for Term Frequency–Inverse Document Frequency (TF-IDF) topic modelling, and suppress warnings, as these libraries emit some unnecessary ones.

In [37]:
import warnings
# Install a blanket "ignore" filter: warnings of every category are hidden from here on.
# NOTE(review): this also hides deprecation warnings from pandas/scikit-learn/plotly.
warnings.filterwarnings('ignore')

In [38]:
import pickle
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
# Make "iframe" the default renderer for every plotly figure displayed below.
pio.renderers.default='iframe'

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

Load dataframe of clean data created in notebook zero.

In [39]:
# Load the cleaned letters dataframe from a pickle file (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle("pickle/henslow_texts.pkl")
# Preview the first rows.
df.head()

Unnamed: 0,letter,date,sender,recipient,text
0,letters_1.xml,1820-04-24,"Sowerby, James","Henslow, J. S.","[mead, lambeth, april, fossils, remembrance, f..."
1,letters_2.xml,1821-11-15,"Clarke, E. D.","Henslow, J. S.","[november, analysis, grain, mineral, anglesea,..."
2,letters_3.xml,1821-07-02,"Cumming, James","Henslow, J. S.","[evening, result, specimen, goodness, insert, ..."
3,letters_4.xml,1822-12-16,"Henslow, J. S.","Jenyns, Leonard","[december, leonard, addenda, plant, wynch, boo..."
4,letters_5.xml,1822-11-11,"Brewster, David","Henslow, J. S.","[edinburgh, coates, crescent, november, prince..."


Join tokenized text together in text column

In [40]:
# Join each letter's token list into one space-separated string.
# Rows that already hold a string are left untouched, so re-running this cell is
# harmless: .str.join(" ") applied to a string would otherwise put a space
# between every character ("mead" -> "m e a d") and corrupt the corpus.
is_joined = df["text"].map(lambda value: isinstance(value, str))
df["text"] = df["text"].where(is_joined, df["text"].str.join(" "))
df.head()

Unnamed: 0,letter,date,sender,recipient,text
0,letters_1.xml,1820-04-24,"Sowerby, James","Henslow, J. S.",mead lambeth april fossils remembrance favour ...
1,letters_2.xml,1821-11-15,"Clarke, E. D.","Henslow, J. S.",november analysis grain mineral anglesea form ...
2,letters_3.xml,1821-07-02,"Cumming, James","Henslow, J. S.",evening result specimen goodness insert paper ...
3,letters_4.xml,1822-12-16,"Henslow, J. S.","Jenyns, Leonard",december leonard addenda plant wynch book morn...
4,letters_5.xml,1822-11-11,"Brewster, David","Henslow, J. S.",edinburgh coates crescent november prince cimi...


Send text column to list for later use.

In [41]:
# Plain Python list of the text column's values, in row order.
df_list = df["text"].to_list()
# Preview the first five entries.
df_list[:5]

['mead lambeth april fossils remembrance favour amm sedgwickii thank clark lecture mot iron pupil trouble parcell help catalogue fossil isle punctatus martin sowerby lin tran pt page producti spirifer side productus scoticus spirifer cardium productus productus productus stria thready one productus trilobite amm henslowi nautilus complanatus pentacrinitis caryophyllea madriporite tubipore entrochi carypohyllea scoria',
 'november analysis grain mineral anglesea form gr silica alumina soda lime water absorption iron grain ch mineral gelatinize friction analcine variety clarke',
 'evening result specimen goodness insert paper manner crystal chemical attack symbol crystal trapezoidal dodecahedron analcine iron friction scratch glass blowpipe acid gravity crystal mass exposure heat per_cent mineral method digestion acid residue potash solution redness muriate iron_alumina water lime solution evaporation crystal salt silex solution acid alumina iron ammonia lime oxalate iron_alumina potash 

Send date column to list for later use.

In [42]:
# Plain Python list of the date column's values, in the same row order as df_list.
date_list = df["date"].to_list()
# Preview the first five entries.
date_list[:5]

[Timestamp('1820-04-24 00:00:00'),
 Timestamp('1821-11-15 00:00:00'),
 Timestamp('1821-07-02 00:00:00'),
 Timestamp('1822-12-16 00:00:00'),
 Timestamp('1822-11-11 00:00:00')]

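Initialize sklearn's TFIDF vectorizer with the max_df and min_df parameters, which remove words that appear too frequently or too infrequently: max_df=0.8 drops terms found in more than 80% of the letters, and min_df=5 drops terms found in fewer than 5 letters.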

In [43]:
# TFIDF vectorizer settings:
#   lowercase - fold text to lower case before tokenizing
#   max_df    - given as a float, i.e. an upper bound on document-frequency proportion
#   min_df    - given as an int, i.e. a lower bound on document count
vectorizer = TfidfVectorizer(lowercase=True, max_df=0.8, min_df=5)

Once the vectorizer has been initialized, we process our letter texts through it. This converts them into a matrix of TFIDF values, with the numbers in each row representing the TFIDF scores for the words in a document. Higher scores mean that words have greater importance for the document.

In [44]:
# Learn the vocabulary and IDF weights from the letters and build their TFIDF matrix in one step.
transformed_docs = vectorizer.fit_transform(df_list)

Get all words included by the TFIDF vectorizer.

In [45]:
# Vocabulary kept by the vectorizer; later cells index into this array to label clusters.
terms = vectorizer.get_feature_names_out()
print(terms)

['abbeville' 'absence' 'absentee' ... 'zeal' 'zealand' 'zoology']


We then use the TFIDF matrix and sklearn's KMeans to group documents whose TFIDF scores are similar. We first set the k variable to the number of clusters we want. This value can be adjusted and the notebook re-run to find the most useful number of clusters after seeing the results.

In [46]:
# Number of clusters to look for; adjust and re-run to experiment.
k = 3
# Fit k-means on the TFIDF matrix. k-means++ seeding is random, so a fixed
# random_state is passed to keep cluster membership, the top-word labels and
# every downstream chart identical after Restart & Run All.
km = KMeans(
    n_clusters=k,
    init='k-means++',
    max_iter=100,
    n_init=5,
    verbose=0,
    random_state=42,
).fit(transformed_docs)

Get cluster centres to get the words which form each cluster in order of prominence.

In [47]:
# One row per cluster; each row holds that centre's weight for every vocabulary term.
cluster_cents = km.cluster_centers_
# Term indices of each row, ordered from the highest weight down to the lowest.
sorted_cents = np.argsort(cluster_cents, axis=1)[:, ::-1]
sorted_cents

array([[1266, 1583, 1580, ...,  334, 1497,  856],
       [ 245,  999, 1116, ...,  675, 1462, 1161],
       [1548, 1209, 1778, ..., 1065, 1066,    0]])

Get cluster label for each document's contents.

Convert TFIDF matrix to dense matrix (include terms with a zero value for each document).

In [48]:
# Densify the sparse TFIDF matrix (absent terms become explicit zeros),
# then expose it as a plain ndarray.
dense = transformed_docs.todense()
dense_array = np.asarray(dense)
# Cluster index assigned to each document by the fitted k-means model.
km_labels = km.predict(transformed_docs)

Transform data to 2d for use in graph plotting using principal component analysis. Changes TFIDF (document) and cluster center (word) vectors into two dimensions.

In [49]:
# Fit a two-component PCA on the dense TFIDF matrix. random_state pins the
# result whenever scikit-learn's automatic solver choice falls on a randomized
# SVD, so the 2-D layout of the scatter chart is reproducible between runs.
pca = PCA(n_components=2, random_state=42).fit(dense_array)
# Project the documents and the cluster centres into the same 2-D space.
data2d = pca.transform(dense_array)
pca_cluster_cents = pca.transform(cluster_cents)

Use the indices in each row of sorted_cents to look up the six highest-weighted words for each cluster in the terms array, join them into a comma-separated string, and append that string to the keys list.

Then use each document's numeric cluster label in km_labels as an index into the keys list. This gives key_list: the top-word string for every document, in the same document order as km_labels.

In [50]:
# Label for each cluster: its six highest-weighted terms, comma-separated.
keys = [
    ', '.join(terms[term_idx] for term_idx in sorted_cents[cluster, :6])
    for cluster in range(k)
]
# Per-document label, looked up from the document's cluster index.
key_list = [keys[label] for label in km_labels]

Create dataframe for plotting using the two dimensional transformation of the document vectors, the key list of ordered topic labels for each document and the ordered date list for each document.

In [51]:
# Split the (n_documents, 2) PCA projection into its x and y coordinates.
x_list, y_list = zip(*data2d)
# Assemble the plotting frame, then render the date column as 'YYYY-MM-DD' text.
plot_df = pd.DataFrame(
    {"x": x_list, "y": y_list, "top_words": key_list, "date": date_list}
).assign(date=lambda frame: frame["date"].dt.strftime('%Y-%m-%d'))

Use plotly to create a labelled scatter chart of documents and their proximity in terms of TFIDF scores in vector space.

Date can be added when hovering over points by adding to custom_data parameter then using update_traces and adding to hovertemplate.

In [52]:
# Scatter of the documents in PCA space, coloured by their cluster's top words.
fig = px.scatter(
    data_frame=plot_df,
    x="x",
    y="y",
    color="top_words",
    custom_data=["date"],
    title="TFIDF and K-Means"
    )

# customdata carries the custom_data columns for each point in the order given,
# so index it explicitly; further hover lines can be appended to this list.
fig.update_traces(
    hovertemplate="<br>".join([
        "date: %{customdata[0]}",
        ])
    )

# Each legend entry lists six words (cluster labels are built from the six
# highest-weighted terms), so the legend title says six rather than five.
# This call is the cell's last expression, so the returned figure is displayed.
fig.update_layout(
    xaxis_title=None,
    yaxis_title=None,
    legend=dict(font=dict(size=10)),
    legend_title_text="Top 6 Words in Cluster",
)

Convert plotting dataframe to new dataframe and sort by date.

Group the date column by year and topic and aggregate by count to get a count of documents belonging to each topic per year.

In [53]:
# Date/label pairs with the date parsed back from text to datetime, oldest first.
dated_topics = (
    plot_df[["date", "top_words"]]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values(by=["date"])
)
# Count the documents for every (year, cluster label) pair and flatten the
# result into the columns date (year), top_words and count.
clust_date_df = (
    dated_topics["date"]
    .groupby([dated_topics["date"].dt.year, dated_topics["top_words"]])
    .agg("count")
    .rename_axis(["date", "top_words"])
    .reset_index(name="count")
)
clust_date_df

Unnamed: 0,date,top_words,count
0,1818,"cambridge, london, museum, paper, suffolk, copy",1
1,1819,"cambridge, london, museum, paper, suffolk, copy",1
2,1820,"cambridge, london, museum, paper, suffolk, copy",2
3,1820,"plant, specimen, specie, collection, speciman,...",2
4,1821,"cambridge, london, museum, paper, suffolk, copy",5
...,...,...,...
83,1860,"cambridge, london, museum, paper, suffolk, copy",49
84,1860,"plant, specimen, specie, collection, speciman,...",2
85,1860,"sincerely_palmerston, palmerston, vote, commit...",2
86,1861,"cambridge, london, museum, paper, suffolk, copy",3


Use the above dataframe to plot a line chart of document-topic distribution by year across the corpus.

In [54]:
# One line per cluster: number of documents assigned to it in each year.
fig_2 = px.line(
    data_frame=clust_date_df,
    x="date",
    y="count",
    color="top_words",
    title="TFIDF and K-Means Document Distribution",
    labels={
        "count": "document count",
        "date": "year",
    },
)

# Each legend entry lists six words (cluster labels are built from the six
# highest-weighted terms), so the legend title says six rather than five.
fig_2.update_layout(legend_title_text="Top 6 Words in Cluster")
fig_2.show()