This is a very useful guide to TF-IDF calculation and K-means clustering with scikit-learn:

https://jonathansoma.com/lede/algorithms-2017/classes/clustering/k-means-clustering-with-scikit-learn/

In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer #SciKit-Learn Machine Learning Library
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.cluster import KMeans

In [2]:
# Load the per-page tag data produced by the previous analysis stage.
# NOTE(review): the output shows 'Unnamed: 0', 'Unnamed: 0.1' and 'Unnamed: 0.2'
# columns - these look like index columns written by earlier to_csv calls;
# confirm, and consider saving with index=False upstream.
df = pd.read_csv("analysis_data_end_of_stage_2.csv")

# Display the DataFrame as the cell output.
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PageNumber,custom_filter,lemmatised_tags,top_words_only
0,0,0,100,bike bicycle politics leith scottshparliame gr...,bike bicycle politic leith scottshparliame gre...,bike bicycle politic leith greenerleith pedalo...
1,1,1,101,winter landscape salisburycrags holyroodpark a...,winter landscape salisburycrag holyroodpark ar...,winter landscape salisburycrag holyroodpark ar...
2,2,2,104,canal unioncanal water rowing spring sunny sun...,canal unioncanal water rowing spring sunny sun...,canal unioncanal water peoplewatche candid pol...
3,3,3,105,christmas westhallgardens tree christmastree b...,christmas westhallgardens tree christmastree b...,christmas tree game boroughmuirhigh school squ...
4,4,4,106,square hudson instagram foursquare venue ddced...,square hudson instagram foursquare venue ddced...,square instagram foursquare venue cycling bike...
...,...,...,...,...,...,...
63,63,63,95,urban streetart color colour wall canon painti...,urban streetart color colour wall canon paint ...,urban streetart colour canon paint graffiti fa...
64,64,64,96,tollcross lauristonplace goldbergs alcohol the...,tollcross lauristonplace goldbergs alcohol the...,tollcross tenement themeadow pentax canon eyef...
65,65,65,97,copyright photography november sunset mist sno...,copyright photography november sunset mist sno...,photography november sunset mist snow december...
66,66,66,98,university library georgesquare libraryworkin ...,university library georgesquare libraryworkin ...,winter snow december fringe edfringe architect...


In [12]:
# Build the TF-IDF vectoriser.
#   use_idf=True - apply the IDF weighting (without it the values are plain,
#                  normalised term frequencies rather than TF-IDF).
#   max_df=0.8   - ignore terms that occur in more than 80% of the documents.
#   min_df=5     - ignore terms that occur in fewer than 5 documents.
#   norm='l2'    - scale every document vector to unit Euclidean length, so all
#                  weights fall between 0 and 1 and the dot product of two rows
#                  is their cosine similarity.
Vectorizer = TfidfVectorizer(lowercase=True, max_df=0.8, min_df=5, stop_words='english', use_idf=True, norm='l2')

# Learn the vocabulary and IDF weights, then return the document-term matrix
# (sparse; one row per document, one column per retained term).
Vectors = Vectorizer.fit_transform(df['top_words_only'])

# Print the matrix shape as (n_documents, n_terms). A bare `Vectors.shape` in the
# middle of a cell is never displayed, so print it explicitly.
print(Vectors.shape)

# Column index -> term lookup for the matrix above.
feature_names = Vectorizer.get_feature_names_out()

In [14]:
# Wrap the TF-IDF matrix as a SciPy CSR (Compressed Sparse Row) matrix.
# NOTE(review): Vectors already appears to be sparse (another cell calls
# .todense() on it), so this is presumably a re-wrap rather than a conversion
# from a dense array - confirm whether this step is still needed.
Vectors_sparse = sparse.csr_matrix(Vectors)
# Display the matrix summary as the cell output.
Vectors_sparse

<68x434 sparse matrix of type '<class 'numpy.float64'>'
	with 7710 stored elements in Compressed Sparse Row format>

In [17]:
# Pairwise cosine similarity between the rows (documents) of the TF-IDF matrix.
# The printed result is square and symmetric with 1.0 on the diagonal
# (each document compared with itself).
similarities = cosine_similarity(Vectors_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))

pairwise dense output:
 [[1.         0.45580624 0.16135657 ... 0.33068668 0.30295618 0.46012321]
 [0.45580624 1.         0.22173072 ... 0.42082607 0.3690385  0.46951377]
 [0.16135657 0.22173072 1.         ... 0.27981079 0.32082719 0.18262553]
 ...
 [0.33068668 0.42082607 0.27981079 ... 1.         0.570087   0.38227157]
 [0.30295618 0.3690385  0.32082719 ... 0.570087   1.         0.3453214 ]
 [0.46012321 0.46951377 0.18262553 ... 0.38227157 0.3453214  1.        ]]



In [18]:
# Confirm the result is a square document-by-document matrix.
print(similarities.shape)

# Build the DataFrame straight from the array. The previous version went through
# a variable named `list`, which shadowed the builtin for the rest of the
# session; the list round-trip is not needed.
cosine_similarity_matrix = pd.DataFrame(similarities)

# Save the similarity matrix for use outside the notebook.
cosine_similarity_matrix.to_csv("cosine_sim_matrix.csv")

(68, 68)


The code below draws on this useful Stack Overflow answer: https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity

In [13]:
# Using the TF-IDF vectors, inspect which terms carry weight in each document.
# Expand the sparse TF-IDF matrix into a dense matrix.
matrix = Vectors.todense()

# Convert the dense matrix to a nested Python list: one inner list per
# document, one value per feature.
matrix_list = matrix.tolist()

# Uncomment to display the full nested list:
#matrix_list

In [9]:
# For every document (one row of matrix_list), collect the terms whose TF-IDF
# weight is greater than zero. A non-zero weight only means the term occurs in
# that document and survived the vectoriser's filtering; it does not by itself
# say whether the term is common or rare.
# feature_names[i] is the term for column i, so pairing names with weights via
# zip replaces the manual index counter.
all_keywords = [
    [term for term, weight in zip(feature_names, row) if weight > 0]
    for row in matrix_list
]

In [11]:
# Print the TF-IDF weights of the first document (one value per feature).
print(matrix_list[0])
# Print the terms of the first document that have a non-zero (normalised)
# TF-IDF weight, i.e. the retained terms that occur in it.
print(all_keywords[0])


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.025165154099934833, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04783650014364247, 0.0, 0.0, 0.028254988857307914, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2703328334562451, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.024522206503097128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09567300028728494, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06843058411092454, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.020076191703300876, 0.0, 0.0, 0.01617057822236084, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.015147913920561158, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [10]:
# Cluster the documents by their TF-IDF vectors with K-means.
# NOTE: if these results are used, recompute the vectors from the
# 'lemmatised_tags' column rather than the already filtered 'top_words_only'.

# Number of clusters to start with.
true_k = 10
# How many of the highest-weighted terms to write out per cluster.
top_terms_per_cluster = 15

# n_init=1 runs a single initialisation, so the result depends entirely on the
# random start; fix random_state so the written clusters are reproducible.
model = KMeans(n_clusters=true_k, init="k-means++", max_iter=100, n_init=1, random_state=42)

# Fit the K-means model to the TF-IDF matrix built earlier.
model.fit(Vectors)

# For each cluster centre, sort the feature indices by descending weight, so
# the first columns are the terms most strongly associated with that cluster.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = Vectorizer.get_feature_names_out()

# Write one "Cluster <i>" header per cluster, then its top terms (one per line,
# each with a leading space), then two blank lines.
with open("tfidf_cluster_results.txt", "w", encoding="utf-8") as f:
    for i in range(true_k):
        f.write(f"Cluster {i}\n")
        for ind in order_centroids[i, :top_terms_per_cluster]:
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")