This is a very useful guide into TF-IDF calculation and K-means clustering using Sci-Kit Learn:

https://jonathansoma.com/lede/algorithms-2017/classes/clustering/k-means-clustering-with-scikit-learn/

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer #SciKit-Learn Machine Learning Library
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.cluster import KMeans
import nltk
from nltk import word_tokenize

In [13]:
df = pd.read_csv("grid_750m_newest_round3.csv")
#group by grid cell number and then concatenate all tags
df = df.astype(str).groupby('PageNumber')['custom_f_1'].apply(lambda x: ' '.join(x)).reset_index()

#make sure all characters <3 and >15 are excluded
df['custom_f_1'] = df['custom_f_1'].str.findall('\w{4,15}').str.join(' ')

tokenized_tags = df['custom_f_1'].astype(str).apply(nltk.word_tokenize)

filter_words = ['wwwpoopmapeu','wwwpoopmapde','smcpentaxm','iansdigitalphot','italphotos','iansdigitalpho','flickrandroidap','camera','filter','none']
df["custom_f_1"] = [[t for t in tok_sent if t not in filter_words] for tok_sent in tokenized_tags]

#convert list back to string
df['custom_f_1'] = df['custom_f_1'].str.join(' ')

df
#df.to_csv("analysis_data_750m_geocluster_1_pagenumbers.csv")

Unnamed: 0,PageNumber,custom_f_1
0,10,portrait cemetery retrato cementerio blonde ru...
1,11,canal dock unioncanal polwarth fountainbridge ...
2,12,film movie theater cameo corinthian column the...
3,13,sign square clarendon mmelmaritv bruntsfield b...
4,14,trees blossom bluesky blooms earlysummer park ...
5,15,road bicycle race tour meadows september nices...
6,16,summerhall royaldickbar leicasummicron leicam ...
7,17,edinbourg voyage royaumeuni square foursquare ...
8,2,guardianbookswa bankofscotland tescometro brun...
9,20,dunsapielochedi nburghscotlandu nitedkingdombr...


In [14]:
#create vectoriser - TdidfVectorizer normalises the tfidf values from 0-1
#set use_idf to 'True' so that it actually calculates the IDF part of TF-IDF (otherwise it's just TF which is bloody confusing)
#max_df = words that appear over 80% of entire corpus is missed out
#min_df = miss out on words that appear less than 5 times in corpus
# norm = 'l2' normalises the length of the documents to prepare for calculation (between two equal-length vectors) 
Vectorizer = TfidfVectorizer(lowercase = True, stop_words = 'english', use_idf = True, norm = 'l2')

#Vectors = Vectorizer.fit_transform(df['lemmatised_tags'])

#https://stackoverflow.com/questions/64743583/which-10-words-has-the-highest-tf-idf-value-in-each-document-total

#finding top 10 tfidf tags per document
X_tfidf = Vectorizer.fit_transform(df['custom_f_1'])
#X_tfidf_array = X_tfidf.toarray()


In [15]:
#convert array to sparse matrix
Vectors_sparse = sparse.csr_matrix(X_tfidf)

similarities = cosine_similarity(Vectors_sparse)
print('pairwise dense output:\n {}\n'.format(similarities))

pairwise dense output:
 [[1.         0.21980341 0.04934129 ... 0.00849755 0.00202068 0.00256731]
 [0.21980341 1.         0.57011772 ... 0.03943643 0.01424324 0.01812187]
 [0.04934129 0.57011772 1.         ... 0.0271379  0.01163182 0.0197645 ]
 ...
 [0.00849755 0.03943643 0.0271379  ... 1.         0.07529783 0.31171976]
 [0.00202068 0.01424324 0.01163182 ... 0.07529783 1.         0.07339531]
 [0.00256731 0.01812187 0.0197645  ... 0.31171976 0.07339531 1.        ]]



In [16]:
#check shape to see if it's a matrix - and yes it is
print(similarities.shape)

list = similarities.tolist()
cosine_similarity_matrix = pd.DataFrame(list)

#cosine_similarity_matrix.to_csv("cosine_sim_matrix_750m_geocluster_3.csv")

(53, 53)


This is useful and below code draws from it: https://stackoverflow.com/questions/12118720/python-tf-idf-cosine-to-find-document-similarity