In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer #SciKit-Learn Machine Learning Library
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.cluster import KMeans
import nltk
from nltk import word_tokenize

In [None]:
# This notebook finds the top-10 TF-IDF terms per grid cell so that non-geographical
# terms can be identified and removed from the analysis.
# Only conducted for the 500 m grid.
# The cells below read the 'PageNumber' and 'custom_filter_further' columns of this file.
# NOTE(review): 'custom_filter_further' is created by the last cell of this notebook, so this
# CSV is presumably derived from "analysis_data_final.csv" outside the notebook - confirm.
df = pd.read_csv("grid_500m_newest_og_tfidf.csv")


In [None]:
# Collapse the photo-level rows into one "document" per grid cell: all tag strings that share a
# PageNumber are concatenated, separated by single spaces.
#
# Rows with a missing tag string are dropped first. Casting a missing value with astype(str)
# would turn it into the literal token 'nan', which would then enter the TF-IDF vocabulary.
# A grid cell whose rows all lack tags therefore no longer produces a document.
#
# Only the two columns that are actually used are cast to str. PageNumber is kept as a string
# (as before), so the groups keep their string sort order.
df = (
    df.dropna(subset=['custom_filter_further'])
      .astype({'PageNumber': str, 'custom_filter_further': str})
      .groupby('PageNumber')['custom_filter_further']
      .agg(' '.join)
      .reset_index()
)

In [None]:
# Build the TF-IDF model over the per-grid-cell tag documents.
#   lowercase=True        : fold all tags to lower case before counting
#   stop_words='english'  : drop common English stop words
#   use_idf=True          : weight term frequency by inverse document frequency (TF-IDF, not plain TF)
#   norm='l2'             : scale every document vector to unit length, so weights lie in [0, 1]
# NOTE: max_df / min_df are NOT passed, so no document-frequency pruning takes place
# (scikit-learn defaults apply). Pass e.g. max_df=0.8, min_df=5 if that pruning is wanted.
Vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', use_idf=True, norm='l2')

# Approach adapted from:
# https://stackoverflow.com/questions/64743583/which-10-words-has-the-highest-tf-idf-value-in-each-document-total

# number of highest-weighted terms reported per document
TOP_N = 10

# document-term matrix: one row per grid cell, one column per vocabulary term
X_tfidf = Vectorizer.fit_transform(df['custom_filter_further'])
X_tfidf_array = X_tfidf.toarray()

# vocabulary_ maps term -> column index (it does not hold frequencies);
# invert it so a column index can be turned back into its term
vocab = Vectorizer.vocabulary_
reverse_vocab = {v: k for k, v in vocab.items()}
feature_names = Vectorizer.get_feature_names_out()

# argsort orders each row's column indices from the smallest to the largest weight,
# so the last TOP_N positions are the highest-weighted terms (still in ascending order)
idx = X_tfidf_array.argsort(axis=1)
tfidf_max10 = idx[:, -TOP_N:]

# Pair every selected column with its term and weight.
#  - zero weights are skipped: a cell with fewer than TOP_N distinct terms would otherwise be
#    padded with terms that never occur in it
#  - weights are read from the dense array (per-element indexing of the sparse matrix is slow)
#    and converted to plain floats so the CSV text does not depend on numpy's scalar repr
top10_per_doc = [
    [(reverse_vocab.get(col), float(X_tfidf_array[row_no, col]))
     for col in cols
     if X_tfidf_array[row_no, col] > 0]
    for row_no, cols in enumerate(tfidf_max10)
]

# Keep the grid cell id next to its terms so each output row can be traced back to a cell.
df_tfidf = pd.DataFrame({'PageNumber': df['PageNumber'].to_numpy()})
df_tfidf['top10'] = pd.Series(top10_per_doc, index=df_tfidf.index, dtype=object)
df_tfidf.to_csv("top_tfidf_per_document_500m_og.csv")

In [5]:
# Following inspection of the top TF-IDF terms of every document, the non-geographical terms
# are collected in one 'global' custom list and removed from the tags (next cell).

# First load the photo-level analysis data.
# Note: this rebinds `df`, replacing the per-grid-cell frame used by the cells above.
# NOTE(review): the displayed frame carries several "Unnamed: 0*" columns; these look like
# index columns accumulated by repeated to_csv()/read_csv() round trips - confirm whether
# they are still needed.
df = pd.read_csv("analysis_data.csv")

df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,Unnamed: 0.1.1,accuracy,context,datetaken,datetakengranularity,datetakenunknown,farm,...,secret,tags,title,url_n,date,time,year,short_filtered,char_filtered,custom_filter
0,0,0,0,0,16,0,2010-09-01 13:15:00,0,0,5,...,17d70c22ce,winter snow ice docks river scotland boat froz...,Icebreaker Needed...,https://live.staticflickr.com/4039/4269213164_...,09/01/2010,13:15:00,2010,winter snow ice docks river scotland boat froz...,winter snow ice docks river scotland boat froz...,winter snow ice docks river boat frozen ship y...
1,1,1,1,1,16,0,2010-08-01 15:10:00,0,0,5,...,fb6cb30938,bridge winter snow ice church scotland canal e...,"Union Canal, January 01",https://live.staticflickr.com/4045/4277833924_...,08/01/2010,15:10:00,2010,bridge winter snow ice church scotland canal e...,bridge winter snow ice church scotland canal e...,bridge winter snow ice church canal harrisonpa...
2,2,2,2,4,16,0,2010-02-01 12:19:00,0,0,5,...,82095fb86c,architecture,IMG_0701,https://live.staticflickr.com/4057/4263150530_...,02/01/2010,12:19:00,2010,architecture,architecture,architecture
3,3,3,3,5,16,0,2010-01-01 17:31:00,0,0,5,...,75a1649e18,scotland edinburgh puppet newyear holyrood roy...,"The Big Man, Holyrood",https://live.staticflickr.com/4053/4234840366_...,01/01/2010,17:31:00,2010,scotland edinburgh puppet newyear holyrood roy...,scotland edinburgh puppet newyear holyrood roy...,puppet newyear holyrood royalmile holyroodhous...
4,4,5,7,11,16,0,2010-09-01 15:12:00,0,0,3,...,2.56E+33,plumber,Super mario,https://live.staticflickr.com/2719/4259628026_...,09/01/2010,15:12:00,2010,plumber,plumber,plumber
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23545,26032,32950,49941,63975,16,0,2019-12-23 22:58:00,0,0,66,...,0b9b5b8a56,mia britishshorthair britishblueshorthair brit...,"Mia, December 2019",https://live.staticflickr.com/65535/4937454765...,23/12/2019,22:58:00,2019,mia britishshorthair britishblueshorthair brit...,mia britishshorthair britishblueshorthair brit...,mia britishshorthair britishblueshorthair brit...
23546,26033,32951,49942,63977,16,0,2019-12-22 18:37:00,0,0,66,...,371150b625,iansdigitalphotos edinburghchristmas2019 night...,Christmas treats,https://live.staticflickr.com/65535/4926039462...,22/12/2019,18:37:00,2019,iansdigitalphotos edinburghchristmas2019 night...,iansdigitalphotos edinburghchristmas night...,edinburghchristmas nightscene
23547,26034,32952,49943,63979,16,0,2019-12-21 23:20:00,0,0,66,...,de30620c44,edinburgh jazzbar trumpet musician male,"The Jazz Bar, Edinburgh",https://live.staticflickr.com/65535/4927802449...,21/12/2019,23:20:00,2019,edinburgh jazzbar trumpet musician male,edinburgh jazzbar trumpet musician male,jazzbar trumpet musician male
23548,26035,32953,49945,63981,16,0,2019-12-21 19:58:00,0,0,66,...,4e4b611fa9,karaoke 2019 christmas,"Richard's Christmas Drinks, Dec 2019",https://live.staticflickr.com/65535/4937425722...,21/12/2019,19:58:00,2019,karaoke 2019 christmas,karaoke christmas,karaoke christmas


In [6]:
# Remove the manually identified non-geographical terms from every photo's tag string, drop
# photos that are left without any tag, and write the result to "analysis_data_final.csv".

# Tokenise the tag strings. Missing values are replaced by '' first: astype(str) would
# otherwise turn them into the literal tag 'nan', which would survive the empty-row removal.
tokenized_tags = df['custom_filter'].fillna('').astype(str).apply(nltk.word_tokenize)


# Terms to remove, collected by inspecting the top TF-IDF terms per grid cell.
# NOTE(review): many entries look like fragments of longer tags (e.g. 'britishshorthai' + 'rcat').
# Matching below is on whole tokens, so such fragments only have an effect if the tags in
# 'custom_filter' are split the same way - confirm against the data.
filter_words = ['candid','foodspotting','peoplewatching','livres','flickr','male','catriona','densedinburgh','royal',
'bunched','konicaminolta','verf','poisondukwon','psycho','snapseed','edinburghnortha','eyefi','louisfarrell','louis',
'alexander','dennis','mobileupload','majortom','yota','pellicola','serialtags','biancoenero','pointshoot',
'inister','flickstagram','wrtl','edinburghopenda','thsep','instagram','durmming','natalie','exii','acasagm',
'gdnedinburgh','canonspeedlite','plastik','hundekotbeutel','poopmap','abfall','newshower','newkitchenkitch','ardmor',
'selfie','iansdigitalphot','rlmoiswr','theimagebusines','woman','monochrome','blackwhite','eyefi','evening','zuiko',
'olympusmzuikodi','gital','olympuspenepl','olympuspenlite','ecaf','buckyball','alicolam','hoysound','oimusica','eical',
'sundaygetdownsp','simpsonbrown','edinburghphotog','geotagged','photoaday','raphers','lastfm','rcat','thair',
'britishshorthai','tabby','anna','grandchildren','victoriaswingbr','idge','britishblueshor','olympictorch',
'spencegloverfer','guson','bleak','ribaaward','decemeber','vacation','trip','anna','parties','annamcintosh','nuit',
'nacht','brel','iphone','ryries','arden','twofatladdies','flckriosapp','flickrmobile','nofilter','ival','eiff',
'andrewhobson','photogram','nburgh','nlifttriptych','polaroidorigina','nliftprint','samsung','vsco','iainpope','remt',
'streetphotograp','gallipoli','dence','nokia','classiccar','xcountry','dmarkii','apel','nart','make','lens','squadronkway',
'gemini','sony','wright','rdens','embt','concoursofelega','great','country','','citro','cuavenue','hedinburgh','sdressed',
'gormley','inedinburgh','retrodresses','wwwmissbiziocom','yfair','mobileuploads','trident','nowhereinscotla','mini',
'wrighteclisege','pentax','astro','bavonet','volvob','fujiphotofilmco','arrg','whitehallwelfar','albert','welfaire',
'aphynetwork','tommy','olympusimagingc','thealternativec','file']

# a set gives constant-time membership tests (the list is scanned once per token otherwise)
filter_word_set = set(filter_words)

# drop the unwanted tokens and join what is left back into a space-separated string
df["custom_filter_further"] = [
    ' '.join(t for t in tok_sent if t not in filter_word_set)
    for tok_sent in tokenized_tags
]

# Remove rows that have no tags left after filtering: mark empty strings as missing, then drop.
# The result is assigned back instead of calling replace(..., inplace=True) on the selected
# column; that chained in-place call does not update `df` when pandas Copy-on-Write is active.
df['custom_filter_further'] = df['custom_filter_further'].replace('', np.nan)
df = df.dropna(subset=["custom_filter_further"])

# Renumber the rows 0..n-1 without gaps. reset_index returns a new frame, so the result has to
# be assigned (previously it was discarded and the gaps stayed in the saved file).
df = df.reset_index(drop=True)

df.to_csv("analysis_data_final.csv")