This time, let's try applying a topic model and extracting the words from the dominant topic, following Purves and Hollenstein's work.

In [3]:
pip install gensim spacy 

Note: you may need to restart the kernel to use updated packages.


In [4]:
!python -m spacy download en_core_web_md

2022-06-15 11:47:58.188277: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-15 11:47:58.188324: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[K     |████████████████████████████████| 45.7 MB 1.3 MB/s eta 0:00:01  | 38.1 MB 1.3 MB/s eta 0:00:07
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [5]:
import pandas as pd
import numpy as np
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
#import pyLDAvis
#import pyLDAvis.gensim_models


#nltk
import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer #SciKit-Learn Machine Learning Library
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.cluster import KMeans


In [11]:
central_edi_posts = pd.read_csv("grid_500m_newest_intersect.csv")

#drop na otherwise error
#dropna returns a new frame, so the result must be assigned back to take effect.
#'custom_filter_further' is included because it is the column joined below and
#astype(str) would otherwise turn missing values into the literal tag 'nan'.
central_edi_posts = central_edi_posts.dropna(subset=['custom_filter', 'custom_filter_further'])

#group by grid cell number and then concatenate all tags into one string per cell
central_edi_posts_grouped = (
    central_edi_posts.astype(str)
    .groupby('PageNumber')['custom_filter_further']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

central_edi_posts_grouped
#central_edi_posts_grouped.to_csv("500m_grids_newest.csv")

Unnamed: 0,PageNumber,custom_filter_further
0,1,nissan micrak micra edinburghcar door colinton...
1,101,edinburghaccies heriotsrugbyclu rugby game mat...
2,102,waterofleithwal kway cuppa streetphotograp han...
3,103,pink blue girls people orange white black gree...
4,104,november vegetables fruit market gifts relish ...
...,...,...
86,9,boroughmuirhigh school warmemorial boroughmuir...
87,90,photographer tree landscape caltonhill nationa...
88,91,lothian lothianbuses transportforedi nburgh wr...
89,92,abbeymount abbeyhill europeanunion pentaxk pen...


In [9]:
##EXPORT 500M^2 GRIDS TO CSV
from pathlib import Path

#make sure the output folder exists first: to_csv does not create missing directories
Path('500m_Grid_Documents').mkdir(parents=True, exist_ok=True)

#export to individual csvs (one file per row of the grouped frame)
#set index
#central_edi_posts_grouped = central_edi_posts_grouped.set_index('PageNumber')

# then loop; files are named after the row index of the grouped frame
for idx in central_edi_posts_grouped.index:
    central_edi_posts_grouped.loc[[idx]].to_csv(rf'500m_Grid_Documents/grid_{idx}.csv')

In [38]:
# TOPIC MODEL
##10 topics

def topic_model_500m(grid_document, num_topics=10, topn=10,
                     output_path="top_words_500m_10_topics.csv"):
    '''Fit an LDA topic model to one grid document and append its top words to a csv.

    Parameters
    ----------
    grid_document : DataFrame (or mapping) with a 'custom_filter_further' entry
        holding tag strings; every non-missing value is tokenised and treated
        as one document of the corpus.
    num_topics : int, default 10
        Number of LDA topics to fit.
    topn : int, default 10
        Number of highest-probability words reported per topic.
    output_path : str, default "top_words_500m_10_topics.csv"
        CSV file the results are appended to.

    Returns
    -------
    None
        One (Topic, Word, P) row per reported word is appended to
        ``output_path`` without a header row. Nothing is written when the
        document contains no tokens.

    Notes
    -----
    The file is opened in append mode, so calling this repeatedly (or re-running
    the notebook) keeps adding rows; the rows carry no grid identifier.
    '''
    #drop missing values and cast to str so the tokeniser only ever receives text
    tag_strings = pd.Series(grid_document['custom_filter_further']).dropna().astype(str)

    #tokenise words in string
    tokens = [word_tokenize(text) for text in tag_strings]

    #an LDA model cannot be trained without any terms, so skip empty documents
    if not any(tokens):
        return None

    #convert tokenized lists into dictionary
    dictionary = corpora.Dictionary(tokens)
    #create document term matrix
    doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
    #initialise topic model from gensim
    LDA = gensim.models.ldamodel.LdaModel
    #build and train topic model (fixed random_state keeps runs reproducible)
    lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=num_topics, random_state=100,
                chunksize=150, passes=50,iterations=100)

    #collect the top `topn` words of every topic as (topic, word, probability) rows
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = topn)])

    #append to the shared results csv
    pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv(output_path,
                                                                             mode='a', index = False, header=False)
    return None

In [39]:
from glob import glob  #note: from here on `glob` names the function, not the module

#sort so the grid files are processed in a reproducible order
filenames = sorted(glob("500m_Grid_Documents/grid*.csv"))
#build a list rather than a generator: a generator is exhausted after one pass,
#so re-running the modelling loop would silently process nothing
df_from_each_file = [pd.read_csv(f) for f in filenames]

In [40]:
#fit the topic model once per grid document; each call appends its rows to the results csv
for grid_document in df_from_each_file:
    topic_model_500m(grid_document)