This time let's try applying a topic model and finding out the words from the dominant topic as per Purvs and Hollenstein's work.

In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [13]:
!python -m spacy download en_core_web_md

2022-05-17 18:46:55.183782: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-17 18:46:55.183833: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[K     |████████████████████████████████| 45.7 MB 189 kB/s eta 0:00:01
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
import pandas as pd
import numpy as np
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
#import pyLDAvis
#import pyLDAvis.gensim_models


#nltk
import nltk
from nltk.tokenize import word_tokenize

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [5]:
central_edi_posts = pd.read_csv("Grid_500m_intersect_points.csv")


#drop na otherwise error
central_edi_posts.dropna(subset = ['custom_filter'])
#group by grid cell number and then concatenate all tags
central_edi_posts_grouped = central_edi_posts.astype(str).groupby('PageNumber')['custom_filter'].apply(lambda x: ' '.join(x)).reset_index()

In [6]:
#make sure all characters <3 and >15 are excluded
central_edi_posts_grouped['custom_filter'] = central_edi_posts_grouped['custom_filter'].str.findall('\w{4,15}').str.join(' ')
central_edi_posts_grouped


Unnamed: 0,PageNumber,custom_filter
0,100,bike bicycle politics leith scottshparliame gr...
1,101,winter landscape salisburycrags holyroodpark a...
2,104,canal unioncanal water rowing spring sunny sun...
3,105,christmas westhallgardens tree christmastree b...
4,106,square hudson instagram foursquare venue ddced...
...,...,...
63,95,urban streetart color colour wall canon painti...
64,96,tollcross lauristonplace goldbergs alcohol the...
65,97,copyright photography november sunset mist sno...
66,98,university library georgesquare libraryworkin ...


In [7]:
#lemmatisation
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
#disable: Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline.


def lemmatization(texts,allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

central_edi_posts_grouped['lemmatised_tags'] = lemmatization(text for text in central_edi_posts_grouped['custom_filter'])


In [8]:
#convert list back to string
central_edi_posts_grouped['lemmatised_tags'] = central_edi_posts_grouped['lemmatised_tags'].str.join(' ')

#export to csv so I can use for later
#central_edi_posts_grouped.to_csv("lemmatised_500m_grid_posts.csv")

In [23]:
##EXPORT 500M^2 GRIDS TO CSV
#export to individual csvs
#set index
central_edi_posts_grouped = central_edi_posts_grouped.set_index('PageNumber')

# then loop
for idx in central_edi_posts_grouped.index:
    central_edi_posts_grouped.loc[[idx]].to_csv(rf'500m_Grid_Documents/grid_{idx}.csv')

In [24]:
# TOPIC MODEL

def topic_model_500m(grid_document):
    ''' this function is used to conduct topic modelling for each grid/document '''
    #tokenise words in string
    tokens = [word_tokenize(i) for i in grid_document['lemmatised_tags']]
    
    #convert tokenized lists into dictionary
    dictionary = corpora.Dictionary(tokens)
    #create document term matrix
    doc_term_matrix = [dictionary.doc2bow(tag) for tag in tokens]
    #initialise topic model from gensim
    LDA = gensim.models.ldamodel.LdaModel
    #build and train topic model
    lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=4, random_state=100,
                chunksize=150, passes=50,iterations=100)

    #write top 20 words from each document as csv
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])

    return pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P']).to_csv("top_words_500m_lemmatised.csv",
                                                                                    mode='a', index = False, header=False)

In [25]:
from glob import glob
filenames = glob("500m_Grid_Documents/grid*.csv")
df_from_each_file = (pd.read_csv(f) for f in filenames)

In [26]:
for f in df_from_each_file:
    topic_model_500m(f)