In [56]:
import pandas as pd
import nltk
import spacy
import gensim
import seaborn as sb
from gensim import corpora, models, similarities
from spacy.lang.en import English
from nltk.corpus import wordnet as wn

In [57]:
data=pd.read_csv("GuilfordCounty_original_data.csv",)

In [58]:
data.head()

Unnamed: 0.1,Unnamed: 0,page_number,word
0,1,2,guilford
1,2,2,county
2,3,2,by
3,4,2,the
4,5,2,numbers


In [59]:
data.drop(data.columns[0], axis=1)

Unnamed: 0,page_number,word
0,2,guilford
1,2,county
2,2,by
3,2,the
4,2,numbers
5,2,what
6,2,makes
7,2,us
8,2,count
9,2,county


In [60]:
data.describe()

Unnamed: 0.1,Unnamed: 0,page_number
count,94559.0,94559.0
mean,47280.0,220.921033
std,27296.976389,133.722968
min,1.0,2.0
25%,23640.5,110.0
50%,47280.0,213.0
75%,70919.5,324.0
max,94559.0,495.0


In [61]:
#cleaning 
spacy.load('en')
parser=English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.orth_.startswith('http'):
            lda_tokens.append('com')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [62]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\messi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [63]:
def get_lemma(word):
    lemma=wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [64]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [65]:
#filtering stopwords
nltk.download('stopwords')
stop=set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\messi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [66]:
#preparing for lda
def prepare_lda(text):
    tokens=tokenize(text)
    tokens=[token for token in tokens if len(token)>4]
    tokens=[token for token in tokens if token not in stop]
    tokens=[get_lemma(token) for token in tokens]
    return tokens

In [67]:
#creating tokens
import random
text_data=[]
with open('GuilfordCounty_original_data.csv') as f:
    for line in f:
        tokens=prepare_lda(line)
        if random.random()>.99:
            print(tokens)
            text_data.append(tokens)

['27",2,"ad']
['63",2,"areas']
['440",5,"unit']
['1232",12,"commerce']
['1250",12,"more']
['1290",12,"level']
['1464",12,"the']
['1469",12,"work']
['1542",12,"areas']
['1744",13,"increases']
['1777",13,"balance']
['1965",14,"and']
['2061",14,"cents']
['2236",15,"for']
['2259",15,"from']
['2532",15,"have']
['2564",16,"total']
['2583",16,"housing']
['2776",16,"in']
['3040",17,"juveniles']
['3298",18,"of']
['3399",18,"charter']
['3460",18,"all']
['3462",18,"nties']
['3530",18,"to']
['3666",18,"trustees']
['3806",19,"have']
['3903",19,"expansion']
['3986",19,"expenditures']
['4033",19,"of']
['4051",19,"safety']
['4141",20,"recommendation']
['4155",20,"county']
['4295",20,"was']
['4330",20,"also']
['4410",20,"detention']
['4552",21,"these']
['4613",21,"county']
['4670",21,"during']
['4885",21,"repayment']
['5076",22,"million']
['5098",22,"avoid']
['5378",23,"counts']
['5516",23,"additional']
['5584",23,"growth']
['6205",25,"family']
['6232",25,"to']
['6311",25,"xv']
['6322",26,"positions']


['77742",364,"detention']
['77754",364,"accommodate']
['77775",364,"law']
['77781",364,"of']
['77865",364,"is']
['77869",364,"from']
['78096",366,"yet']
['78125",367,"guilford']
['78131",367,"subscriber']
['78201",367,"the']
['78560",370,"the']
['78893",372,"in']
['78900",372,"instruction']
['78913",372,"to']
['79012",372,"canoe']
['79193",374,"the']
['79316",374,"have']
['79367",375,"request']
['79435",375,"countys']
['79449",375,"in']
['79483",375,"funding']
['79614",376,"an']
['79666",376,"the']
['79724",377,"administration']
['79760",377,"replacement']
['79763",377,"mental']
['79923",378,"items']
['80106",379,"february']
['80181",379,"office']
['80345",381,"projects']
['80471",382,"center']
['80527",382,"independence']
['80528",382,"center']
['80750",383,"total']
['80956",384,"southern']
['81083",385,"capital']
['81359",389,"from']
['81399",389,"as']
['81506",389,"the']
['81570",389,"the']
['81819",390,"program']
['82145",390,"board']
['82317",391,"of']
['82385",391,"must']
['82578

In [68]:
#creating dictionary
from gensim import corpora
dictionary=corpora.Dictionary(text_data)
corpus= [dictionary.doc2bow(text) for text in text_data]
import pickle 
pickle.dump(corpus, open('corpus.pkl', 'wb'))

In [70]:
#example
ldamodel=gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=6)
for idx, topic in ldamodel.print_topics(-1):
    print('Topic: {} word: {}'.format(idx, topic))

Topic: 0 word: 0.006*"11509",48,"actual" + 0.006*"47396",214,"community" + 0.006*"14292",66,"and" + 0.006*"79483",375,"funding" + 0.006*"78096",366,"yet" + 0.006*"91481",470,"residential" + 0.006*"45164",206,"director" + 0.006*"80345",381,"projects" + 0.006*"77593",363,"of" + 0.006*"41404",188,"program"
Topic: 1 word: 0.007*"39126",179,"internet" + 0.007*"15802",72,"by" + 0.007*"79367",375,"request" + 0.007*"75007",347,"sources" + 0.007*"4033",19,"of" + 0.007*"69046",314,"departmental" + 0.007*"2236",15,"for" + 0.007*"73475",341,"replace" + 0.007*"13083",61,"of" + 0.007*"14365",66,"fluctuations"
Topic: 2 word: 0.006*"18163",83,"and" + 0.006*"94495",495,"community" + 0.006*"3530",18,"to" + 0.006*"65405",288,"continue" + 0.006*"38971",178,"in" + 0.006*"76097",354,"sources" + 0.006*"14610",67,"technicians" + 0.006*"10118",40,"the" + 0.006*"13546",63,"pool" + 0.006*"85773",415,"maintenance"
Topic: 3 word: 0.006*"55975",251,"reflect" + 0.006*"73489",341,"services" + 0.006*"16601",77,"water"

In [72]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)