In [2]:
import pandas as pd

In [4]:
import pickle

In [5]:
import numpy as np
np.random.seed(2018)

In [8]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [3]:
url='https://drive.google.com/file/d/1si7B_mq8EnoUNFjBEovILwPfzYW8Gjn8/view'
url='https://drive.google.com/uc?id=' + url.split('/')[-2]
trainingDf = pd.read_csv(url)

In [17]:
en_stop = set(nltk.corpus.stopwords.words('english'))

In [6]:
import nltk
from nltk.corpus import wordnet

lmtzr = nltk.WordNetLemmatizer().lemmatize

## We lookup whether a word is and adjective, verb, noun or adverb here.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    
## This version uses word type. Needs the bigger nltp download ("popular")
def normalize_text(text):
    ## Runs on documents (vector of words)
    word_pos = nltk.pos_tag(nltk.word_tokenize(text))
    lemm_words = [lmtzr(sw[0], get_wordnet_pos(sw[1])) for sw in word_pos]

    return [x.lower() for x in lemm_words]

## This version doesn't require the "popular" download
def preprocess(text):
    ## Runs on documents (vector of words)
    lemmatizer = nltk.WordNetLemmatizer()
    return([lemmatizer.lemmatize(i) for i in text.split()])

################
## wordnet version
from nltk.corpus import wordnet as wn
def get_lemma(word):
    ## morphy does a lemma lookup and word standardization
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

## lemmatize
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

## This version is for comparison
def prepare_text_for_lda(text):
    ## Runs on documents (vector of words)
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [7]:
processed_docs = trainingDf['text'].map(preprocess) # preprocess is faster than normalise_text.
processed_docs[:10]

0    [good, and, helpfull, read, this, book, is, ve...
1    [Sadly, overpriced, and, irrelevant, In, spite...
2    [Endless, rant, Howard, should, have, borrowed...
3    [Not, Quite, Hip, It's, really, a, shame, abou...
4    [Journey, to, the, Centre, of, the, Earth, Hey...
5    [No, longer, the, Land, of, the, Free, The, re...
6    [DEMON, IN, MY, VIEW-AMELIA, ATWATER-RHODES, A...
7    [Heartbreaking...but, you'll, live, The, novel...
8    [I, waited, for, this?, I, got, this, book, wh...
9    [Awesome!, The, book, wa, absolutely, beautifu...
Name: text, dtype: object

In [9]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [12]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [13]:
print("Creating corpus and saving to pickle")
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
pickle.dump(bow_corpus, open('bow_corpusE.pkl', 'wb'))
pickle.dump(dictionary, open('dictionaryE.pkl', 'wb'))

Creating corpus and saving to pickle


In [14]:
bow_doc_16 = bow_corpus[1000]

for i in range(len(bow_doc_16)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_16[i][0], 
                                               dictionary[bow_doc_16[i][0]], 
                                                bow_doc_16[i][1]))

Word 2 ("but") appears 3 time.
Word 17 ("more") appears 1 time.
Word 20 ("really") appears 1 time.
Word 22 ("some") appears 1 time.
Word 26 ("there") appears 1 time.
Word 30 ("very") appears 1 time.
Word 42 ("The") appears 3 time.
Word 47 ("about") appears 1 time.
Word 55 ("all") appears 2 time.
Word 62 ("at") appears 3 time.
Word 72 ("completely") appears 1 time.
Word 83 ("do") appears 1 time.
Word 102 ("half") appears 2 time.
Word 103 ("have") appears 3 time.
Word 123 ("no") appears 1 time.
Word 124 ("none") appears 1 time.
Word 125 ("not") appears 1 time.
Word 127 ("on") appears 2 time.
Word 128 ("one") appears 1 time.
Word 129 ("or") appears 1 time.
Word 149 ("so") appears 1 time.
Word 158 ("time") appears 1 time.
Word 168 ("where") appears 1 time.
Word 170 ("which") appears 1 time.
Word 171 ("with") appears 3 time.
Word 190 ("don't") appears 1 time.
Word 195 ("from") appears 1 time.
Word 198 ("his") appears 1 time.
Word 250 ("he") appears 2 time.
Word 253 ("him") appears 1 time.
W

In [19]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.13680446895709966),
 (1, 0.22357034541838006),
 (2, 0.08448633111521395),
 (3, 0.22272383387233372),
 (4, 0.19479764099028732),
 (5, 0.15505766682846991),
 (6, 0.17140018835933787),
 (7, 0.20706215084319432),
 (8, 0.24790150235920178),
 (9, 0.10957731696404206),
 (10, 0.11037529753101362),
 (11, 0.1600336402015205),
 (12, 0.19817624508922924),
 (13, 0.33165340348502614),
 (14, 0.16077018290616207),
 (15, 0.0818448203705459),
 (16, 0.17623034848572441),
 (17, 0.06797125117132176),
 (18, 0.3415960748877919),
 (19, 0.044529894508430846),
 (20, 0.08993478504754539),
 (21, 0.11069440822829753),
 (22, 0.08025360783600693),
 (23, 0.2816137209010218),
 (24, 0.08484649134817178),
 (25, 0.29045998608754864),
 (26, 0.09195865510941037),
 (27, 0.08045096618131507),
 (28, 0.09819980195139141),
 (29, 0.18223808169998368),
 (30, 0.06891912002600492),
 (31, 0.07195495996050973),
 (32, 0.0715940762794895),
 (33, 0.1496925070573312)]


In [20]:
try:
    lda_model=pickle.load(open('lda_modelE.pkl', 'rb'))
    print("Reading lda_model from pickle")
except FileNotFoundError:
    print("Creating lda_model and saving to pickle")
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
    pickle.dump(lda_model,open('lda_modelE.pkl','wb'))

Creating lda_model and saving to pickle


In [25]:
#Visualisation

import pyLDAvis.gensim
pyLDAvis.enable_notebook()

ModuleNotFoundError: No module named 'pyLDAvis.gensim'

In [24]:
!pip3 install pyLDAvis



