In [1]:
import pandas as pd
import sqlite3
conn = sqlite3.connect('nlp_learn.db')

# data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
query = "SELECT id, title FROM books"
result_set = conn.execute(query).fetchall()
data = pd.DataFrame(result_set, columns = ['id', 'title'])

data_text = data[['title']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
len(documents)

671255

In [3]:
documents[:10]

Unnamed: 0,title,index
0,Things Fall Apart,0
1,Fairy tales,1
2,The Divine Comedy,2
3,The Epic Of Gilgamesh,3
4,The Book Of Job,4
5,One Thousand and One Nights,5
6,Njál's Saga,6
7,Pride and Prejudice,7
8,Le Père Goriot,8
9,"Molloy, Malone Dies, The Unnamable, the trilogy",9


In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\salman.akhatar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
print(WordNetLemmatizer().lemmatize('went', pos='v'))

go


In [7]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [8]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 1:
            result.append(lemmatize_stemming(token))
    return result

In [9]:
doc_sample = documents[documents['index'] == 4130].values[0][0]

print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Jump', 'Start', 'Your', 'Gluten-Free', 'Diet!', 'Living', 'with', 'Celiac', '/', 'Coeliac', 'Disease', '&', 'Gluten', 'Intolerance']


 tokenized and lemmatized document: 
['jump', 'start', 'gluten', 'free', 'diet', 'live', 'celiac', 'coeliac', 'diseas', 'gluten', 'intoler']


In [10]:
processed_docs = documents['title'].map(preprocess)

In [11]:
processed_docs[:10]

0                   [thing, fall, apart]
1                          [fairi, tale]
2                        [divin, comedi]
3                      [epic, gilgamesh]
4                            [book, job]
5                      [thousand, night]
6                           [njál, saga]
7                      [pride, prejudic]
8                     [le, père, goriot]
9    [molloy, malon, die, unnam, trilog]
Name: title, dtype: object

In [12]:
dictionary = gensim.corpora.Dictionary(processed_docs)
len(dictionary)

203957

In [13]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 apart
1 fall
2 thing
3 fairi
4 tale
5 comedi
6 divin
7 epic
8 gilgamesh
9 book
10 job


In [14]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [15]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4130]

[(694, 1),
 (779, 1),
 (1336, 1),
 (1443, 1),
 (1844, 1),
 (1888, 1),
 (3103, 2),
 (4419, 1)]

In [16]:
bow_doc_4310 = bow_corpus[4130]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))

Word 694 ("jump") appears 1 time.
Word 779 ("live") appears 1 time.
Word 1336 ("diet") appears 1 time.
Word 1443 ("start") appears 1 time.
Word 1844 ("diseas") appears 1 time.
Word 1888 ("free") appears 1 time.
Word 3103 ("gluten") appears 2 time.
Word 4419 ("intoler") appears 1 time.


In [17]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

In [18]:
corpus_tfidf = tfidf[bow_corpus]

In [19]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.7196780521769552), (1, 0.47007123578090043), (2, 0.5109760605999079)]


In [34]:
print("Running LDA using Bag of Words")
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=25, id2word=dictionary, passes=2, workers=2)
lda_model.save('trained_model/lda_train_bow.model')
print("LDA Training - Complete")

Running LDA using Bag of Words


TypeError: write() argument must be str, not bytes

In [21]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.126*"world" + 0.069*"histori" + 0.049*"year" + 0.025*"ii" + 0.024*"faith" + 0.023*"centuri" + 0.022*"war" + 0.019*"new" + 0.018*"ancient" + 0.018*"simpl"
Topic: 1 
Words: 0.131*"la" + 0.060*"el" + 0.040*"del" + 0.039*"le" + 0.030*"come" + 0.026*"wild" + 0.025*"break" + 0.024*"los" + 0.019*"daughter" + 0.016*"princess"
Topic: 2 
Words: 0.245*"book" + 0.087*"seri" + 0.022*"recip" + 0.021*"kid" + 0.020*"age" + 0.017*"queen" + 0.016*"child" + 0.016*"free" + 0.012*"alpha" + 0.012*"easi"
Topic: 3 
Words: 0.100*"time" + 0.057*"way" + 0.028*"garden" + 0.026*"stone" + 0.025*"kill" + 0.024*"old" + 0.020*"long" + 0.020*"seven" + 0.017*"cook" + 0.017*"troubl"
Topic: 4 
Words: 0.076*"day" + 0.055*"black" + 0.042*"citi" + 0.032*"guid" + 0.031*"end" + 0.027*"white" + 0.022*"human" + 0.019*"build" + 0.018*"handbook" + 0.017*"color"
Topic: 5 
Words: 0.069*"american" + 0.052*"rise" + 0.046*"king" + 0.037*"saga" + 0.034*"soul" + 0.027*"summer" + 0.026*"land" + 0.019*"passion" + 0.018*"

In [22]:
print("Running LDA using TF-IDF")
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=25, id2word=dictionary, passes=2, workers=4)
print("LDA Training - Complete")

Running LDA using TF-IDF
LDA Training - Complete


In [23]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.054*"die" + 0.037*"der" + 0.023*"miss" + 0.021*"son" + 0.018*"das" + 0.017*"من" + 0.016*"wish" + 0.015*"im" + 0.015*"grace" + 0.015*"des"
Topic: 1 Word: 0.028*"road" + 0.022*"old" + 0.022*"mr" + 0.019*"cowboy" + 0.019*"demon" + 0.017*"father" + 0.016*"hors" + 0.016*"awaken" + 0.016*"affair" + 0.014*"north"
Topic: 2 Word: 0.047*"death" + 0.034*"come" + 0.026*"go" + 0.019*"dont" + 0.019*"lord" + 0.018*"home" + 0.018*"water" + 0.016*"travel" + 0.015*"let" + 0.014*"day"
Topic: 3 Word: 0.038*"lose" + 0.035*"blood" + 0.033*"love" + 0.027*"ladi" + 0.020*"book" + 0.018*"walk" + 0.018*"sky" + 0.017*"time" + 0.016*"long" + 0.014*"marriag"
Topic: 4 Word: 0.027*"stori" + 0.023*"write" + 0.022*"short" + 0.021*"daughter" + 0.018*"letter" + 0.017*"best" + 0.016*"fiction" + 0.015*"hope" + 0.014*"select" + 0.014*"scienc"
Topic: 5 Word: 0.052*"night" + 0.029*"men" + 0.025*"friend" + 0.024*"wolf" + 0.022*"los" + 0.021*"brother" + 0.020*"kill" + 0.016*"rock" + 0.014*"hunt" + 0.014*"wind"


In [24]:
print("Performance evaluation by classifying sample document using LDA TF-IDF model")
for index, score in sorted(lda_model_tfidf[bow_corpus[4130]], key=lambda tup: -1*tup[1]):
    print(index)
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index)))

Performance evaluation by classifying sample document using LDA TF-IDF model
10

Score: 0.5736227035522461	 
Topic: 0.035*"boy" + 0.030*"know" + 0.028*"summer" + 0.023*"doctor" + 0.021*"tree" + 0.017*"need" + 0.016*"troubl" + 0.016*"color" + 0.015*"sleep" + 0.015*"hot"
16

Score: 0.1158323809504509	 
Topic: 0.086*"la" + 0.032*"del" + 0.029*"thing" + 0.014*"revolut" + 0.013*"final" + 0.012*"saint" + 0.012*"el" + 0.012*"horror" + 0.011*"fate" + 0.011*"happen"
20

Score: 0.11341923475265503	 
Topic: 0.040*"star" + 0.030*"break" + 0.023*"devil" + 0.022*"want" + 0.019*"box" + 0.019*"set" + 0.015*"case" + 0.015*"haunt" + 0.013*"seri" + 0.013*"train"
19

Score: 0.11310765892267227	 
Topic: 0.030*"vol" + 0.030*"bear" + 0.030*"wild" + 0.023*"like" + 0.022*"garden" + 0.021*"island" + 0.020*"tale" + 0.020*"princ" + 0.019*"season" + 0.016*"fairi"


In [25]:
print("Testing model on unseen document")
unseen_document = "The Free Voice"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
print(lda_model[bow_vector])
print("________________________________________________________________________________________________")
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(index)
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index)))

Testing model on unseen document
[(0, 0.013333629), (1, 0.013333629), (2, 0.346664), (3, 0.013333629), (4, 0.013333629), (5, 0.013333629), (6, 0.013333629), (7, 0.013333629), (8, 0.013333629), (9, 0.013333629), (10, 0.013333629), (11, 0.013333629), (12, 0.013333629), (13, 0.013333629), (14, 0.013333629), (15, 0.013333629), (16, 0.013333629), (17, 0.013333629), (18, 0.3466625), (19, 0.013333629), (20, 0.013333629), (21, 0.013333629), (22, 0.013333629), (23, 0.013333629), (24, 0.013333629)]
________________________________________________________________________________________________
2
Score: 0.3466639816761017	 Topic: 0.245*"book" + 0.087*"seri" + 0.022*"recip" + 0.021*"kid" + 0.020*"age" + 0.017*"queen" + 0.016*"child" + 0.016*"free" + 0.012*"alpha" + 0.012*"easi"
18
Score: 0.3466625511646271	 Topic: 0.069*"live" + 0.045*"women" + 0.038*"work" + 0.035*"woman" + 0.028*"step" + 0.026*"devil" + 0.026*"truth" + 0.024*"letter" + 0.024*"second" + 0.022*"classic"
0
Score: 0.0133336288854479

In [35]:
import pickle
from gensim.models import Word2Vec
print("Testing model on unseen document")
filename = "trained_model/lda_train_bow.model"
# lda_model = pickle.load(open(filename, 'rb'))
lda_model = Word2Vec.load(filename)
unseen_document = "The Free Voice"
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
print(lda_model[bow_vector])
print("________________________________________________________________________________________________")
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(index)
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index)))

Testing model on unseen document


AttributeError: 'LdaMulticore' object has no attribute 'negative'

In [None]:
a = "Salman's bag"
a.replace("'", "")