In [1]:
# importing the required libraries

import glob
import json
import os

import numpy as np

# gensim 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


#spacy 
import spacy
from nltk.corpus import stopwords

#vis 
import pyLDAvis
import pyLDAvis.gensim

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)


### Preparing the Data 

In [2]:
# creating a function to load and write the json data 

def load_data(file):
    """Read the JSON document stored at ``file`` and return the parsed object.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file, mode='r', encoding='utf-8') as handle:
        return json.loads(handle.read())


def write_data(file, data):
    """Serialise ``data`` as JSON and store it in ``file``.

    The target is opened for writing with UTF-8 encoding, so any existing
    content is replaced. Nothing is returned.
    """
    with open(file, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(data))

In [3]:
# Load NLTK's English stop-word list.
# The corpus reader is re-imported under an alias here because this cell
# rebinds the name `stopwords` to a plain list; without the alias, running the
# cell a second time would fail (a list has no `.words` method).
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
# Load the data
# The path uses forward slashes, which Python accepts on Windows, macOS and
# Linux alike; a backslash-separated path only resolves on Windows.
data = load_data("data/ushmm_dn.json")['texts']
# Preview the first 90 characters of the first document.
print(data[0][0:90])

 My name David Kochalski. I was born in a small town called , and I was born May 5, 1928. 


In [5]:
# lemmatization using spacy 
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Reduce each document to the lemmas of its content words.

    Parameters
    ----------
    texts : iterable of str
        Raw documents to process.
    allowed_postags : sequence of str
        Coarse part-of-speech tags (``token.pos_``) to keep. The default list
        is only read, never modified, so sharing it between calls is safe.

    Returns
    -------
    list of str
        One string per input document, holding the lemmas of the kept tokens
        joined by single spaces, in their original order.
    """
    # The parser and NER components are disabled: only POS tags and lemmas are used.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    texts_out = []
    # nlp.pipe processes the texts as a batched stream instead of invoking the
    # full pipeline once per document, and yields the docs in input order.
    for doc in nlp.pipe(texts):
        lemmas = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        texts_out.append(" ".join(lemmas))
    return texts_out


# Lemmatise every document, then preview the first 90 characters of document 0.
lemmatized_texts = lemmatization(data)
print(lemmatized_texts[0][:90])

name bear small town call bear very hard work child father mother small mill flour buckwhe


In [6]:
# generate new words 
def gen_words(texts):
    """Tokenise every document in ``texts`` with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed through so that accent marks are stripped from
    the tokens. Returns a list holding one token list per input document.
    """
    return [
        gensim.utils.simple_preprocess(document, deacc=True)
        for document in texts
    ]
  
# Token lists (one per document) built from the lemmatised texts.
data_words = gen_words(lemmatized_texts)

#### Bigrams and Trigrams 

In [7]:
# BIGRAMS AND TRIGRAMS
# Phrase detector trained on the plain token lists.
bigram_phrases = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# Second detector trained on the bigram-merged stream, so that a merged pair
# can combine with a neighbouring token (min_count is left at gensim's default).
trigram_phrases = gensim.models.Phrases(
    bigram_phrases[data_words],
    threshold=100,
)

# Phraser wrappers around the two trained models, used to transform documents.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Run every token list in ``texts`` through the module-level ``bigram`` phraser.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the ``bigram`` phraser and then the ``trigram`` phraser to each document.

    Both phrasers are read from module scope. Returns a list with one
    transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

data_bigrams = make_bigrams(data_words)
# make_trigrams() itself runs each document through the bigram phraser before
# the trigram phraser, so it is given the plain token lists. Feeding it
# data_bigrams would push every document through the bigram phraser twice.
data_bigrams_trigrams = make_trigrams(data_words)

# Preview the first 20 tokens of the first document.
print(data_bigrams_trigrams[0][0:20])


['name', 'bear', 'small', 'town', 'call', 'bear', 'very', 'hard', 'work', 'child', 'father', 'mother', 'small', 'mill', 'flour', 'buckwheat', 'prosperous', 'comfortable', 'go', 'school']


In [8]:
#TF-IDF REMOVAL
from gensim.models import TfidfModel

# Vocabulary and bag-of-words corpus built from the n-gram-merged documents.
id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
# print (corpus[0][0:20])

tfidf = TfidfModel(corpus, id2word=id2word)

# Terms whose TF-IDF weight in a document falls below this value are removed
# from that document's bag of words.
low_value = 0.03
# Log of every dropped term (as a word, one entry per document it was dropped from).
words = []
words_missing_in_tfidf = []
for i in range(len(corpus)):
    bow = corpus[i]
    # Score the document once and reuse the result below.
    doc_weights = tfidf[bow]
    tfidf_ids = {term_id for term_id, _ in doc_weights}
    low_value_words = [term_id for term_id, weight in doc_weights if weight < low_value]
    # Terms present in the bag of words but absent from the TF-IDF output
    # (the words with a TF-IDF score of 0 will be missing).
    # This is computed BEFORE the log is extended so that `words` records the
    # current document's missing terms rather than the previous document's.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    for term_id in low_value_words + words_missing_in_tfidf:
        words.append(id2word[term_id])

    # Set membership keeps the filtering linear in the document length.
    dropped_ids = set(low_value_words) | set(words_missing_in_tfidf)
    corpus[i] = [entry for entry in bow if entry[0] not in dropped_ids]

In [9]:
# # creating a dictionary of words 
# ld2word = corpora.Dictionary(data_words)

# corpus=[]
# for text in data_words:
#     new = ld2word.doc2bow(text)
#     corpus.append(new)

# print(corpus[0][0:20])

In [10]:
# word= ld2word[[0][:1][0]]

In [11]:
# word

In [12]:
# Train the LDA topic model on the bag-of-words corpus and its dictionary.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=30,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
)

In [13]:
# Query the trained model with the last document of the corpus.
test_doc = corpus[-1]
vector = lda_model[test_doc]
print(vector)

def Sort(sub_li):
    """Return the items of ``sub_li`` ordered by their second element, largest first.

    A new list is returned and ``sub_li`` itself is left untouched, so the
    caller's original ordering is preserved. Items with equal second elements
    appear in the reverse of their input order.
    """
    ordered = sorted(sub_li, key=lambda pair: pair[1])
    ordered.reverse()
    return ordered
# Same pairs, ordered by their second element from highest to lowest.
new_vector = Sort(vector)
print(new_vector)

[(1, 0.27880788), (7, 0.048639536), (9, 0.011969122), (11, 0.27440786), (15, 0.24109037), (17, 0.034882054), (19, 0.019126762), (29, 0.063026346)]
[(1, 0.27880788), (11, 0.27440786), (15, 0.24109037), (29, 0.063026346), (7, 0.048639536), (17, 0.034882054), (19, 0.019126762), (9, 0.011969122)]


In [21]:
#saving the model
# Create the target directory first so the save does not fail with
# FileNotFoundError when "models" does not exist yet.
os.makedirs("models", exist_ok=True)
lda_model.save("models/test_model.model")


In [22]:
# Reload the model that was saved to disk.
model_path = "models/test_model.model"
new_model = gensim.models.ldamodel.LdaModel.load(model_path)


In [23]:
# Query the reloaded model with the last document of the corpus.
test_doc = corpus[-1]
vector = new_model[test_doc]
print(vector)

def Sort(sub_li):
    """Return the items of ``sub_li`` ordered by their second element, largest first.

    A new list is returned and ``sub_li`` itself is left untouched, so the
    caller's original ordering is preserved. Items with equal second elements
    appear in the reverse of their input order.
    """
    ordered = sorted(sub_li, key=lambda pair: pair[1])
    ordered.reverse()
    return ordered
# Same pairs, ordered by their second element from highest to lowest.
new_vector = Sort(vector)
print(new_vector)

[(1, 0.2788109), (7, 0.04863416), (9, 0.011969014), (11, 0.2744082), (15, 0.24109021), (17, 0.034882847), (19, 0.019126747), (29, 0.063027896)]
[(1, 0.2788109), (11, 0.2744082), (15, 0.24109021), (29, 0.063027896), (7, 0.04863416), (17, 0.034882847), (19, 0.019126747), (9, 0.011969014)]


### Visualization of the Data

In [19]:
# pyLDAvis and pyLDAvis.gensim are imported in the first cell of the notebook,
# so the import is not repeated here.
# Requires lda_model, corpus and id2word from the cells above.

pyLDAvis.enable_notebook()
viz = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=id2word, mds='mmds', R=30)
viz


