# Goals
- Create a bag of words model on the training data 
- Convert to tfidf 
- Create model & Serialise to disk 
- Try recommendation on some value from the test dataset 
- Make a user profile from some books

In [60]:
# Imports 
import json 
from gensim import corpora, models, similarities
from gensim.utils import simple_preprocess
from smart_open import open
from bs4 import BeautifulSoup

In [61]:
english_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves",
                     "you", "your", "yours", "yourself", "yourselves", "he", "him", 
                     "his", "himself", "she", "her", "hers", "herself", "it", "its", 
                     "itself", "they", "them", "their", "theirs", "themselves", "what", 
                     "which", "who", "whom", "this", "that", "these", "those", "am", "is", 
                     "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", 
                     "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", 
                     "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", 
                     "against", "between", "into", "through", "during", "before", "after", "above", 
                     "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", 
                     "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", 
                     "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", 
                     "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", 
                     "will", "just", "don", "should", "now"]

In [104]:
corpus_link = './data/how-good-is-your-medium-article/test.json'
# corpus_link = './data/test-file.json'
features = ['url', 'title', 'content', 'author', 'image_url']

In [105]:
class MyCorpus: 
    def __iter__(self): 
        for line in open(corpus_link): 
            yield dictionary.doc2bow(jsonLine2text(line).lower().split())
            # Assume one doc per line. Separeated by whitespace 
#             yield dictionary.doc2bow(line.lower().split())

In [106]:
url_list = []
def jsonLine2text(line): 
    global url_list
    json_txt = json.loads(line)
    soup = BeautifulSoup(json_txt['content'], 'html.parser')
    data_block = list(soup.stripped_strings)
    
    # Extra: Grab the urls of the docs 
    url_list.append((json_txt['url'], json_txt['title']))
    
    # Combine all the data into one document 
    data_doc = ' '.join(data_block)
    
    return data_doc
    

# Loading in Data & Making the bag of words

In [107]:
dictionary = corpora.Dictionary()

for line in open(corpus_link):     
    # Get the html content as a block of text
    data_doc = jsonLine2text(line)
    
    # Add document ot the dictionary
    dictionary.add_documents([data_doc.lower().split()])

# Remove stopwords 
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in english_stopwords
    if stopword in dictionary.token2id
]

# Remove id's that occur rarely 
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
print('Dictionary created')

Dictionary created


In [108]:
dictionary.save('./models/dictionary/test-dictionary')

In [109]:
print(dictionary)

Dictionary(586992 unique tokens: ['&', '(scroll', '1', '10', '2018']...)


In [13]:
dictionary.token2id

{'&': 0,
 '(1)': 1,
 '(2)': 2,
 '(3)': 3,
 '(4)': 4,
 '(e.g.,': 5,
 '(including': 6,
 '(like': 7,
 '(“services”).': 8,
 '),': 9,
 ',': 10,
 '.': 11,
 '13': 12,
 '13,': 13,
 '13.': 14,
 '14': 15,
 '2012': 16,
 '2016': 17,
 '2017': 18,
 '7,': 19,
 'access': 20,
 'accordance': 21,
 'account': 22,
 'account,': 23,
 'account.': 24,
 'accounts': 25,
 'accuracy,': 26,
 'adapt': 27,
 'advertising': 28,
 'affirm': 29,
 'agree': 30,
 'agreement': 31,
 'agreement.': 32,
 'allowed': 33,
 'also': 34,
 'altered,': 35,
 'amendment': 36,
 'another’s': 37,
 'anything': 38,
 'anytime.': 39,
 'appropriately.': 40,
 'apps,': 41,
 'areas': 42,
 'arise': 43,
 'arising': 44,
 'aspect': 45,
 'assume': 46,
 'aug': 47,
 'availability,': 48,
 'available': 49,
 'available,': 50,
 'backup': 51,
 'bounty': 52,
 'breach': 53,
 'bug': 54,
 'burden': 55,
 'california': 56,
 'california.': 57,
 'can’t': 58,
 'change': 59,
 'change,': 60,
 'changes': 61,
 'children': 62,
 'choice': 63,
 'claims': 64,
 'claims,': 65,
 'c

## Crete BoW using Dictionary on Corpus

In [110]:
bow_corpus = MyCorpus()

## Serialise & Save corpus 

In [111]:
corpora.MmCorpus.serialize('./models/testing-corpus.mm', bow_corpus)

In [112]:
# load the corpus 
loaded_corpus = corpora.MmCorpus('./models/testing-corpus.mm')

In [None]:
class BoWCorpus: 
    def __init__(self, path, dictionary):
        self.filepath = path
        self.dictionary = dictionary
        
    def __iter__(self):
#         global mydict  # OPTIONAL, only if updating the source dictionary.
        for line in open(self.filepath, encoding='latin'):
#             global dictionary 
            # Get text from json 
            json_line = jsonLine2text(line)
        
            # tokenize
            tokenized_list = simple_preprocess(json_line, deacc=True)

            # create bag of words
            bow = self.dictionary.doc2bow(tokenized_list, allow_update=True)

#             # update the source dictionary (OPTIONAL)
#             mydict.merge_with(self.dictionary)

            # lazy return the BoW
            yield bow

In [None]:
# bow_corpus = BoWCorpus(corpus_link, dictionary)

## TFIDF Model

In [113]:
# train model 
# tfidf = models.TfidfModel(bow_corpus, dictionary=dictionary)
tfidf = models.TfidfModel(loaded_corpus, dictionary=dictionary)

In [45]:
# Transform the input document 
words = 'system design in science'.lower().split()
vec_bow = tfidf[dictionary.doc2bow(words)]
print(vec_bow)

[(102, 0.5425120553890905), (1014, 0.6603859913621831), (1045, 0.5192022844422887)]


In [114]:
text = 'Using machine learning, one can find useful patterns from large data sets to make'\
'data more informative and qualitatively insightful. This is very important for'\
'decision making. Students will be exposed to supervised and unsupervised'\
'learning, respectively.'

In [115]:
text_words = simple_preprocess(text)
vec_bow = tfidf[dictionary.doc2bow(text_words)]
print(vec_bow)

[(289, 0.05511005970567312), (380, 0.07478243926231902), (466, 0.16268351589010227), (1122, 0.07311829191187329), (2049, 0.06480167404722555), (3056, 0.09323414651261741), (3229, 0.33871129311398357), (3235, 0.31896819931233295), (3277, 0.11518833846552938), (3356, 0.21052910760069152), (3727, 0.1098119860305428), (5560, 0.23750920809199708), (7748, 0.1479187641444614), (7878, 0.18900008894443024), (14372, 0.20469617905157386), (25154, 0.38545881069931026), (40959, 0.3882376586774619), (88481, 0.45313540216529463)]


## Perform Similarity Check 

In [None]:
index = similarities.MatrixSimilarity(tfidf[loaded_corpus])

In [None]:
sims = index[vec_bow]
print(list(sorted(enumerate(sims), key=lambda x: x[1], reverse=True))[:10])

In [81]:
url_list[1393]

('https://medium.com/web11/semantic-programming-a-contract-driven-approach-11e00bc7a239',
 'Semantic programming. A contract driven approach. – Web11 – Medium')

In [82]:
index.save('./my-index')

In [85]:
new_index = similarities.MatrixSimilarity.load('./my-index')

In [98]:
print(list(sorted(enumerate(new_index[vec_bow]), key=lambda x: x[1], reverse=True))[:10])

[(1516, 0.104466945), (4322, 0.10161233), (4787, 0.10131438), (4657, 0.09834733), (3550, 0.08717883), (2973, 0.087101206), (2120, 0.0780553), (4000, 0.07706502), (3810, 0.07656737), (3854, 0.075353034)]


In [103]:
url_list[4657]

('https://hackernoon.com/how-not-to-hire-your-first-data-scientist-34f0f56f81ae',
 'How not to hire your first data scientist – Hacker Noon')

## LSI Model

In [55]:
# train model 
lsi = models.LsiModel(loaded_corpus, id2word=dictionary, num_topics=250)

In [56]:
vec_lsi = lsi[vec_bow]

In [57]:
lsi_index = similarities.MatrixSimilarity(lsi[loaded_corpus])

In [58]:
lsi_sims = index[vec_bow]

In [59]:
print(list(sorted(enumerate(sims), key=lambda x: x[1], reverse=True))[:10])

[(606, 0.347016), (934, 0.22037357), (191, 0.20504291), (82, 0.20036006), (855, 0.19806148), (257, 0.18509659), (462, 0.15427643), (842, 0.14031976), (256, 0.12940617), (938, 0.12658773)]


## Get the Documents that were similar form their document id