# TM Predict New Document

- Date: 2019/08/31
- Author: Daniel Hu (University of Melbourne)
- Description: Assists researchers by predicting the code (here, the LDA topic) of a new document or sentence.

# Step 1: Import Libraries and Collect Data
## (1) Import Packages and Prepare Stopwords

In [1]:
# Run in python console
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

from random import shuffle



# NLTK Stop words
from nltk.corpus import stopwords
# NLTK's English stop-word list followed by a handful of extra tokens
# that should also be filtered out of the documents.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use',
]

## (2) Import Data and Datafile to List

In [2]:
# Import Dataset
# The newsgroups JSON file is read straight from its GitHub raw URL.
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
df = pd.read_json(NEWSGROUPS_URL)
df.head()

def datafile_to_list(data_file):
    """Return the values of ``data_file.content`` as a plain Python list."""
    contents = data_file.content.values
    return contents.tolist()

data_list = datafile_to_list(df)

# Shuffle with a fixed seed so that the train/test split taken from this list
# (and every model and score derived from it) is reproducible across kernel
# restarts. The seed matches the random_state used for the LDA models.
# RandomState.shuffle permutes the list in place.
np.random.RandomState(100).shuffle(data_list)

## (3) Separating Training and Testing Data

In [3]:
# Split index: the first 95% of the shuffled documents form the training set,
# the remaining 5% are held out. (LDA model 1 is built on 100% of the data.)
len_nintyfive_per = int(len(data_list)*0.95)

# train_data_list -> used to build LDA model 2 (first 95%)
# test_data_list  -> held-out documents (last 5%), loaded into LDA model 2
#                    one by one
train_data_list, test_data_list = (
    data_list[:len_nintyfive_per],
    data_list[len_nintyfive_per:],
)

# Step 2: Preprocessing
## (1) Remove Noisy Characters & Tokenize Words

In [4]:
def remove_noise(data):
    """Strip e-mail addresses, collapse whitespace and drop single quotes.

    Args:
        data: iterable of document strings.

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Raw string literals: '\S' and '\s' inside a normal string are invalid
    # escape sequences and trigger warnings on current Python versions.
    email_pattern = re.compile(r'\S*@\S*\s?')
    whitespace_pattern = re.compile(r'\s+')

    cleaned = []
    for doc in data:
        # Remove e-mail addresses (plus one trailing whitespace character)
        doc = email_pattern.sub('', doc)
        # Collapse newlines / runs of whitespace into a single space
        doc = whitespace_pattern.sub(' ', doc)
        # Remove distracting single quotes
        cleaned.append(doc.replace("'", ""))
    return cleaned

def doc_to_words(docs):
    """Lazily yield one token list per document.

    Each document is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    for raw_doc in docs:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens
        
# Clean every document, then materialize the tokenized documents as a list.
data = remove_noise(data_list)
data_words = [*doc_to_words(data)]

## (2) Remove Stopwords, Make Bigrams and Lemmatize

In [5]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document.

    Args:
        texts: iterable of documents (token lists in this notebook). Each
            document is passed through ``str`` and ``simple_preprocess``
            before filtering.

    Returns:
        A list of token lists containing only tokens that are not in the
        module-level ``stop_words``.
    """
    # stop_words is a list; build a set once so each membership test is a
    # hash lookup instead of a linear scan repeated for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each document in ``texts``."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    NOTE(review): both models are looked up as module-level names; no
    ``trigram_mod`` is defined in the visible part of this notebook, so
    calling this with a non-empty input would need it to be created first.
    """
    merged = []
    for tokens in texts:
        merged.append(trigram_mod[bigram_mod[tokens]])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens with an allowed POS tag.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: iterable of token lists; each list is joined with spaces and
            run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse part-of-speech tags (``token.pos_``) to keep.
            The default is an immutable tuple rather than a shared mutable
            list; any iterable of tag names is accepted.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    keep_tags = frozenset(allowed_postags)
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe processes the documents as a batched stream and yields Doc
    # objects in input order, avoiding one full pipeline call per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_tags]
        for doc in nlp.pipe(joined_docs)
    ]

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## (3) Create the Dictionary and Corpus needed for Topic Modeling

In [6]:
# Create Dictionary from all lemmatized documents
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words per document
corpus = list(map(id2word.doc2bow, texts))

# Step 3: Building Topic Model for All Data
## (1) Build the LDA Topic Model & View the Topics

In [7]:
# Build LDA model on the full corpus
lda_params = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

# Show the keyword weights of the learned topics
pprint(lda_model.print_topics())

[(0,
  '0.077*"bike" + 0.045*"insurance" + 0.042*"logic" + 0.037*"fear" + '
  '0.034*"dod" + 0.033*"click" + 0.033*"ride" + 0.032*"advice" + '
  '0.031*"straight" + 0.026*"islam"'),
 (1,
  '0.113*"count" + 0.041*"united_states" + 0.040*"april" + 0.037*"best" + '
  '0.031*"vote" + 0.027*"faqs" + 0.027*"planet" + 0.019*"percent" + '
  '0.018*"june" + 0.018*"andrew"'),
 (2,
  '0.094*"game" + 0.061*"team" + 0.052*"win" + 0.046*"play" + 0.035*"year" + '
  '0.033*"player" + 0.024*"league" + 0.019*"lose" + 0.019*"division" + '
  '0.018*"fan"'),
 (3,
  '0.820*"ax" + 0.058*"max" + 0.007*"fool" + 0.005*"stephen" + 0.004*"xv" + '
  '0.004*"florida" + 0.002*"sigh" + 0.001*"mb" + 0.000*"part" + 0.000*"end"'),
 (4,
  '0.166*"gun" + 0.062*"moon" + 0.060*"firearm" + 0.048*"handgun" + '
  '0.040*"weapon" + 0.035*"space_shuttle" + 0.028*"gm" + 0.015*"tank" + '
  '0.013*"safety" + 0.013*"ban"'),
 (5,
  '0.065*"pittsburgh" + 0.035*"amp" + 0.027*"gordon_banks" + 0.026*"surrender" '
  '+ 0.021*"intellect" +

## (2) Compute Model Perplexity and Coherence Score

In [8]:
# Compute Perplexity
# log_perplexity reports gensim's per-word likelihood bound (printed here under
# the "Perplexity" label); gensim derives the perplexity as 2 ** (-bound).
perplexity_bound = lda_model.log_perplexity(corpus)
print('\nPerplexity: ', perplexity_bound)

# Compute Coherence Score
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -14.314281254734093

Coherence Score:  0.5236575485943878


## (3) Gather the Labels of the Held-Out Data (Last 5%)

In [9]:
###
# Hand code the similarity
# Remember the purpose of this experiment is to analyse the precision and recall of this method
#
# This experiment does not work. 
# (1) The topics only focus on topics like no. 0, 6, and 17.
# (2) The topic id will be different for each attempt, so might need to hand code the topics for 
#     the "all_data" and "train_data" id labels to match
# (3) Even taking 10% of the data may cause change to the topic (consistency issue)
###

# Record doc_id and its most dominant topic
# len(train_data_lemma)
# for doc_number in range(len(train_data_lemma),len(data_lemmatized)):
#     topic_list = sorted(lda_model.get_document_topics(corpus[doc_number],minimum_probability=0.0), key=lambda x: (x[1]), reverse=True)
#     if topic_list[0][0] == 6:
#         print("Topic No.", topic_list[1][0], topic_list[1][1])
#     else:
#         print("Topic No.", topic_list[0][0], topic_list[0][1])

# Step 4: Apply Training Data to Build Model
## (1) Preprocessing

In [10]:
# Split the lemmatized documents at the same index as the raw documents
train_data_lemma = data_lemmatized[:len_nintyfive_per]
test_data_lemma = data_lemmatized[len_nintyfive_per:]

# Create Dictionary (training documents only)
train_id2word = corpora.Dictionary(train_data_lemma)

# Create Corpus
train_texts = train_data_lemma

# Term Document Frequency
# Encode with train_id2word -- the same dictionary that is handed to the
# training LDA model -- so the token ids in the corpus and in the model's
# id2word mapping are guaranteed to agree. The full-data `id2word` was used
# here before, which only lined up because the training documents are a
# prefix of the full document list.
train_corpus = [train_id2word.doc2bow(text) for text in train_texts]

## (2) Build Model

In [11]:
# Build LDA model on the training corpus
train_lda_params = dict(
    corpus=train_corpus,
    id2word=train_id2word,
    num_topics=20,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
train_lda_model = gensim.models.ldamodel.LdaModel(**train_lda_params)

# Show the keyword weights of the learned topics
pprint(train_lda_model.print_topics())

[(0,
  '0.085*"sale" + 0.058*"bike" + 0.039*"keyboard" + 0.036*"ride" + '
  '0.036*"count" + 0.034*"unit" + 0.034*"tape" + 0.033*"fear" + 0.027*"dod" + '
  '0.027*"advice"'),
 (1,
  '0.094*"car" + 0.056*"buy" + 0.054*"price" + 0.045*"sell" + 0.032*"pay" + '
  '0.031*"cost" + 0.023*"model" + 0.020*"money" + 0.018*"drive" + '
  '0.017*"dealer"'),
 (2,
  '0.033*"go" + 0.026*"get" + 0.022*"year" + 0.017*"time" + 0.017*"good" + '
  '0.014*"not" + 0.013*"day" + 0.013*"s" + 0.013*"back" + 0.012*"first"'),
 (3,
  '0.134*"game" + 0.118*"team" + 0.071*"play" + 0.070*"win" + 0.032*"hockey" + '
  '0.032*"season" + 0.025*"goal" + 0.024*"roger" + 0.022*"baseball" + '
  '0.020*"stat"'),
 (4,
  '0.086*"god" + 0.055*"christian" + 0.033*"believe" + 0.032*"bible" + '
  '0.028*"church" + 0.027*"religion" + 0.025*"belief" + 0.024*"faith" + '
  '0.020*"jesus" + 0.018*"exist"'),
 (5,
  '0.872*"ax" + 0.060*"max" + 0.004*"mono" + 0.001*"xv" + 0.001*"mb" + '
  '0.000*"part" + 0.000*"reply" + 0.000*"university" 

In [12]:
# Compute Perplexity
# log_perplexity reports gensim's per-word likelihood bound (printed here under
# the "Perplexity" label); gensim derives the perplexity as 2 ** (-bound).
train_perplexity_bound = train_lda_model.log_perplexity(train_corpus)
print('\nPerplexity: ', train_perplexity_bound)

# Compute Coherence Score
train_coherence_model_lda = CoherenceModel(
    model=train_lda_model,
    texts=train_data_lemma,
    dictionary=train_id2word,
    coherence='c_v',
)
train_coherence_lda = train_coherence_model_lda.get_coherence()
print('\nCoherence Score: ', train_coherence_lda)


Perplexity:  -14.216960468199522

Coherence Score:  0.47801838837242494


# Step 5: Predict New Documents

## (1) Predicting

In [18]:
test_doc_no = 4
held_out_tokens = test_data_lemma[test_doc_no]

# Create Dictionary
# NOTE(review): this per-document dictionary is not used below; the document
# is encoded with train_id2word.
test_this_id2word = corpora.Dictionary([held_out_tokens])

# Create Corpus
test_this_texts = [held_out_tokens]

# Term Document Frequency (encoded with the training dictionary)
test_this_corpus = [train_id2word.doc2bow(text) for text in test_this_texts]

# Run the held-out document through the trained model
unseen_doc = test_this_corpus[0]
vector = train_lda_model[unseen_doc]

print("Document No.", test_doc_no)
print(test_data_list[test_doc_no])

print("The topic distribution of this new document is:")
# vector[0] holds (topic_id, probability) pairs; list them most probable first
ranked_doc_topics = sorted(vector[0], key=lambda pair: pair[1], reverse=True)
for topic_id, percentage in ranked_doc_topics:
    print("Topic No.", topic_id, "\tProb.", "%.2f"%percentage)

Document No. 4
From: gsnow@clark.edu (Gary Snow)
Article-I.D.: clark.1993Apr6.210853.26502
Organization: Clark College, Vancouver, Wa.  USA
Lines: 20

In article <D2150035.ub9c68@outpost.SF-Bay.org> peirce@outpost.SF-Bay.org (Michael Peirce) writes:
>
>Surprised? Shouldn't be.  Protective tarriffs almost always end up
>hurting the U.S. in the long run.  Same with subsidies.  they way
>to build a strong economy isn't to wall it off from the tough outside
>world, but rather to compete in the global market place (and don't
>come crying when the world doesn't always want to play by our house
>rules).

Tell that to the Japanese, their local market is neatly protected by
the Japanese government. Its one very tough nut to crack. In fact
the only current way to break into it, is to do it with a Japanese
company as a partner in the venture.
 
Gary

-- 
-----
Gary Snow
uunet!clark!gsnow  or  gsnow@clark.edu

The topic distribution of this new document is:
Topic No. 19 	Prob. 0.39
Topic No. 2 	Pr

## (2) Updating

In [14]:
# Continue training train_lda_model on the single held-out document in
# test_this_corpus. The model object itself is modified by this call, so
# re-running the cell applies the update again and every later cell sees the
# updated model.
train_lda_model.update(test_this_corpus)

  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words


# Step 6: Predict New Sentence

## (1) Preprocessing the Sentence

In [20]:
pre_sentence = "They soft-landed on Mars - the least we could do is soft-land on Earth!"

# Clean and tokenize the sentence exactly like the corpus documents
sent_in_list = remove_noise([pre_sentence])
sentence_words = list(doc_to_words(sent_in_list))

# Sentence-level phrase model.
# NOTE(review): with min_count=5 a model trained on one short sentence is
# unlikely to learn any phrase, and make_bigrams() below applies the
# corpus-level bigram_mod, not sent_bigram_mod. Both objects are kept only so
# that the existing names stay defined.
sent_bigram = gensim.models.Phrases(sentence_words, min_count=5, threshold=100) # higher threshold fewer phrases.
sent_bigram_mod = gensim.models.phrases.Phraser(sent_bigram)

# Remove Stop Words
sent_words_nostops = remove_stopwords(sentence_words)

# Form Bigrams (with the bigram model trained on the corpus)
sent_words_bigrams = make_bigrams(sent_words_nostops)

# The spaCy pipeline `nlp` loaded during corpus preprocessing is reused here;
# loading the same model a second time only cost time and memory.

# Do lemmatization keeping only noun, adj, vb, adv
sent_lemma = lemmatization(sent_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

## (2) Encode the Sentence with the Training Dictionary and Predict the Topic

In [22]:
sentence_tokens = sent_lemma[0]

# Create Dictionary
# NOTE(review): this per-sentence dictionary is not used below; the sentence
# is encoded with train_id2word.
this_sent_id2word = corpora.Dictionary([sentence_tokens])

# Create Corpus
this_sent_texts = [sentence_tokens]

# Term Document Frequency (encoded with the training dictionary)
this_sent_corpus = [train_id2word.doc2bow(text) for text in this_sent_texts]

# Run the sentence through the trained model
unseen_doc = this_sent_corpus[0]
vector = train_lda_model[unseen_doc]


print("Sentence:", pre_sentence)

print("The topic distribution of this sentence is:")
# vector[0] holds (topic_id, probability) pairs; list them most probable first
ranked_sentence_topics = sorted(vector[0], key=lambda pair: pair[1], reverse=True)
for topic_id, percentage in ranked_sentence_topics:
    print("Topic No.", topic_id, "\tProb.", "%.2f"%percentage)

Sentence: They soft-landed on Mars - the least we could do is soft-land on Earth!
The topic distribution of this sentence is:
Topic No. 19 	Prob. 0.39
Topic No. 18 	Prob. 0.15
Topic No. 16 	Prob. 0.10
Topic No. 2 	Prob. 0.10
Topic No. 11 	Prob. 0.09
Topic No. 9 	Prob. 0.06
Topic No. 8 	Prob. 0.03
Topic No. 1 	Prob. 0.01
