In [8]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)



In [67]:
data = pd.read_csv(r'../news_classification_engine/dataset.csv', encoding="ISO-8859-1");

In [68]:
data_text = data[['news']]
data_text['index'] = data_text.index
documents = data_text

In [140]:
print(len(documents))
print(documents.loc[2210])

2225
news     Mobiles rack up 20 years of use\n \n Mobile ph...
index                                                 2210
Name: 2210, dtype: object


In [70]:
stemmer = PorterStemmer()
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

preview preprocessing on selected document

In [71]:
doc_sample = documents[documents['index'] == 2210].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Mobiles', 'rack', 'up', '20', 'years', 'of', 'use\n', '\n', 'Mobile', 'phones', 'in', 'the', 'UK', 'are', 'celebrating', 'their', '20th', 'anniversary', 'this', 'weekend.\n', '\n', "Britain's", 'first', 'mobile', 'phone', 'call', 'was', 'made', 'across', 'the', 'Vodafone', 'network', 'on', '1', 'January', '1985', 'by', 'veteran', 'comedian', 'Ernie', 'Wise.', 'In', 'the', '20', 'years', 'since', 'that', 'day,', 'mobile', 'phones', 'have', 'become', 'an', 'integral', 'part', 'of', 'modern', 'life', 'and', 'now', 'almost', '90%', 'of', 'Britons', 'own', 'a', 'handset.', 'Mobiles', 'have', 'become', 'so', 'popular', 'that', 'many', 'people', 'use', 'their', 'handset', 'as', 'their', 'only', 'phone', 'and', 'rarely', 'use', 'a', 'landline.\n', '\n', 'The', 'first', 'ever', 'call', 'over', 'a', 'portable', 'phone', 'was', 'made', 'in', '1973', 'in', 'New', 'York', 'but', 'it', 'took', '10', 'years', 'for', 'the', 'first', 'commercial', 'mobile', 'service', 'to', 'be', 

In [72]:
processed_docs = documents['news'].map(preprocess)
processed_docs[:10]

0    [china, role, yuko, split, china, lend, russia...
1    [rebound, weather, effect, price, recov, asian...
2    [indonesia, declin, debt, freez, indonesia, lo...
3    [payoff, shell, boss, shell, financ, chief, st...
4    [bank, settlement, bank, america, subsidiari, ...
5    [verizon, seal, takeov, verizon, takeov, battl...
6    [parmalat, boast, doubl, profit, parmalat, ita...
7    [seek, smoker, rule, justic, depart, overturn,...
8    [steel, firm, job, mittal, steel, world, large...
9    [car, pull, retail, figur, retail, sale, fell,...
Name: news, dtype: object

### Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0 # showing only 10 words
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abl
1 accord
2 agre
3 ambit
4 analyst
5 announc
6 aton
7 attempt
8 auction
9 bank
10 bankruptci


#### Gensim filter_extremes

Filter out tokens that appear in

    - less than 15 documents (absolute number) or
    - more than 0.5 documents (fraction of total corpus size, not absolute number).
    - after the above two steps, keep only the first 100000 most frequent tokens.

In [16]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#### Gensim doc2bow

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[2210]

[(39, 1),
 (45, 1),
 (50, 2),
 (59, 1),
 (106, 1),
 (133, 1),
 (134, 2),
 (157, 1),
 (178, 1),
 (179, 1),
 (222, 3),
 (271, 2),
 (300, 1),
 (307, 1),
 (326, 1),
 (340, 1),
 (346, 3),
 (353, 1),
 (364, 1),
 (366, 1),
 (374, 1),
 (381, 1),
 (383, 1),
 (398, 12),
 (399, 1),
 (401, 4),
 (405, 10),
 (523, 1),
 (541, 1),
 (548, 3),
 (555, 1),
 (558, 3),
 (581, 1),
 (585, 1),
 (609, 2),
 (614, 2),
 (624, 2),
 (647, 2),
 (682, 1),
 (703, 3),
 (765, 1),
 (808, 4),
 (843, 2),
 (851, 2),
 (863, 1),
 (936, 1),
 (939, 1),
 (1018, 1),
 (1040, 1),
 (1062, 1),
 (1192, 5),
 (1213, 1),
 (1215, 1),
 (1367, 1),
 (1373, 1),
 (1398, 1),
 (1413, 1),
 (1448, 1),
 (1495, 1),
 (1589, 1),
 (1615, 1),
 (1616, 1),
 (1634, 1),
 (1718, 2),
 (1720, 1),
 (1790, 1),
 (1860, 1),
 (1911, 1),
 (1990, 1),
 (2037, 1),
 (2121, 1),
 (2151, 1),
 (2288, 7),
 (2343, 1),
 (2403, 1),
 (2462, 2),
 (2593, 1),
 (2610, 1),
 (2860, 1)]

Preview Bag Of Words for our sample preprocessed document.

In [24]:
bow_doc_4310 = bow_corpus[2210]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 39 ("firm") appears 1 time.
Word 45 ("global") appears 1 time.
Word 50 ("help") appears 2 time.
Word 59 ("littl") appears 1 time.
Word 106 ("size") appears 1 time.
Word 133 ("day") appears 1 time.
Word 134 ("despit") appears 2 time.
Word 157 ("market") appears 1 time.
Word 178 ("world") appears 1 time.
Word 179 ("york") appears 1 time.
Word 222 ("oper") appears 3 time.
Word 271 ("januari") appears 2 time.
Word 300 ("time") appears 1 time.
Word 307 ("busi") appears 1 time.
Word 326 ("head") appears 1 time.
Word 340 ("offic") appears 1 time.
Word 346 ("popular") appears 3 time.
Word 353 ("second") appears 1 time.
Word 364 ("total") appears 1 time.
Word 366 ("wide") appears 1 time.
Word 374 ("billion") appears 1 time.
Word 381 ("commun") appears 1 time.
Word 383 ("cost") appears 1 time.
Word 398 ("mobil") appears 12 time.
Word 399 ("month") appears 1 time.
Word 401 ("network") appears 4 time.
Word 405 ("phone") appears 10 time.
Word 523 ("young") appears 1 time.
Word 541 ("huge") app

### TF-IDF


In [26]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# from pprint import pprint
# for doc in corpus_tfidf:
#     pprint(doc)
#     break

#### Running LDA using Bag of Words

In [31]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=3, id2word=dictionary, passes=2, workers=2)

In [32]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"game" + 0.006*"peopl" + 0.005*"time" + 0.005*"compani" + 0.005*"play" + 0.004*"world" + 0.004*"firm" + 0.004*"player" + 0.004*"come" + 0.004*"think"
Topic: 1 
Words: 0.007*"peopl" + 0.006*"time" + 0.005*"mobil" + 0.005*"phone" + 0.005*"world" + 0.004*"servic" + 0.004*"like" + 0.004*"come" + 0.004*"work" + 0.004*"game"
Topic: 2 
Words: 0.006*"film" + 0.005*"peopl" + 0.005*"govern" + 0.004*"game" + 0.004*"time" + 0.004*"best" + 0.004*"go" + 0.004*"labour" + 0.004*"like" + 0.004*"plan"


#### Running LDA using TF-IDF

In [34]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=4)

In [36]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWord: {}'.format(idx, topic))

Topic: 0 
Word: 0.004*"elect" + 0.004*"labour" + 0.004*"parti" + 0.003*"tori" + 0.003*"blair" + 0.003*"brown" + 0.003*"govern" + 0.003*"howard" + 0.002*"minist" + 0.002*"plan"
Topic: 1 
Word: 0.003*"mobil" + 0.003*"phone" + 0.002*"peopl" + 0.002*"firm" + 0.002*"servic" + 0.002*"technolog" + 0.002*"music" + 0.002*"china" + 0.002*"user" + 0.002*"compani"
Topic: 2 
Word: 0.005*"film" + 0.004*"game" + 0.003*"award" + 0.003*"best" + 0.003*"play" + 0.002*"england" + 0.002*"star" + 0.002*"club" + 0.002*"player" + 0.002*"music"


Performance evaluation by classifying sample document using LDA Bag of Words model

In [52]:
data.loc[2210],

(news    Mobiles rack up 20 years of use\n \n Mobile ph...
 type                                                 tech
 Name: 2210, dtype: object,)

In [45]:
for index, score in sorted(lda_model[bow_corpus[2210]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9943941235542297	 
Topic: 0.007*"peopl" + 0.006*"time" + 0.005*"mobil" + 0.005*"phone" + 0.005*"world" + 0.004*"servic" + 0.004*"like" + 0.004*"come" + 0.004*"work" + 0.004*"game"


Performance evaluation by classifying sample document using LDA TF-IDF model.

In [47]:
for index, score in sorted(lda_model_tfidf[bow_corpus[2210]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9945883750915527	 
Topic: 0.003*"mobil" + 0.003*"phone" + 0.002*"peopl" + 0.002*"firm" + 0.002*"servic" + 0.002*"technolog" + 0.002*"music" + 0.002*"china" + 0.002*"user" + 0.002*"compani"


Testing model on unseen document

In [103]:
unseen_document = '''
Sydney gamer has been charged after he was heard allegedly assaulting a woman during a live stream of the hugely popular game Fortnite.

The 26-year-old man who goes by the handle MrDeadMoth was arrested after someone saw the video and reported it to police.

In the video, that went viral on social media, a woman can be heard screaming off-camera.

Two children were inside the home at the time of the alleged assault.
'''

unseen_document = ' '.join(preprocess(unseen_document))

In [105]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
    
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.9724864363670349	 Topic: 0.007*"peopl" + 0.006*"time" + 0.005*"mobil" + 0.005*"phone" + 0.005*"world"
Score: 0.014182060025632381	 Topic: 0.006*"film" + 0.005*"peopl" + 0.005*"govern" + 0.004*"game" + 0.004*"time"
Score: 0.013331498019397259	 Topic: 0.007*"game" + 0.006*"peopl" + 0.005*"time" + 0.005*"compani" + 0.005*"play"
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Score: 0.6965793371200562	 Topic: 0.007*"game" + 0.006*"peopl" + 0.005*"time" + 0.005*"compani" + 0.005*"play"
Score: 0.28790152072906494	 Topic: 0.007*"peopl" + 0.006*"time" + 0.005*"mobil" + 0.005*"phone" + 0.005*"world"
Score: 0.015519153326749802	 Topic: 0.006*"film" + 0.005*"peopl" + 0.005*"govern" + 0.004*"game" + 0.004*"time"


In [59]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [63]:
lda = LatentDirichletAllocation(n_components=5,random_state=0)

In [91]:
test = processed_docs.tolist()

In [92]:
processed_docs_vec = [' '.join(ele) for ele in test]

In [124]:
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(processed_docs_vec)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer( max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(processed_docs_vec)
tf_feature_names = tf_vectorizer.get_feature_names()

In [94]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

no_topics = 3

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)




In [96]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
say govern peopl labour year elect compani firm blair market
Topic 1:
game play england player match team say injuri final wale
Topic 2:
film award best star oscar nomin actor actress festiv director
Topic 0:
game say year play film time best player world award
Topic 1:
say govern elect labour peopl parti year minist plan tell
Topic 2:
say peopl year mobil technolog phone servic firm market user


In [126]:
tf_vectorizer.fit_transform([unseen_document])
predict = lda.transform(tf)