In [1]:
# download news group data set from sklearn
from sklearn.datasets import fetch_20newsgroups

In [2]:
#assign train and test data
ng_train = fetch_20newsgroups(subset = 'train',shuffle=True)
ng_test   = fetch_20newsgroups(subset = 'test',shuffle=True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
print(list(ng_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
#sample news
ng_train.data[:3]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [5]:
#shape of the subsets
print(ng_train.filenames.shape,ng_train.target.shape)

(11314,) (11314,)


In [6]:
#Data Preprocessing
#Tokenization
'''
Loading gensim and nltk libraries
'''
#!pip install gensim
import gensim

# Converts into tokens (Alternative to word_tokenize)
from gensim.utils import simple_preprocess

from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer, SnowballStemmer
#from nltk.stem import *
import numpy as np
np.random.seed(400)

In [7]:
import nltk
#nltk.download('wordnet')

In [8]:
#testing the lemmatize statment. It should convert past or future tense into present tense
WordNetLemmatizer().lemmatize('runs')

'run'

In [9]:
#testing the stemming part before preprocessing. This should be changing any plural into singular word
import pandas as pd
stemmer = SnowballStemmer("english")
original_words=['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']

singles= [WordNetLemmatizer().lemmatize(plural,pos='v') for plural in original_words]

pd.DataFrame(data={'Original Words':original_words, 'Lemma':singles})

#Stemma is not performing well 
#singles= [stemmer.stem(plural) for plural in original_words]
#singles2= [stemmer.stem(plural2) for plural2 in singles]
#stemma = pd.DataFrame(data={'Lemma':singles, 'Stemmed':singles2})



Unnamed: 0,Original Words,Lemma
0,caresses,caress
1,flies,fly
2,dies,die
3,mules,mules
4,denied,deny
5,died,die
6,agreed,agree
7,owned,own
8,humbled,humble
9,sized,size


In [10]:
# writing function for the entire dataset
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos='v'))

#Tokenize and Lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>3:
            result.append(lemmatize_stemming(token))
            
    return result
        

In [11]:
#preview a document before preprocessing

#document_num = 50
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original Document: ")
words=[]
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original Document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [12]:
processed_docs = []

for doc in ng_train.data:
    processed_docs.append(preprocess(doc))

In [13]:
len(processed_docs)

11314

In [14]:
print(processed_docs[:2])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

In [15]:
#bag of words on the data set
dictionary = gensim.corpora.Dictionary(processed_docs)

In [16]:
print (dictionary)

Dictionary(61411 unique tokens: ['addit', 'bodi', 'bricklin', 'bring', 'bumper']...)


In [17]:
#Lets see if dictionary created succesfully
count=0
for k,v in dictionary.iteritems():
    print (k, v)
    count +=1
    if count >20:
        break

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten
11 funki
12 histori
13 host
14 info
15 know
16 late
17 lerxst
18 line
19 look
20 mail


In [18]:
#remove rare and repeatative words
dictionary.filter_extremes(no_below=15,no_above=0.1,keep_n=100000)

In [19]:
print (dictionary)

Dictionary(6535 unique tokens: ['addit', 'bodi', 'bring', 'bumper', 'call']...)


In [20]:
#Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
#words and how many times those words appear. Save this to 'bow_corpus'
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [21]:
bow_corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 2),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1)]

In [22]:
#preview 
document_num = 10
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]],
                                                     bow_doc_x[i][1]))

Word 14 ("model") appears 1 time.
Word 33 ("clock") appears 1 time.
Word 46 ("keyword") appears 1 time.
Word 63 ("summari") appears 1 time.
Word 83 ("email") appears 1 time.
Word 101 ("opinion") appears 1 time.
Word 130 ("worth") appears 1 time.
Word 146 ("nice") appears 1 time.
Word 227 ("hard") appears 1 time.
Word 385 ("owner") appears 1 time.
Word 399 ("axi") appears 1 time.
Word 400 ("beemer") appears 1 time.
Word 401 ("bike") appears 2 time.
Word 402 ("brown") appears 1 time.
Word 403 ("ducati") appears 2 time.
Word 404 ("expir") appears 1 time.
Word 405 ("fade") appears 1 time.
Word 406 ("leak") appears 2 time.
Word 407 ("lonestar") appears 2 time.
Word 408 ("motor") appears 1 time.
Word 409 ("orang") appears 1 time.
Word 410 ("paint") appears 1 time.
Word 411 ("pop") appears 1 time.
Word 412 ("recommend") appears 1 time.
Word 413 ("richardson") appears 2 time.
Word 414 ("run") appears 1 time.
Word 415 ("sell") appears 1 time.
Word 416 ("shop") appears 1 time.
Word 417 ("stabl")

In [23]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 10, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [24]:
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.010*"bike" + 0.006*"game" + 0.005*"run" + 0.005*"team" + 0.005*"virginia" + 0.004*"motorcycl" + 0.004*"pitch" + 0.004*"player" + 0.004*"play" + 0.004*"ride"


Topic: 1 
Words: 0.010*"armenian" + 0.008*"israel" + 0.007*"isra" + 0.007*"kill" + 0.006*"turkish" + 0.005*"weapon" + 0.005*"jew" + 0.005*"govern" + 0.005*"arab" + 0.004*"gun"


Topic: 2 
Words: 0.020*"game" + 0.017*"team" + 0.013*"play" + 0.010*"player" + 0.010*"hockey" + 0.006*"season" + 0.006*"canada" + 0.005*"leagu" + 0.005*"score" + 0.005*"andrew"


Topic: 3 
Words: 0.014*"window" + 0.013*"card" + 0.009*"driver" + 0.006*"sale" + 0.006*"video" + 0.006*"monitor" + 0.006*"speed" + 0.006*"appl" + 0.005*"price" + 0.005*"softwar"


Topic: 4 
Words: 0.016*"file" + 0.011*"program" + 0.010*"window" + 0.007*"imag" + 0.006*"avail" + 0.006*"data" + 0.005*"version" + 0.005*"server" + 0.005*"graphic" + 0.005*"applic"


Topic: 5 
Words: 0.018*"space" + 0.013*"nasa" + 0.007*"scienc" + 0.007*"orbit" + 0.006*"launch" + 0.00

In [25]:
num = 100
unseen_document = ng_test.data[num]
print(unseen_document)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [26]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 30)))

Score: 0.6123292446136475	 Topic: 0.014*"window" + 0.013*"card" + 0.009*"driver" + 0.006*"sale" + 0.006*"video" + 0.006*"monitor" + 0.006*"speed" + 0.006*"appl" + 0.005*"price" + 0.005*"softwar" + 0.005*"memori" + 0.004*"version" + 0.004*"machin" + 0.004*"mous" + 0.004*"cwru" + 0.004*"color" + 0.004*"engin" + 0.004*"printer" + 0.004*"modem" + 0.004*"sell" + 0.004*"cleveland" + 0.003*"simm" + 0.003*"board" + 0.003*"port" + 0.003*"upgrad" + 0.003*"instal" + 0.003*"mode" + 0.003*"access" + 0.003*"sound" + 0.003*"offer"
Score: 0.3618493974208832	 Topic: 0.016*"file" + 0.011*"program" + 0.010*"window" + 0.007*"imag" + 0.006*"avail" + 0.006*"data" + 0.005*"version" + 0.005*"server" + 0.005*"graphic" + 0.005*"applic" + 0.005*"sourc" + 0.005*"user" + 0.005*"list" + 0.005*"softwar" + 0.005*"code" + 0.004*"display" + 0.004*"send" + 0.004*"email" + 0.004*"wire" + 0.004*"manag" + 0.003*"messag" + 0.003*"current" + 0.003*"output" + 0.003*"widget" + 0.003*"function" + 0.003*"build" + 0.003*"format" 

In [27]:
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda x: x[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.3618205189704895	 Topic: 0.016*"file" + 0.011*"program" + 0.010*"window" + 0.007*"imag" + 0.006*"avail"
Score: 0.6123580932617188	 Topic: 0.014*"window" + 0.013*"card" + 0.009*"driver" + 0.006*"sale" + 0.006*"video"


In [28]:
print(ng_test.target[1])

5
