<a href="https://colab.research.google.com/github/Vish4github/Projects/blob/master/TopicModeling_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt

In [0]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [0]:
categories=['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
remove =('header','footers','quotes')
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories,remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test',categories=categories,remove=remove)


In [22]:
newsgroups_train.filenames.shape,newsgroups_train.target.shape

((2034,), (2034,))

In [23]:
newsgroups_train.data[:1]

["From: rych@festival.ed.ac.uk (R Hawkes)\nSubject: 3DS: Where did all the texture rules go?\nLines: 21\n\nHi,\n\nI've noticed that if you only save a model (with all your mapping planes\npositioned carefully) to a .3DS file that when you reload it after restarting\n3DS, they are given a default position and orientation.  But if you save\nto a .PRJ file their positions/orientation are preserved.  Does anyone\nknow why this information is not stored in the .3DS file?  Nothing is\nexplicitly said in the manual about saving texture rules in the .PRJ file. \nI'd like to be able to read the texture rule information, does anyone have \nthe format for the .PRJ file?\n\nIs the .CEL file format available from somewhere?\n\nRych"]

In [24]:
print("\n".join(newsgroups_train.data[:4]))

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
Subject: Re: Biblical Backing of Koresh's 3-02 Tape (Cites enclosed)
From: kmcvay@oneb.almanac.bc.ca (Ken Mcvay)
Organization: The Old Frog's Almanac
Lines: 20



Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to tak

In [25]:
np.array(newsgroups_train.target_names)[newsgroups_train.target[:4]]

array(['comp.graphics', 'talk.religion.misc', 'sci.space', 'alt.atheism'],
      dtype='<U18')

In [26]:
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

In [0]:
num_topics,num_top_words = 6,10 

In [0]:
from sklearn.feature_extraction import stop_words

In [29]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
from nltk import stem

In [0]:
wln=stem.WordNetLemmatizer()
porter=stem.porter.PorterStemmer()

In [32]:
word_list=['feet','foot','football','foots','footing']
word_list=['universe','university']
[wln.lemmatize(word) for word in word_list]


['universe', 'university']

In [33]:
[porter.stem(word) for word in word_list]

['univers', 'univers']

In [0]:
import spacy
nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
#nlp=spacy.load('en_core_web_sm')

In [0]:
from spacy.lemmatizer import Lemmatizer, ADJ, NOUN, VERB
lemmatizer = nlp.vocab.morphology.lemmatizer

In [43]:
[lemmatizer.lookup(word) for word in word_list]

['universe', 'university']

In [44]:
nlp.Defaults.stop_words -   stop_words.ENGLISH_STOP_WORDS

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'ca',
 'did',
 'does',
 'doing',
 'just',
 'make',
 "n't",
 'n‘t',
 'n’t',
 'quite',
 'really',
 'regarding',
 'say',
 'unless',
 'used',
 'using',
 'various',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}

In [45]:
stop_words.ENGLISH_STOP_WORDS - nlp.Defaults.stop_words

frozenset({'amoungst',
           'bill',
           'cant',
           'co',
           'con',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'eg',
           'etc',
           'fill',
           'find',
           'fire',
           'found',
           'hasnt',
           'ie',
           'inc',
           'interest',
           'ltd',
           'mill',
           'sincere',
           'system',
           'thick',
           'thin',
           'un'})

In [0]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [0]:
import nltk

In [0]:
vectorizer = CountVectorizer(stop_words='english')

In [49]:
vectors = vectorizer.fit_transform(newsgroups_train.data).todense()
vectors.shape

(2034, 29319)

In [50]:
print(len(newsgroups_train.data),vectors.shape)

2034 (2034, 29319)


In [0]:
#2034 postings and 29139 tokens

In [0]:
vocab = np.array(vectorizer.get_feature_names())

In [53]:
vocab[7000:7020]

array(['collaborators', 'collapse', 'collapsed', 'collapses', 'collar',
       'colleague', 'colleagues', 'collect', 'collected', 'collectibles',
       'collecting', 'collection', 'collections', 'collective',
       'collectively', 'collectors', 'collects', 'college', 'collegue',
       'collelo'], dtype='<U80')

SVD - Single Value  Decomposition

In [0]:
U,s,Vh=linalg.svd(vectors,full_matrices=False)

In [55]:
np.diag(s[:4])

array([[441.07406101,   0.        ,   0.        ,   0.        ],
       [  0.        , 297.03689229,   0.        ,   0.        ],
       [  0.        ,   0.        , 243.62076992,   0.        ],
       [  0.        ,   0.        ,   0.        , 223.03584121]])

In [0]:
reconstructed_vectors = U @ np.diag(s) @ Vh

In [57]:
np.linalg.norm(reconstructed_vectors-vectors)  #two ways to check both are same

2.951109052269698e-12

In [59]:
np.allclose(reconstructed_vectors,vectors)

True

In [0]:
num_top_words =8

def show_topics(a):
  top_words = lambda t:[vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
  topic_words=([top_words(t) for t in a])
  return [' '.join(t) for t in topic_words]

In [83]:
show_topics(Vh[:10])

['drube gesellschaft rdd rechenzentrum ipp reinhard _simple speech_',
 'edu space graphics data pub mail nasa 128',
 'space jesus launch god people satellite matthew atheists',
 'space launch satellite nasa commercial satellites market year',
 'jpeg graphics space edu ray pub mail send',
 'jesus matthew prophecy messiah psalm isaiah david said',
 'nasa space lunar mars probe moon missions probes',
 'image probe lunar surface mars probes moon orbit',
 'argument fallacy conclusion example true ad argumentum premises',
 'space image edu larson nasa organization lines subject']

In [63]:
Vh[:10]

array([[-0.0096691 , -0.01177086, -0.00002726, ..., -0.00000176,
        -0.00004423, -0.00107002],
       [ 0.00414983,  0.01884558,  0.00004775, ...,  0.00000226,
         0.0000783 ,  0.00169861],
       [ 0.00005385,  0.02069747,  0.00005967, ...,  0.00000721,
         0.00005593, -0.00164461],
       ...,
       [-0.0016227 , -0.04228493, -0.00014365, ...,  0.00001279,
         0.00020648,  0.00148198],
       [-0.00044781,  0.00601288,  0.00000922, ..., -0.00000259,
        -0.00000626, -0.00015666],
       [ 0.00380339, -0.01448013,  0.00014521, ...,  0.00003733,
         0.00039274,  0.00033987]])

In [65]:
vocab

array(['00', '000', '0000', ..., 'zxmkr08', 'zyeh', 'zyxel'], dtype='<U80')

NMF

In [0]:
m,n=vectors.shape

In [0]:
d=5

In [0]:
clf=decomposition.NMF(n_components=d,random_state=1)

In [0]:
W1 = clf.fit_transform(vectors)

In [0]:
H1= clf.components_

In [89]:
show_topics(H1)

['edu graphics pub mail 128 ray ftp send',
 'jpeg image gif file color images format quality',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists atheism does said',
 'image data available software processing ftp analysis edu']

In [0]:
vectorizer_tfidf=TfidfVectorizer(stop_words='english')    #term document matrix using tf-idf
vectors_tfidf=vectorizer_tfidf.fit_transform(newsgroups_train.data)

In [0]:
  W1 = clf.fit_transform(vectors_tfidf)
  H1 = clf.components_

In [92]:
show_topics(H1)

['god people jesus sandvik kent say don christian',
 'caltech keith edu schneider cco pasadena allan institute',
 'sgi livesey solntze wpd com jon posting nntp',
 'space edu nasa university gov organization lines subject',
 'access digex express online communications pat prb net']

In [96]:
from sklearn import decomposition
%time u,s,v = decomposition.randomized_svd(vectors,10)

CPU times: user 13.1 s, sys: 1.59 s, total: 14.7 s
Wall time: 10.6 s


In [0]:
#%pip install fbpca

In [98]:
import fbpca
import time
%time u,s,v = fbpca.pca(vectors,10)   #10 = no of singular values, essentially lets you select the no of topics

CPU times: user 2.48 s, sys: 538 ms, total: 3.02 s
Wall time: 1.67 s


In [99]:
%time u,s,v = np.linalg.svd(vectors,full_matrices=False)

CPU times: user 1min 21s, sys: 3.56 s, total: 1min 24s
Wall time: 46.9 s
