In [1]:
import numpy as np
import pandas as pd
import gensim
import nltk
import sklearn



In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

#### 1. Download Data

In [3]:
#fetch_20newsgroups?

In [4]:
raw_data = fetch_20newsgroups(subset='train', shuffle=True,
                              random_state=42, remove=('headers', 'footers', 'quotes'))

In [5]:
documents = raw_data.data
target = raw_data.target_names

In [6]:
target[5]

'comp.windows.x'

In [7]:
documents[5]

'\n\n\n\n\nOf course.  The term must be rigidly defined in any bill.\n\n\nI doubt she uses this term for that.  You are using a quote allegedly\nfrom her, can you back it up?\n\n\n\n\nI read the article as presenting first an argument about weapons of mass\ndestruction (as commonly understood) and then switching to other topics.\nThe first point evidently was to show that not all weapons should be\nallowed, and then the later analysis was, given this understanding, to\nconsider another class.\n\n\n\n'

#### 2. Pre-process data and create a BOW dictionary  
- Remove digits,there are a lot if them,  punctuations, replace with space(nltk)
- remove short words or letters 
- lemmatize(nltk)
- Tokenize using white space, remove stopwords

In [8]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
#import string

In [170]:
nltk.download('stopwords', 'wordnet')

[nltk_data] Downloading package stopwords to wordnet...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
lemma = WordNetLemmatizer()
#punc = string.punctuation
stop_words = stopwords.words('english')

In [10]:
len(documents)

11314

In [11]:
docs_series = pd.Series(documents)

In [12]:
documents_1 = docs_series.str.replace('[^a-zA-Z]', ' ')
documents_2 = documents_1.apply(lambda doc : ' '.join([wrd.lower() for wrd in doc.split() if len(wrd) >3]))
documents_3 = documents_2.apply(lambda doc : ' '.join([lemma.lemmatize(word) for word in doc.split()]))
documents_4 = documents_3.apply(lambda doc : ' '.join([word for word in doc.split() if word not in stop_words]))

In [13]:
print(documents[3],'\n',documents_1[3], '\n',documents_2[3],'\n',documents_3[3],'\n',documents_4[3])


Do you have Weitek's address/phone number?  I'd like to get some information
about this chip.
 
  Do you have Weitek s address phone number   I d like to get some information about this chip   
 have weitek address phone number like some information about this chip 
 have weitek address phone number like some information about this chip 
 weitek address phone number like information chip


In [14]:
#lemma.lemmatize('corpora')

#### min_df, max_df affect vocab formation, max_features filters after vocab is formed

#### 3. From BOW, create a TF-IDF document representation

In [15]:
vect_tfidf = TfidfVectorizer(lowercase=True, analyzer='word',
                        ngram_range=(1,1), min_df=1,max_df=0.9,max_features=1000,
                        use_idf=True, norm='l2', smooth_idf=True, sublinear_tf=False)
dtm_1 = vect_tfidf.fit_transform(documents_4)

In [16]:
dtm_1.shape

(11314, 1000)

In [17]:
#vect_tfidf.get_feature_names()
#temp.vocabulary_['800']
#max(sorted(vect_tfidf.vocabulary_.values()))

In [18]:
#TfidfVectorizer?

#### 4. On the matrix, apply a truncated SVD 
- inspect document-topic matrix U 
- S matrix 
- topic-term matrix i.e Vt, and guess topics

In [19]:
from sklearn.decomposition import TruncatedSVD

In [20]:
svd = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100,
            random_state=42)
svd.fit(dtm_1)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=42, tol=0.0)

In [21]:
svd.components_.shape

(20, 1000)

In [233]:
terms = np.array(vect_tfidf.get_feature_names())
for i in range(svd.components_.shape[0]):
    arr = svd.components_[i, :]
    print('Topic {0} : {1}'.format(i, terms[np.argsort(arr)[0:7]]))
#svd.components_[0,0:5]

Topic 0 : ['stephanopoulos' 'char' 'jpeg' 'null' 'azerbaijani' 'stream' 'remark']
Topic 1 : ['people' 'year' 'think' 'team' 'right' 'game' 'christian']
Topic 2 : ['people' 'file' 'christian' 'window' 'jesus' 'government' 'would']
Topic 3 : ['game' 'window' 'thanks' 'file' 'anyone' 'team' 'know']
Topic 4 : ['thanks' 'please' 'anyone' 'know' 'doe' 'mail' 'would']
Topic 5 : ['file' 'please' 'mail' 'space' 'system' 'email' 'address']
Topic 6 : ['card' 'chip' 'driver' 'video' 'system' 'monitor' 'sale']
Topic 7 : ['people' 'card' 'christian' 'doe' 'please' 'armenian' 'jesus']
Topic 8 : ['would' 'window' 'please' 'card' 'jesus' 'christian' 'mail']
Topic 9 : ['would' 'game' 'card' 'chip' 'system' 'file' 'team']
Topic 10 : ['window' 'right' 'armenian' 'thanks' 'drive' 'government' 'year']
Topic 11 : ['window' 'chip' 'game' 'system' 'christian' 'jesus' 'doe']
Topic 12 : ['like' 'window' 'people' 'right' 'game' 'chip' 'government']
Topic 13 : ['doe' 'year' 'would' 'anyone' 'window' 'sale' 'price'

In [238]:
svd.singular_values_.s

(20,)

In [227]:
terms_arr[[0,2]]

array(['ability', 'accept'], dtype='<U14')

In [65]:
len(temp.get_feature_names())

72916

#### 5. Evaluate how orthogonal the topics are