In [4]:
# source: https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
import os
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Load Dataset
# NOTE(review): DATA_DIR is an absolute, machine-specific location; point it at
# the folder that holds articles.txt when running on another machine.
DATA_DIR = "/Users/rawassizadeh/EVERYTHING/Work/TEACHING/CS 688_WebAnalyticsMining/toGithub/Clustering"
articles_path = os.path.join(DATA_DIR, "articles.txt")

# One document per line; surrounding whitespace (including the newline) is dropped.
# Iterating the handle streams the file instead of materialising it with readlines(),
# and the explicit encoding keeps decoding independent of the platform default.
with open(articles_path, "r", encoding="utf-8") as fin:
    documents_list = [line.strip() for line in fin]

# Initialize regex tokenizer (tokens are runs of word characters)
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF: lowercase the text, drop English stop words,
# keep unigrams only, and tokenize with the regex tokenizer above.
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize)

# Fit and Transform the documents into the TF-IDF document-term matrix
train_data = tfidf.fit_transform(documents_list)

In [5]:
# Define the number of topics or components
num_components = 10

# Create SVD object (fixed random_state keeps the decomposition reproducible)
lsa = TruncatedSVD(n_components=num_components, n_iter=100, random_state=42)

# Fit SVD model on data; the projected documents are not used below,
# so fit() is enough and nothing is computed just to be thrown away.
lsa.fit(train_data)

# Get Singular values and Components
Sigma = lsa.singular_values_
V_transpose = lsa.components_.T

# Vocabulary terms, aligned with the columns of lsa.components_.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it. tolist() yields plain
# Python strings so the printed lists stay readable.
if hasattr(tfidf, "get_feature_names_out"):
    terms = tfidf.get_feature_names_out().tolist()
else:
    terms = tfidf.get_feature_names()

# Print the topics with their terms: the 5 highest-weighted terms per component
for index, component in enumerate(lsa.components_):
    ranked = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _weight in ranked[:5]]
    print(f"Topic {index}: ", top_terms_list)

Topic 0:  ['s', 'trump', 'said', 'eu', 't']
Topic 1:  ['trump', 'clinton', 'republican', 'donald', 'cruz']
Topic 2:  ['s', 'league', 'season', 'min', 'leicester']
Topic 3:  ['eu', 'league', 'min', 'season', 'brexit']
Topic 4:  ['bank', 'banks', 'banking', 'rbs', 'financial']
Topic 5:  ['health', 'nhs', 'care', 'mental', 'patients']
Topic 6:  ['min', 'ball', 'corner', 'yards', 'goal']
Topic 7:  ['facebook', 'internet', 'online', 'users', 'twitter']
Topic 8:  ['film', 'films', 'movie', 'women', 'director']
Topic 9:  ['labour', 'party', 'bank', 'corbyn', 'film']


In [31]:
! pip install stop_words
! pip install pyLDAvis

#-------------- LDA example -----------
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim


  and should_run_async(code)




In [32]:
# Tokenizer that splits text into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')
# Set copy of the stop words so each membership test below is a hash lookup
en_stop_lookup = set(en_stop)

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."

# compile sample documents into a list
doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

# list for tokenized documents in loop (one list of stemmed tokens per document)
texts = []

# loop through document list
for doc in doc_set:

    # clean and tokenize document string
    tokens = tokenizer.tokenize(doc.lower())

    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in en_stop_lookup]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]

    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix (bag-of-words per document)
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model with 2 topics and 20 passes over the corpus.
# LDA training is stochastic; the fixed random_state (same seed as the SVD
# cell) makes the printed topics reproducible across runs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=20,
    random_state=42,
)

# apply the trained model to the corpus
doc_lda = ldamodel[corpus]
print(ldamodel.print_topics())



[(0, '0.091*"good" + 0.090*"brocolli" + 0.071*"brother" + 0.071*"mother" + 0.070*"eat" + 0.042*"like" + 0.042*"basebal" + 0.042*"lot" + 0.042*"around" + 0.042*"time"'), (1, '0.082*"health" + 0.060*"drive" + 0.059*"pressur" + 0.035*"caus" + 0.035*"tension" + 0.035*"increas" + 0.035*"may" + 0.035*"expert" + 0.035*"blood" + 0.035*"suggest"')]


  and should_run_async(code)


In [33]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()

# Prepare the visualisation from the trained model, its corpus and dictionary,
# going through the gensimvis alias imported at the top of this cell.
vis = gensimvis.prepare(ldamodel, corpus, dictionary)

# Bare last expression so the notebook displays the prepared object
vis

  and should_run_async(code)


TypeError: import_optional_dependency() got an unexpected keyword argument 'errors'

In [30]:
! pip install pandas



  and should_run_async(code)


