# Latent Semantic Analysis

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option("display.max_colwidth", 200)

In [8]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [9]:
news_df = pd.DataFrame({'document':documents})

In [10]:
news_df.head()

Unnamed: 0,document
0,Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe...
1,"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism? No, you need a little leap of faith, Jimmy. Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n..."
2,"Although I realize that principle is not one of your strongest\npoints, I would still like to know why do do not ask any question\nof this sort about the Arab countries.\n\n If you want to conti..."
3,"Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it? ATT's last product in this area (a) was priced over\n$1000, as I suspect 'clipper' phones will be; (b) ca..."
4,"Well, I will have to change the scoring on my playoff pool. Unfortunately\nI don't have time right now, but I will certainly post the new scoring\nrules by tomorrow. Does it matter? No, you'll ..."


In [11]:
# removing everything except alphabets`
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z#]", " ")

# removing short words
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# make all text lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

In [12]:
#Remove stopwords

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# tokenization
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())

# remove stop-words
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [13]:
news_df = news_df.head()
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe...,well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism? No, you need a little leap of faith, Jimmy. Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n...",yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgrou...
2,"Although I realize that principle is not one of your strongest\npoints, I would still like to know why do do not ask any question\nof this sort about the Arab countries.\n\n If you want to conti...",although realize principle strongest points would still like know question sort arab countries want continue think tank charade fixation israel must stop might start asking sort questions arab cou...
3,"Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it? ATT's last product in this area (a) was priced over\n$1000, as I suspect 'clipper' phones will be; (b) ca...",notwithstanding legitimate fuss proposal much change last product area priced suspect clipper phones came customer automatically preregistered government authorities thus aside attempting legitimi...
4,"Well, I will have to change the scoring on my playoff pool. Unfortunately\nI don't have time right now, but I will certainly post the new scoring\nrules by tomorrow. Does it matter? No, you'll ...",well change scoring playoff pool unfortunately time right certainly post scoring rules tomorrow matter enter anyway good keith keller rangers quakers kkeller mail upenn league champs


In [17]:
#create tf-idf matrix using sklearn

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 10, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True)

'''max_df = float in range [0.0, 1.0] or int (default=1.0)
When building the vocabulary ignore terms that have a document
frequency strictly higher than the given threshold (corpus-specific stop words).
If float, the parameter represents a proportion of documents, integer absolute counts.
This parameter is ignored if vocabulary is not None.
'''

X = vectorizer.fit_transform(news_df['clean_doc'])

X.shape # check shape of the document-term matrix

x = pd.DataFrame(X)
x.tail()

Unnamed: 0,0
0,"(0, 9)\t0.1939619103137807\n (0, 5)\t0.1939619103137807\n (0, 4)\t0.9616431535111439"
1,"(0, 8)\t0.3741047724501572\n (0, 6)\t0.9273864454638184"
2,"(0, 3)\t0.48096754724600776\n (0, 2)\t0.32064503149733853\n (0, 1)\t0.48096754724600776\n (0, 0)\t0.48096754724600776\n (0, 7)\t0.32064503149733853\n (0, 8)\t0.12934719623338004\n (0, 9)\t..."
3,
4,


In [23]:
# get the first vector out (for the first document)
first_vector_tfidfvectorizer=X[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
media,0.961643
realize,0.193962
think,0.193962
arab,0.0
countries,0.0
fixation,0.0
israel,0.0
sorry,0.0
sort,0.0
start,0.0


In [50]:
X.shape

(5, 189)

In [25]:
from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=5, algorithm='randomized', n_iter=100, random_state=122)

svd_model.fit(X)

len(svd_model.components_)


5

In [26]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])
        print(" ")


Topic 0: 
media
 
sorry
 
countries
 
israel
 
arab
 
realize
 
start
 
Topic 1: 
sorry
 
start
 
fixation
 
sort
 
countries
 
israel
 
arab
 
Topic 2: 
countries
 
arab
 
israel
 
fixation
 
sort
 
realize
 
think
 
Topic 3: 
fixation
 
countries
 
media
 
sorry
 
start
 
sort
 
israel
 
Topic 4: 
israel
 
media
 
sorry
 
start
 
think
 
realize
 
sort
 


In [33]:
import umap

X_topics = svd_model.fit_transform(X)
embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)

plt.figure(figsize=(7,5))
plt.scatter(embedding[:, 0], embedding[:, 1], 
c = dataset.target,
s = 10, # size
edgecolor='none'
)
plt.show()


AttributeError: module 'umap' has no attribute 'UMAP'