In [1]:
import pandas as pd

# <center>Latent Semantic Analysis using SVD</center>

___

Latent Semantic Analysis (LSA) is a technique in Natural Language Processing for analyzing relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms.

LSA finds this relationship by using a mathematical technique called Singular Value Decomposition (SVD), which decomposes the document-term matrix into three matrices.

- The input A is an m*n matrix, where m is the number of documents and n is the number of terms. A will be decomposed into 3 matrices as below -

$$A = U \Sigma V^T$$

- Matrix U will be an m*k matrix. The rows will be the documents and the columns will be concepts (or topics)
- Matrix Sigma will be a k*k diagonal matrix. The elements will be the amount of variation captured from each concept
- Matrix V will be an n*k matrix (so its transpose V^T is k*n). The rows will be the terms and columns will be the concepts

SVD also reduces the dimensions significantly, as the new concept space defined by the singular vectors is in sorted order, with the first dimension defining the strongest concept dimension.

Just like PCA, we need not use the entire matrix and can pick first k values which will define majority of the relationship between terms and documents.

This is why it's called `Reduced SVD` or `Truncated SVD`.

Now, let's see how to implement this.

In [None]:
# Toy corpus of six short documents: two about ISRO space missions (d1, d2),
# two about COVID-19 deaths in Florida (d3, d4) and two about AI / machine
# learning (d5, d6). The long literals are split with implicit string
# concatenation; the resulting text is unchanged.
d1 = (
    "The Indian Space Research Organisation (Isro) is planning to launch "
    "Chandrayaan 3 during the first half of 2021. The previous one crash "
    "landed on Moon’s surface in September 2019.The mission has been "
    "configured based on the lessons learnt from Chadrayaan 2."
)

d2 = (
    "The launch of the Mars Orbiter Mission (MOM) also known as the "
    "Mangalyaan mission was a daring effort by the Indian Space Research "
    "Organisation (ISRO) to attempt an inter-planetary journey."
)

d3 = (
    "On Friday health authorities in Florida said two COVID-19 patients in "
    "the state have died, in what are believed to be the first deaths "
    "linked to the disease in florida"
)

d4 = (
    "Two people in Florida who tested positive for the coronavirus have "
    "died, becoming the first known fatalities outside California and "
    "Washington state, health officials reported Friday."
)

d5 = (
    "Artificial intelligence is the simulation of human intelligence "
    "processes by machines, especially computer systems."
)

d6 = (
    "Machine learning is provides systems the ability to automatically "
    "learn and improve from experience without being explicitly programmed"
)

# Documents in a fixed order; row i of every downstream matrix refers to d_all[i].
d_all = [d1, d2, d3, d4, d5, d6]

In [None]:
# Display the corpus list to confirm that all six documents are present.
d_all

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build features from word bigrams and trigrams only (min n = 2, max n = 3),
# so single words never become columns of the TF-IDF matrix.
ngram_bounds = (2, 3)
vectorizer = TfidfVectorizer(ngram_range=ngram_bounds)

In [None]:
# Create a TFIDF matrix by applying fit() and transform() of the TfidfVectorizer()
# fit() learns the n-gram vocabulary and the IDF weights from the corpus and
# returns the fitted vectorizer itself.
tfidf1 = vectorizer.fit(d_all)
# transform() turns each document into one sparse row of TF-IDF weights over
# that vocabulary (documents x terms).
tfidf = tfidf1.transform(d_all)

In [None]:
# Show the document-term TF-IDF matrix as a table: one row per document, one
# column per n-gram. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (scikit-learn >= 1.0) is its replacement.
pd.DataFrame(tfidf.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of latent concepts (k) to keep from the decomposition.
components = 3

# TruncatedSVD uses a randomized solver by default, so pin random_state to make
# the decomposition reproducible across kernel restarts.
lsa = TruncatedSVD(n_components=components, random_state=42)

# Fit on the TF-IDF matrix and project the documents into concept space in one
# step; the result has one row per document and one column per concept.
lsa_data = lsa.fit_transform(tfidf)

In [None]:
# Print the U and Sigma matrices


In [None]:
# Vocabulary n-grams in TF-IDF column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() returns an ndarray, so convert it
# to keep `terms` a plain Python list as before.
terms = vectorizer.get_feature_names_out().tolist()

In [None]:
# Number of n-gram terms in the vocabulary (the number of TF-IDF columns).
len(terms)

In [None]:
# Show the vocabulary terms themselves.
terms

In [None]:
# The variance of the training samples transformed by a projection to each component
# (one value per retained concept).
print(lsa.explained_variance_)

In [None]:
# Share of the total variance explained by each of the selected components,
# reported as a ratio rather than a percentage.
print(lsa.explained_variance_ratio_)

In [None]:
# The singular values corresponding to each of the selected components.


In [None]:
# Compare shapes, one per line: the TF-IDF matrix first, then the fitted
# components_ matrix.
print(tfidf.shape, lsa.components_.shape, sep="\n")

In [None]:
# Printing the V Transpose matrix


In [None]:
# For every concept, list the 15 terms with the largest weights in that
# concept's row of components_.
for concept_idx, weights in enumerate(lsa.components_):
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    print(f"Concept {concept_idx}:")
    for name, _weight in ranked[:15]:
        print(name)
    print(" ")

In [None]:
# Concept-by-term table built from components_: one row per concept, one
# column per n-gram term.
concepts = [f"concept{i}" for i in range(components)]

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement.
concept_word = pd.DataFrame(
    lsa.components_,
    columns=vectorizer.get_feature_names_out(),
    index=concepts,
)
concept_word

In [None]:
# Documents projected into concept space: one row per document, one column per concept.
lsa_data

In [None]:
# Label the document-concept scores: each row is keyed by the full document
# text and each column by its concept name.
doc_concept = pd.DataFrame(data=lsa_data, columns=concepts, index=d_all)
doc_concept

In [None]:
# Unseen sentences used to check which learned concept each one maps to.
test = [
    "Artificial Intelligence will play very important role in future",
    "indias future space exploration plans depends on the success of Chadrayaan 3",
]

In [None]:
# Step1 : Convert the test documents to TFIDF


In [None]:
# Step2 : Apply truncated SVD


In [None]:
# Executing Step1 and Step2 together and finding the concept from the text.
import numpy as np

# Step 1: reuse the already-fitted vectorizer (transform only, no refit) so the
# test sentences are encoded against the training vocabulary.
tfidf_test = vectorizer.transform(test)

# Step 2: project the TF-IDF rows into the learned concept space.
lsa_test = lsa.transform(tfidf_test)

# The predicted concept is the index of the largest score in each row.
print("1st sentence:", np.argmax(lsa_test[0]))
print("2nd sentence:", np.argmax(lsa_test[1]))