# Latent Semantic Analysis
Latent Semantic Analysis (LSA) is a technique for analysing the relationships between a set of documents and the terms they contain by producing a set of concepts related to the documents and terms. (Source: Wikipedia)

<br>
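SVD (Singular Value Decomposition) is the matrix factorisation that LSA applies to the document-term matrix to extract these concepts.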


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import numpy as np
import nltk

### Sample Data

In [2]:
# Toy corpus: seven short sentences, one document per list entry.
# NOTE: the first sentence spells "polution" with a single "l" while the last
# one uses "pollution"; the vocabulary printed later in this notebook lists them
# as two separate terms, so the strings are kept exactly as written.
dataset = [
    "The amount of polution is increasing day by day",
    "The concert was just great",
    "I love to see Gordon Ramsay cook",
    "Google is introducing a new technology",
    "AI Robots are examples of great technology present today",
    "All of us were singing in the concert",
    "We have launch campaigns to stop pollution and global warming",
]

In [3]:
# Normalise case so the sentence tokens can be compared directly with the
# lower-case vocabulary terms used when scoring sentences later on.
dataset = list(map(str.lower, dataset))
dataset

['the amount of polution is increasing day by day',
 'the concert was just great',
 'i love to see gordon ramsay cook',
 'google is introducing a new technology',
 'ai robots are examples of great technology present today',
 'all of us were singing in the concert',
 'we have launch campaigns to stop pollution and global warming']

In [4]:
# Fit a TF-IDF vectorizer on the sentences and build the document-term matrix X
# (one row per sentence, one column per vocabulary term).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(dataset)
# Show the matrix summary; the recorded output reports a 7x42 sparse matrix.
X

<7x42 sparse matrix of type '<class 'numpy.float64'>'
	with 51 stored elements in Compressed Sparse Row format>

In [5]:
# Print the stored (row, column) -> weight entries of the first sentence's row.
print(X[0])

  (0, 5)	0.3211483974289089
  (0, 9)	0.6422967948578178
  (0, 17)	0.3211483974289089
  (0, 19)	0.2665807498646048
  (0, 26)	0.3211483974289089
  (0, 24)	0.2278643877752444
  (0, 2)	0.3211483974289089
  (0, 34)	0.2278643877752444


In [6]:
# Reduce the TF-IDF matrix to 4 latent components ("concepts") with truncated SVD.
# TruncatedSVD's default solver is randomized, so a fixed random_state is passed
# to make the fitted components reproducible under Restart Kernel -> Run All.
# n_iter=100 is far above the library default; on a matrix this small it is cheap.
lsa = TruncatedSVD(n_components=4, n_iter=100, random_state=42)
lsa.fit(X)

In [7]:
# Shape of the component matrix: one row per concept (n_components=4) and one
# column per vocabulary term.
lsa.components_.shape

(4, 42)

In [8]:
# A single concept is a 1-D vector holding one weight per vocabulary term.
lsa.components_[0].shape

(42,)

In [11]:
# Vocabulary terms; they are paired positionally with each component's weights.
terms = vectorizer.get_feature_names_out()
print('terms',terms, sep=": ")

# For every SVD component keep the 10 terms with the largest (signed) weights,
# stored as (term, weight) pairs under the key 'Concept <index>'.
concept_words = {}
for concept_idx, weights in enumerate(lsa.components_):
    ranked = sorted(zip(terms, weights), key=lambda pair: pair[1], reverse=True)
    concept_words['Concept '+str(concept_idx)] = ranked[:10]


terms: ['ai' 'all' 'amount' 'and' 'are' 'by' 'campaigns' 'concert' 'cook' 'day'
 'examples' 'global' 'google' 'gordon' 'great' 'have' 'in' 'increasing'
 'introducing' 'is' 'just' 'launch' 'love' 'new' 'of' 'pollution'
 'polution' 'present' 'ramsay' 'robots' 'see' 'singing' 'stop'
 'technology' 'the' 'to' 'today' 'us' 'warming' 'was' 'we' 'were']


In [15]:
# Score every sentence against every concept: a sentence's score is the sum of
# the concept weights of those of its tokens that are among the concept's top
# terms. Scores are printed per concept, one line per sentence in dataset order.
for concept_name, top_terms in concept_words.items():
    weight_of = dict(top_terms)  # term -> weight lookup for this concept
    sentence_scores = []
    for sentence in dataset:
        tokens = nltk.word_tokenize(sentence)
        # sum() starts from the int 0, so a sentence without any matching
        # token keeps the plain integer score 0.
        sentence_scores.append(
            sum(weight_of[tok] for tok in tokens if tok in weight_of)
        )
    print("\n"+concept_name+":")
    for value in sentence_scores:
        print(value)


Concept 0:
1.1297395470753955
1.4959427190164012
0
0.1838383456741348
0.7797604325216755
1.373365598990949
0

Concept 1:
0
0
1.833746733642543
0
0
0
1.2850142324187055

Concept 2:
0.6242100916830957
0
0
1.7440703383075622
0.8334337554863578
0
0

Concept 3:
2.2015937554478837
0.12724213180694421
0
0.21264455202449947
0
0.29658207438874035
0
