## Practical 5: Implementing LSA (Latent Semantic Analysis) and Topic Modelling

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
# If nltk stop word is not downloaded
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shubh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the first 50 rows of the reviews file; `data` keeps the raw frame untouched.
data = pd.read_csv('TAJReviews.csv', nrows=50)

# Build the working frame as a new object (drop returns a copy) instead of
# re-wrapping `data` in pd.DataFrame, which was redundant for an existing frame.
df = data.drop(columns=['UserName', 'Rating', 'Date'])

# Keep the original review text under `documents`; later cells clean this column.
df['documents'] = df['Reviews']

# Preview only the first rows rather than dumping the whole frame.
df.head(10)

Unnamed: 0,Reviews,documents
0,World's Largest White Marble Structure - Very ...,World's Largest White Marble Structure - Very ...
1,Really a beautiful and mesmerizing place to sp...,Really a beautiful and mesmerizing place to sp...
2,One of the most beautiful place I have ever se...,One of the most beautiful place I have ever se...
3,It was amazing and awesome Experience. I visit...,It was amazing and awesome Experience. I visit...
4,One of the most popular place all over the wor...,One of the most popular place all over the wor...
5,What to say... Its one of the seven wonders of...,What to say... Its one of the seven wonders of...
6,"Absolutely stunning !! Wonder of the world , a...","Absolutely stunning !! Wonder of the world , a..."
7,This is the place which reminds you of love of...,This is the place which reminds you of love of...
8,World's most beautiful wonder. This is one of ...,World's most beautiful wonder. This is one of ...
9,Beautiful as ever. Try to reach the place by 7...,Beautiful as ever. Try to reach the place by 7...


In [15]:
# Small hand-written corpus for sanity-checking the pipeline on a tiny example.
# It is stored under its own name so that running the notebook top to bottom
# does not overwrite the review frame `df` that the following cells rely on.
# To run the pipeline on this corpus instead, assign `df = toy_df` explicitly.
toy_documents = [
    "He is a good dog.",
    "The dog is too lazy.",
    "That is a brown cat.",
    "The cat is very active.",
    "I have brown cat and dog.",
]

toy_df = pd.DataFrame({"documents": toy_documents})
toy_df.head()


Unnamed: 0,documents
0,He is a good dog.
1,The dog is too lazy.
2,That is a brown cat.
3,The cat is very active.
4,I have brown cat and dog.


In [4]:
# Preprocessing: build `clean_documents` from the raw `documents` column.
#   1. Replace every character that is not a letter or '#' with a space.
#      regex=True is required: from pandas 2.0 the default is a literal
#      (non-regex) replacement, which would leave the text unchanged.
#   2. Turn missing values into empty strings so the string steps below are safe.
#   3. Drop tokens of one or two characters.
#   4. Lower-case the result.
df['clean_documents'] = (
    df['documents']
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .fillna('')
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 2))
    .str.lower()
)

df.head()

Unnamed: 0,Reviews,documents,clean_documents
0,World's Largest White Marble Structure - Very ...,World's Largest White Marble Structure - Very ...,world largest white marble structure very hot ...
1,Really a beautiful and mesmerizing place to sp...,Really a beautiful and mesmerizing place to sp...,really beautiful and mesmerizing place spend y...
2,One of the most beautiful place I have ever se...,One of the most beautiful place I have ever se...,one the most beautiful place have ever seen li...
3,It was amazing and awesome Experience. I visit...,It was amazing and awesome Experience. I visit...,was amazing and awesome experience visited thi...
4,One of the most popular place all over the wor...,One of the most popular place all over the wor...,one the most popular place all over the world ...


In [5]:
from nltk.corpus import stopwords

# Fetch the English stop-word list once and keep it as a set. The original
# called stopwords.words('english') for every single token, re-building the
# list each time and scanning it linearly.
english_stopwords = set(stopwords.words('english'))

# tokenization
tokenized_doc = df['clean_documents'].fillna('').apply(lambda text: text.split())

# remove stop-words
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in english_stopwords]
)

# de-tokenization: iterate over the values directly, so this does not depend
# on `df` having a default 0..n-1 index (label lookup by position would fail).
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['clean_documents'] = detokenized_doc


In [6]:
# Preview the frame after stop-word removal (first five rows).
df.head()

Unnamed: 0,Reviews,documents,clean_documents
0,World's Largest White Marble Structure - Very ...,World's Largest White Marble Structure - Very ...,world largest white marble structure hot summe...
1,Really a beautiful and mesmerizing place to sp...,Really a beautiful and mesmerizing place to sp...,really beautiful mesmerizing place spend time ...
2,One of the most beautiful place I have ever se...,One of the most beautiful place I have ever se...,one beautiful place ever seen life place peace...
3,It was amazing and awesome Experience. I visit...,It was amazing and awesome Experience. I visit...,amazing awesome experience visited place revis...
4,One of the most popular place all over the wor...,One of the most popular place all over the wor...,one popular place world great infrastructure n...


In [7]:
# Build the TF-IDF document-term matrix from the cleaned documents.
# The vectorizer applies scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(
    smooth_idf=True,
    stop_words='english',
)
X = vectorizer.fit_transform(df['clean_documents'])

# Dense view of the sparse matrix, for display only.
X.toarray()


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.24072601, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.18125774, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
# Shape of the TF-IDF matrix: (number of documents, number of vocabulary terms).
X.shape 
# NOTE(review): the original note here read "A56   U(5,5). S()" - presumably a
# sketch of the SVD factor shapes for the 5-document toy corpus; TODO confirm
# the intended dimensions and spell them out, or remove this note.

(50, 461)

In [9]:
# Truncated SVD projects the TF-IDF matrix onto two latent components
# ("topics"); the fixed random_state keeps the randomized solver repeatable.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

# One row per document, one column per component.
lsa = svd_model.fit_transform(X)


In [10]:
# Document-topic matrix: one row per document, one column per SVD component.
pd.options.display.float_format = '{:,.16f}'.format

# Derive the topic labels from the SVD output instead of hard-coding two names,
# so this cell keeps working if the number of components is changed.
topic_columns = [f"topic_{i}" for i in range(1, lsa.shape[1] + 1)]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)

# Attach the cleaned text by position (rows of `lsa` follow the row order of
# `df`); index alignment would yield NaN if `df` had a non-default index.
topic_encoded_df["documents"] = df['clean_documents'].to_numpy()

display(topic_encoded_df[["documents", *topic_columns]])


Unnamed: 0,documents,topic_1,topic_2
0,world largest white marble structure hot summe...,0.0984977464738135,0.0526143726912174
1,really beautiful mesmerizing place spend time ...,0.3694760415112652,-0.3165113696422552
2,one beautiful place ever seen life place peace...,0.368016174938392,0.1211011774731417
3,amazing awesome experience visited place revis...,0.4349192151433532,0.553267398811314
4,one popular place world great infrastructure n...,0.195387799495936,0.183936504029401
5,say one seven wonders world dont expect modern...,0.1113550263176955,0.0942229845888549
6,absolutely stunning wonder world came view fir...,0.3009591085818484,-0.10966997354232
7,place reminds love royal couple love birds vis...,0.3106908343722294,0.0049914789511873
8,world beautiful wonder one best place world,0.4391434953840064,0.1853233466000128
9,beautiful ever try reach place less crowd open...,0.1181628235559774,-0.0100785633950605


In [11]:
# Vocabulary terms used as features, in TF-IDF column order.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on old versions that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    # .tolist() converts the returned array into a plain list of Python strings,
    # matching the list type the old method returned.
    dictionary = vectorizer.get_feature_names_out().tolist()
else:
    dictionary = vectorizer.get_feature_names()

In [12]:
# Display the vocabulary (one entry per TF-IDF column).
dictionary

['absolute',
 'absolutely',
 'admired',
 'affair',
 'ago',
 'agra',
 'ahead',
 'amazing',
 'ancient',
 'architechture',
 'architectural',
 'architecture',
 'area',
 'arrangements',
 'art',
 'asked',
 'astonishing',
 'atmosphere',
 'attractive',
 'authorities',
 'available',
 'avoid',
 'aware',
 'away',
 'awe',
 'awesome',
 'baffling',
 'bank',
 'bargain',
 'beautified',
 'beautiful',
 'beauty',
 'beautyful',
 'behalf',
 'best',
 'better',
 'big',
 'birds',
 'birth',
 'bit',
 'booked',
 'booking',
 'bother',
 'brain',
 'breath',
 'breathe',
 'build',
 'building',
 'built',
 'busy',
 'buy',
 'calm',
 'came',
 'carts',
 'carving',
 'challenging',
 'charge',
 'charged',
 'charm',
 'chipmunks',
 'clean',
 'clearly',
 'cliche',
 'clicking',
 'close',
 'closed',
 'college',
 'come',
 'comes',
 'common',
 'complete',
 'complex',
 'compund',
 'computerized',
 'consider',
 'constructed',
 'construction',
 'contributed',
 'cool',
 'country',
 'couple',
 'cover',
 'covers',
 'covid',
 'critics',
 

In [13]:
# Term-topic matrix: one row per vocabulary term, one column per topic.
# components_ is stored as (topics x terms), so it is transposed to put
# terms on the rows.
encoding_matrix = pd.DataFrame(
    svd_model.components_.T,
    index=dictionary,
    columns=["topic_1", "topic_2"],
)


In [14]:
# Display the term-topic weights (rows: terms, columns: topics).
encoding_matrix

Unnamed: 0,topic_1,topic_2
absolute,0.0072574754223889,-0.0034523394401984
absolutely,0.0348409969895462,-0.0057520631281202
admired,0.0053020209932569,-0.0030475032161916
affair,0.0046606527209450,0.0018785909774744
ago,0.0155624169569769,0.0015846250403109
...,...,...
worn,0.0806516893095956,-0.1065995697753382
yamuna,0.0020874605986441,0.0024064556984060
year,0.0257251456086991,0.0118677084697467
years,0.0278068919490738,0.0065585105628482
