<a href="https://colab.research.google.com/github/TITHI007/NLP/blob/main/NLP_Practical_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Implement Latent Semantic Analysis (LSA) and a topic model**

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Three toy documents that make up the whole corpus.
a1 = "NLP"
a2 = "NLP is natural language processing."
a3 = "NLP is about analysis of text"

# Build the frame in one step: a single "documents" column, one row per text.
df = pd.DataFrame({"documents": [a1, a2, a3]})
df.head()

Unnamed: 0,documents
0,NLP
1,NLP is natural language processing.
2,NLP is about analysis of text


In [3]:
# Normalise the raw text into df['clean_documents']:
#   1. replace every character that is not an ASCII letter or '#' with a space,
#   2. turn missing values into empty strings,
#   3. drop words of one or two characters,
#   4. lower-case what is left.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would leave punctuation untouched.
df['clean_documents'] = (
    df['documents']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .fillna('')
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 2).lower())
)

df.head()

Unnamed: 0,documents,clean_documents
0,NLP,nlp
1,NLP is natural language processing.,nlp natural language processing
2,NLP is about analysis of text,nlp about analysis text


In [4]:
# The import and download lines below repeat what the first cell of the
# notebook already does (import nltk, fetch the 'stopwords' corpus, import
# the stopwords reader).
import nltk 
nltk.download('stopwords') 
from nltk.corpus import stopwords 
# English stop-word list; the next cell filters tokens against it.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Remove English stop words from the cleaned documents.
# A set gives constant-time membership tests; stop_words itself is left untouched.
stop_word_set = set(stop_words)

# One token list per document, with stop words already filtered out.
tokenized_doc = df['clean_documents'].fillna('').apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

# Re-join the tokens. Iterating the Series walks its values in row order, so this
# does not depend on df having a default 0..n-1 index (label lookups such as
# tokenized_doc[i] would fail or pick the wrong row otherwise).
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

# Assigning a plain list writes the values back positionally.
df['clean_documents'] = detokenized_doc

In [6]:
# Preview the frame after stop-word removal.
df.head()

Unnamed: 0,documents,clean_documents
0,NLP,nlp
1,NLP is natural language processing.,nlp natural language processing
2,NLP is about analysis of text,nlp analysis text


In [7]:
# Turn the cleaned documents into a TF-IDF weighted document-term matrix.
vectorizer = TfidfVectorizer(
    smooth_idf=True,
    stop_words='english',
)
X = vectorizer.fit_transform(df['clean_documents'])

# Dense copy for display only; X itself is left unchanged.
X.toarray()

array([[0.        , 0.        , 0.        , 1.        , 0.        ,
        0.        ],
       [0.        , 0.54645401, 0.54645401, 0.32274454, 0.54645401,
        0.        ],
       [0.65249088, 0.        , 0.        , 0.38537163, 0.        ,
        0.65249088]])

In [8]:
# Dimensions of the TF-IDF matrix: (documents, vocabulary terms).
X.shape

(3, 6)

In [9]:
# Two-component truncated SVD of the TF-IDF matrix; random_state is pinned so
# the randomized solver gives the same result on every run.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

# Document coordinates in the reduced space: one row per document,
# one column per component.
lsa = svd_model.fit_transform(X)

In [10]:
# NOTE: this sets pandas' float display format globally, so every frame shown
# after this cell is also rendered with 16 decimals.
pd.options.display.float_format = '{:,.16f}'.format

# Name one column per LSA component instead of hard-coding two names, so the
# cell keeps working if n_components is changed.
topic_columns = [f"topic_{k + 1}" for k in range(lsa.shape[1])]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)

# Rows of `lsa` follow the row order of df, so attach the texts positionally
# (a plain list) rather than by index label.
topic_encoded_df["documents"] = df['clean_documents'].tolist()

# Show the text first, then its topic coordinates.
display(topic_encoded_df[["documents", *topic_columns]])

Unnamed: 0,documents,topic_1,topic_2
0,nlp,0.8296366497864537,-0.0460873698072866
1,nlp natural language processing,0.6246263452107035,0.7245619353704799
2,nlp analysis text,0.6996697108711261,-0.5922004283549573


In [12]:
# Vocabulary learned by the TF-IDF vectorizer, in column order.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns an array, converted here to a plain list.
dictionary = vectorizer.get_feature_names_out().tolist()

# components_ holds one row per topic and one column per term; label both axes
# and transpose so that terms are rows and topics are columns.
encoding_matrix = pd.DataFrame(
    svd_model.components_,
    index=[f"topic_{k + 1}" for k in range(svd_model.components_.shape[0])],
    columns=dictionary,
).T



In [13]:
# Show the term-by-topic weights built in the previous cell
# (rows are vocabulary terms, columns are topics).
encoding_matrix

Unnamed: 0,topic_1,topic_2
analysis,0.2911544773694453,-0.4401897997976114
language,0.2176856831508516,0.4510513033098431
natural,0.2176856831508516,0.451051303309843
nlp,0.8296366497864537,-0.0460873698072866
processing,0.2176856831508516,0.451051303309843
text,0.2911544773694455,-0.4401897997976114
