In [3]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.matutils import corpus2csc

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
df = pd.read_csv("quora_questions.csv")
data = df.sample(n=1000, axis=0)
data = data['Question']

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


In [5]:

def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    import re
    special_chars = r'[,.:;?\(\'"\s]'
    words = [re.sub(special_chars, '', word) for word in words]
    return words


data = data.apply(preprocess)
dictionary = Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=0.5)
bow_corpus = [dictionary.doc2bow(text) for text in data]

In [6]:
num_topics = 5
lsamodel = LsiModel(bow_corpus, num_topics=num_topics, id2word=dictionary)

topics = lsamodel.show_topics(num_topics=num_topics, num_words=10)
top_topics = []
for topic in topics:
    top_topics.append(topic[1])

print("Top 5 LSA Topics:")
for i, topic in enumerate(top_topics, start=1):
    print(f"Topic {i} : {topic}")

Top 5 LSA Topics:
Topic 1 : 0.880*"best" + 0.225*"way" + 0.188*"s" + 0.116*"india" + 0.109*")" + 0.102*"get" + 0.102*"friend" + 0.091*"time" + 0.071*"movie" + 0.066*"good"
Topic 2 : -0.769*"s" + 0.305*"best" + -0.200*"like" + -0.170*"would" + -0.160*"note" + -0.138*"nt" + -0.123*"1000" + -0.122*"india" + -0.119*")" + -0.113*"500"
Topic 3 : 0.619*")" + 0.361*"note" + -0.360*"s" + 0.336*"r" + 0.251*"1000" + 0.232*"500" + 0.143*"banning" + 0.126*"indian" + 0.098*"money" + 0.096*"system"
Topic 4 : -0.674*")" + 0.361*"note" + 0.321*"r" + 0.271*"1000" + 0.252*"500" + 0.151*"banning" + 0.122*"indian" + -0.119*"like" + 0.115*"money" + -0.111*"system"
Topic 5 : 0.887*"get" + -0.183*"s" + 0.160*"job" + 0.155*"good" + -0.110*"best" + 0.110*"people" + 0.096*"better" + -0.090*"would" + 0.089*"home" + -0.083*")"
