In [2]:
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.matutils import corpus2csc

# Load data
df = pd.read_csv("quora_questions.csv")
data = df.sample(n=1000, axis=0)
data = data['Question']

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize NLTK components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Function to preprocess text
def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    import re
    special_chars = r'[,.:;?\(\'"\s]'
    words = [re.sub(special_chars, '', word) for word in words]
    return words

# Apply preprocessing to the dataset
data = data.apply(preprocess)

# Create a Gensim Dictionary
dictionary = Dictionary(data)

# Filter out words that appear in fewer than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create bag-of-words representation
bow_corpus = [dictionary.doc2bow(text) for text in data]

# Train the LSA model
num_topics = 5
lsamodel = LsiModel(bow_corpus, num_topics=num_topics, id2word=dictionary)

# Get the topics
topics = lsamodel.show_topics(num_topics=num_topics, num_words=10)

# Extract top 5 topics
top_topics = []
for topic in topics:
    top_topics.append(topic[1])

# Print top 5 topics
print("Top 5 LSA Topics:")
for i, topic in enumerate(top_topics, start=1):
    print("Topic {}: {}".format(i, topic))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 5 LSA Topics:
Topic 1: -0.918*"best" + -0.226*"way" + -0.174*"india" + -0.124*"s" + -0.062*"place" + -0.059*"get" + -0.058*")" + -0.054*"ever" + -0.043*"company" + -0.043*"good"
Topic 2: 0.801*"s" + 0.315*"get" + -0.222*"best" + 0.166*"india" + 0.163*"people" + 0.120*"``" + 0.094*"good" + 0.084*"one" + 0.082*"quora" + 0.079*"know"
Topic 3: 0.854*"get" + -0.401*"s" + 0.125*"job" + 0.115*"good" + 0.102*"one" + 0.092*"year" + 0.081*"rid" + 0.081*")" + 0.064*"5" + 0.050*"girl"
Topic 4: -0.427*"india" + 0.420*"people" + 0.255*"quora" + 0.252*"way" + 0.217*"know" + 0.205*")" + 0.191*"question" + -0.189*"s" + -0.166*"get" + 0.165*"good"
Topic 5: 0.571*"india" + -0.456*"way" + 0.301*"people" + -0.227*"good" + 0.214*"know" + -0.204*"s" + 0.192*"quora" + 0.157*"question" + -0.117*"get" + 0.110*"ask"
