In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

import tokenizer as tk

In [2]:
# Path of the reviews CSV. The original absolute path is kept as the default so
# existing setups keep working; set the YELP_REVIEWS_CSV environment variable to
# point the notebook at the file on another machine.
DATA_PATH = os.environ.get(
    'YELP_REVIEWS_CSV',
    r'C:\Projects\PythonProjects\TopicModelling\Datasets\yelp-3star-complete.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Pull the 'Review' column out of the frame and run every entry through the
# project's text preprocessor.
# NOTE(review): the results are passed straight to TfidfVectorizer below, so
# preprocess_text presumably returns a string per review — confirm in tokenizer.py.
reviews = df['Review'].values
tokenized_reviews = list(map(tk.preprocess_text, reviews))

In [4]:
# Turn the preprocessed reviews into a sparse document-term matrix.
# NOTE(review): LatentDirichletAllocation is normally fit on raw term counts
# (CountVectorizer); feeding it TF-IDF weights is worth confirming as intentional.
vectorizer_tf = TfidfVectorizer(
    max_features=10000,  # cap on vocabulary size
    min_df=50,           # ignore terms found in fewer than 50 documents
    max_df=0.90,         # ignore terms found in more than 90% of documents
)
tf_vectors = vectorizer_tf.fit_transform(tokenized_reviews)

In [7]:
# Configure (but do not yet fit) a 3-topic LDA model trained with online
# (mini-batch) variational Bayes.
lda = decomposition.LatentDirichletAllocation(
    n_components=3,            # number of topics
    learning_method='online',
    batch_size=2000,           # documents per mini-batch update
    max_iter=25,
    random_state=42,           # fixed seed so the topics are reproducible
)

In [8]:
# Fit the LDA model on the document-term matrix; W1 is the value returned by
# fit_transform and is used below as the per-document topic weights
# (one row per document, one column per topic).
W1 = lda.fit_transform(tf_vectors)

In [9]:
# Fitted topic-term weights; each row is iterated below as one topic and its
# columns are indexed against the vectorizer's vocabulary.
H1 = lda.components_

In [11]:
num_words = 20
vocab = np.array(vectorizer_tf.get_feature_names_out())


def top_words(t):
    """Return the `num_words` vocabulary terms with the largest weights in `t`.

    `t` is one row of the topic-term matrix; terms are returned from the
    highest weight to the lowest.
    """
    descending = np.argsort(t)[::-1]
    return [vocab[idx] for idx in descending[:num_words]]


# One list of top terms per topic, then a space-joined string for display.
topic_words = [top_words(row) for row in H1]
topics = [' '.join(words) for words in topic_words]

In [12]:
# Display the top-term string for each topic.
topics

['food good go place service time get wait drink bar table order great us come take one beer like back',
 'good fry burger order chicken cheese get sandwich like food taco place try great steak taste come go really would',
 'pizza good food sushi place roll order chicken like dish price get go sauce try taste restaurant service salad soup']

In [13]:
# Build a document x topic table of the LDA weights (rounded to 2 decimals for
# display) and tag each document with its highest-weight topic.
colnames = [f"Topic{i}" for i in range(lda.n_components)]
docnames = [f"Doc{i}" for i in range(len(tokenized_reviews))]
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=colnames, index=docnames)

# Take the argmax over the *unrounded* weights: rounding can create artificial
# ties (e.g. 0.326 and 0.334 both become 0.33), and argmax would then return the
# first tied column rather than the truly dominant topic.
significant_topic = np.argmax(W1, axis=1)
df_doc_topic['dominant_topic'] = significant_topic

In [46]:
# Preview the first 11 documents with their topic weights and dominant topic.
df_doc_topic.head(11)

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.89,0.06,0.06,0
Doc1,0.1,0.76,0.14,1
Doc2,0.12,0.31,0.57,2
Doc3,0.8,0.1,0.11,0
Doc4,0.12,0.82,0.06,1
Doc5,0.06,0.88,0.06,1
Doc6,0.55,0.4,0.05,0
Doc7,0.82,0.08,0.1,0
Doc8,0.09,0.06,0.84,2
Doc9,0.09,0.09,0.82,2
