In [1]:
import pandas as pd
# Read the review dataset from Drive and discard every row that contains a missing value.
df = pd.read_csv('/content/drive/MyDrive/final_project/review_data.csv').dropna()

# Topic Modeling For Clusters using LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
import pickle

# Restore the pickled clustering pipeline from Drive.
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
with open('/content/drive/MyDrive/final_project/clustering_pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

# Step 1: Predict clusters

# Store the pipeline's prediction for each `text_` value in a new `Cluster` column.
df['Cluster'] = pipeline.predict(df['text_'])


# Step 2: Function for Topic Modeling with Gensim LDA
def extract_topics_with_gensim(texts, num_topics=3, num_words=5):
    """Fit a Gensim LDA model on ``texts`` and describe its topics.

    Args:
        texts: Iterable of strings; each string is tokenized on whitespace.
        num_topics: Number of topics the LDA model is trained with and
            the number of topics reported.
        num_words: Number of words shown for each topic.

    Returns:
        A list of strings of the form ``"Topic <n>: <weighted words>"``,
        where ``n`` starts at 1 and the word part is the formatted string
        produced by ``LdaModel.print_topics``.
    """
    # Plain whitespace tokenization; no further cleaning happens here.
    docs = [doc.split() for doc in texts]

    # Map tokens to ids, then turn every document into a bag-of-words vector.
    vocab = corpora.Dictionary(docs)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in docs]

    # Fixed random_state so repeated runs use the same seed.
    model = LdaModel(
        corpus=bow_corpus,
        num_topics=num_topics,
        id2word=vocab,
        random_state=42,
    )

    described = model.print_topics(num_topics=num_topics, num_words=num_words)
    # Topic ids are 0-based; shift by one for display.
    return [f"Topic {idx + 1}: {words}" for idx, words in described]

# Step 3: Apply LDA for Each Cluster
def lda_topic_modeling(df):
    """Run Gensim LDA separately on each cluster, print and return the topics.

    Args:
        df: DataFrame with a ``Cluster`` column (cluster label per row) and a
            ``text_`` column (the text of each row).

    Returns:
        dict mapping each cluster label to the list of topic strings returned
        by ``extract_topics_with_gensim`` for that cluster. Keys are inserted
        in ascending cluster order.
    """
    cluster_topics = {}
    for cluster_num in sorted(df['Cluster'].unique()):
        # Select only the texts that belong to the current cluster.
        cluster_texts = df.loc[df['Cluster'] == cluster_num, 'text_']
        print(f"\nCluster {cluster_num}: {len(cluster_texts)} reviews")
        topics = extract_topics_with_gensim(cluster_texts, num_topics=3, num_words=5)
        cluster_topics[cluster_num] = topics
        for topic in topics:
            print(topic)
    # The collected topics were previously built but discarded; hand them back
    # so callers can reuse them instead of re-parsing the printed output.
    return cluster_topics

# Run per-cluster LDA on the whole dataset, passing only the two columns the function reads.
lda_topic_modeling(df[['Cluster','text_']])


Cluster 0: 493 reviews




Topic 1: 0.091*"movie" + 0.029*"good" + 0.021*"acting" + 0.017*"great" + 0.016*"love"
Topic 2: 0.072*"movie" + 0.046*"good" + 0.022*"great" + 0.022*"story" + 0.015*"love"
Topic 3: 0.083*"movie" + 0.026*"good" + 0.023*"great" + 0.014*"story" + 0.013*"acting"

Cluster 1: 38126 reviews




Topic 1: 0.024*"fit" + 0.016*"comfortable" + 0.016*"would" + 0.015*"shoe" + 0.014*"wear"
Topic 2: 0.035*"love" + 0.023*"size" + 0.020*"great" + 0.014*"bought" + 0.013*"little"
Topic 3: 0.012*"one" + 0.011*"read" + 0.010*"story" + 0.008*"character" + 0.008*"boot"

Cluster 2: 20 reviews
Topic 1: 0.211*"good" + 0.128*"effect" + 0.123*"special" + 0.071*"quite" + 0.047*"acting"
Topic 2: 0.365*"good" + 0.255*"acting" + 0.061*"story" + 0.052*"movie" + 0.028*"pretty"
Topic 3: 0.308*"good" + 0.175*"story" + 0.044*"acting" + 0.037*"got" + 0.036*"storyline"

Cluster 3: 1791 reviews




Topic 1: 0.055*"book" + 0.032*"read" + 0.019*"character" + 0.017*"love" + 0.016*"story"
Topic 2: 0.062*"book" + 0.037*"story" + 0.029*"read" + 0.024*"character" + 0.015*"well"
Topic 3: 0.078*"book" + 0.032*"read" + 0.028*"story" + 0.025*"character" + 0.019*"good"

Cluster 4: 1 reviews
Topic 1: 0.910*"painting" + 0.012*"explain" + 0.011*"cover" + 0.011*"art" + 0.011*"history"
Topic 2: 0.130*"painting" + 0.109*"cover" + 0.109*"book" + 0.109*"one" + 0.109*"question"
Topic 3: 0.882*"painting" + 0.016*"one" + 0.016*"book" + 0.015*"question" + 0.015*"answer"


In [None]:
from sklearn.pipeline import Pipeline
import pickle

# Write the `lda_topic_modeling` function object to Drive with pickle.
# NOTE(review): pickle stores a plain function by reference (module + name), not its code,
# so this file presumably only loads in a session where `lda_topic_modeling` is already
# defined — confirm that is what is intended.
with open('/content/drive/MyDrive/final_project/topic_modeling.pkl','wb') as f:
  pickle.dump(lda_topic_modeling,f)
print('file pickled')

file pickled


In [None]:
# Reload the pickled object from Drive and bind it to `topic_model`.
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
with open('/content/drive/MyDrive/final_project/topic_modeling.pkl','rb') as f:
  topic_model=pickle.load(f)

# Smoke-test the reloaded callable on the first 10 rows only.
topic_model(df[['Cluster','text_']].iloc[:10])




Cluster 1: 10 reviews
Topic 1: 0.085*"love" + 0.043*"well" + 0.042*"made" + 0.036*"set" + 0.029*"great"
Topic 2: 0.051*"pillow" + 0.051*"thing" + 0.038*"love" + 0.031*"great" + 0.030*"look"
Topic 3: 0.068*"great" + 0.040*"love" + 0.040*"information" + 0.039*"price" + 0.039*"missing"


# Topic Modeling For Clusters using NMF

In [None]:
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pickle

# Restore the pickled clustering pipeline from Drive (same file and steps as the LDA section).
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
with open('/content/drive/MyDrive/final_project/clustering_pipeline.pkl', 'rb') as f:
    pipeline = pickle.load(f)

# Step 1: Predict clusters

# Store the pipeline's prediction for each `text_` value in the `Cluster` column.
df['Cluster'] = pipeline.predict(df['text_'])

# Step 2: Function for Topic Modeling with NMF
def extract_topics_with_nmf(texts, num_topics=3, num_words=5):
    """Fit an NMF topic model on ``texts`` and list the top words per topic.

    Args:
        texts: Iterable of strings to vectorize with ``CountVectorizer``
            (English stop words removed).
        num_topics: Number of NMF components (topics) to fit.
        num_words: Number of highest-weighted words to report per topic.

    Returns:
        A list of strings of the form ``"Topic <n>: w1 w2 ..."`` with ``n``
        starting at 1 and words ordered from highest to lowest weight.
    """
    # Vectorize the texts into a document-term count matrix.
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)

    # Only the topic-term matrix is needed, so fit without keeping the
    # document-topic matrix that fit_transform would return.
    nmf_model = NMF(n_components=num_topics, random_state=42)
    nmf_model.fit(X)
    H = nmf_model.components_  # Topic-term matrix

    # Extract top words for each topic.
    feature_names = vectorizer.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(H):
        # Reverse first, then truncate: slicing with `[-num_words:]` would
        # select every word when num_words == 0.
        top_words_idx = topic.argsort()[::-1][:num_words]
        top_words = [feature_names[i] for i in top_words_idx]
        topics.append(f"Topic {topic_idx + 1}: " + " ".join(top_words))

    return topics

# Step 3: Apply NMF for Topic Modeling on Each Cluster
def nmf_topic_modeling(df):
    """Run NMF topic modeling separately on each cluster, print and return the topics.

    Args:
        df: DataFrame with a ``Cluster`` column (cluster label per row) and a
            ``text_`` column (the text of each row).

    Returns:
        dict mapping each cluster label to the list of topic strings returned
        by ``extract_topics_with_nmf`` for that cluster. Keys are inserted in
        ascending cluster order.
    """
    cluster_topics = {}
    for cluster_num in sorted(df['Cluster'].unique()):
        # Select only the texts that belong to the current cluster.
        cluster_texts = df.loc[df['Cluster'] == cluster_num, 'text_']
        print(f"\nCluster {cluster_num}: {len(cluster_texts)} reviews")
        topics = extract_topics_with_nmf(cluster_texts, num_topics=3, num_words=5)
        cluster_topics[cluster_num] = topics
        for topic in topics:
            print(topic)
    # The collected topics were previously built but discarded; hand them back
    # so callers can reuse them instead of re-parsing the printed output.
    return cluster_topics

# Apply NMF topic modeling to each cluster of the whole dataset,
# passing only the two columns the function reads.
nmf_topic_modeling(df[['Cluster', 'text_']])



Cluster 0: 493 reviews
Topic 1: movie good story love like
Topic 2: special effect notch superb acting
Topic 3: great action picture story lot

Cluster 1: 38126 reviews
Topic 1: great like good use little
Topic 2: book story read character good
Topic 3: love dog toy son great

Cluster 2: 20 reviews
Topic 1: acting good great movie really
Topic 2: story good movie great plot
Topic 3: good ending effect special movie

Cluster 3: 1791 reviews
Topic 1: book read good great series
Topic 2: story character love author good
Topic 3: liked ending loved story like

Cluster 4: 1 reviews
Topic 1: painting art history answer book
Topic 2: painting history book art answer
Topic 3: cover question explain answer book


  return np.sqrt(res * 2)


In [None]:
from sklearn.pipeline import Pipeline
import pickle

# Write the `nmf_topic_modeling` function object to Drive with pickle.
# NOTE(review): pickle stores a plain function by reference (module + name), not its code,
# so this file presumably only loads in a session where `nmf_topic_modeling` is already
# defined — confirm that is what is intended.
with open('/content/drive/MyDrive/final_project/topic_modeling_nmf.pkl','wb') as f:
  pickle.dump(nmf_topic_modeling,f)
print('file pickled')

file pickled


In [None]:
# Reload the pickled object from Drive and bind it to `topic_model`
# (the same name the LDA section uses for its reloaded object).
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
with open('/content/drive/MyDrive/final_project/topic_modeling_nmf.pkl','rb') as f:
  topic_model=pickle.load(f)

# Try the reloaded callable on the first 10000 rows only.
topic_model(df[['Cluster','text_']].iloc[:10000])


Cluster 1: 9998 reviews
Topic 1: great use love like good
Topic 2: cooky cup cake pie cupcake
Topic 3: tree beautiful bamboo totally life

Cluster 3: 2 reviews
Topic 1: like book love son nice
Topic 2: book little like month item
Topic 3: cook make knife looking sturdy


In [None]:
pip freeze > requirements.txt

# Sentiment analysis

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Download the tokenizer and the sequence-classification model for the same checkpoint.
# NOTE(review): `tokenizer` and `model` are not referenced again in the visible cells — the
# sentiment `pipe` is built from the checkpoint name instead; confirm this cell is still needed.
tokenizer = AutoTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

In [3]:
# One-element list holding the review at position 100, used as a sample input for `pipe`.
text=[df['text_'].iloc[100]]

In [2]:
# Use a pipeline as a high-level helper
# NOTE(review): the name `pipeline` is also used by earlier cells for the unpickled
# clustering model; after this import it refers to the transformers factory instead.
from transformers import pipeline

# NOTE(review): the checkpoint name says it was fine-tuned on financial news, while the inputs
# here are review texts from `df['text_']`, and the recorded outputs are mostly 'neutral' —
# confirm this model suits the data.
pipe = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [4]:
# Classify the sample review; the result is a list with one {'label', 'score'} dict per input.
pipe(text)

[{'label': 'positive', 'score': 0.9993795156478882}]

In [5]:
import pickle
# Serialize the `pipe` object to Drive with pickle.
# NOTE(review): unverified whether the pickled pipeline reloads cleanly in another
# environment or library version — confirm before relying on this file.
with open('/content/drive/MyDrive/final_project/sentiment_analysis.pkl','wb') as f:
  pickle.dump(pipe,f)
print('file pickled')

file pickled


In [None]:
# Score every review one at a time and echo the text next to the raw pipeline output.
# Each review is wrapped in a one-element list before being passed to `pipe`.
for row, review in enumerate(df['text_']):
  text = [review]
  print(f"{text}-- {pipe(text)}")

['love well made sturdy comfortable love itvery pretty']-- [{'label': 'neutral', 'score': 0.9869396686553955}]
['love great upgrade original ive mine couple year']-- [{'label': 'neutral', 'score': 0.703281044960022}]
['pillow saved back love look feel pillow']-- [{'label': 'neutral', 'score': 0.997622549533844}]
['missing information use great product price']-- [{'label': 'neutral', 'score': 0.9574671387672424}]
['nice set good quality set two month']-- [{'label': 'positive', 'score': 0.9956064820289612}]
['wanted different flavor']-- [{'label': 'neutral', 'score': 0.999836802482605}]
['perfect touch thing wish little space']-- [{'label': 'neutral', 'score': 0.9998249411582947}]
['done fit well look great love smoothness edge extra']-- [{'label': 'positive', 'score': 0.7940554022789001}]
['great big number easy read thing didnt like size']-- [{'label': 'neutral', 'score': 0.9997342228889465}]
['son love comforter well made also baby']-- [{'label': 'neutral', 'score': 0.9994683861732483

KeyboardInterrupt: 

In [None]:
# Sanity check on a hand-written sentence (the recorded output labels it 'neutral').
pipe(['this is a good movie'])

[{'label': 'neutral', 'score': 0.65187007188797}]

In [None]:
# Print only the predicted label for the first 100 reviews.
# NOTE(review): range(100) assumes `df` has at least 100 rows; `.iloc[row]` raises otherwise.

for row in range(100):
  text=[df['text_'].iloc[row]]
  result = pipe(text)

  # The pipeline returns one dict per input; take the only one and read its label.
  label = result[0]['label']
  print(f"Label for text {row} : {label}")

Label for text 0 : neutral
Label for text 1 : neutral
Label for text 2 : neutral
Label for text 3 : neutral
Label for text 4 : positive
Label for text 5 : neutral
Label for text 6 : neutral
Label for text 7 : positive
Label for text 8 : neutral
Label for text 9 : neutral
Label for text 10 : neutral
Label for text 11 : neutral
Label for text 12 : neutral
Label for text 13 : neutral
Label for text 14 : neutral
Label for text 15 : neutral
Label for text 16 : neutral
Label for text 17 : neutral
Label for text 18 : neutral
Label for text 19 : neutral
Label for text 20 : neutral
Label for text 21 : neutral
Label for text 22 : neutral
Label for text 23 : neutral
Label for text 24 : neutral
Label for text 25 : neutral
Label for text 26 : neutral
Label for text 27 : neutral
Label for text 28 : neutral
Label for text 29 : neutral
Label for text 30 : neutral
Label for text 31 : neutral
Label for text 32 : neutral
Label for text 33 : neutral
Label for text 34 : neutral
Label for text 35 : neutral
