In [33]:
import nltk
import re
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')

In [34]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [35]:
merged_data = pd.read_csv("../data/clean/merged-labeled/final_dataset.csv")

In [36]:
merged_data.head()

Unnamed: 0,cleaned_text,sentiment
0,great card write speed fast enough intensive w...,Positive
1,nice mobile,Neutral
2,large choice ordered time online shop sunflowe...,Positive
3,able increase storage surface carrying around ...,Positive
4,regularly shop site please correctness accuracy,Positive


In [37]:
stop_words = set(stopwords.words('english'))  # Define stopwords
lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer

In [109]:
import re
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary


# Initialize resources
lemmatizer = WordNetLemmatizer()

# Load stopwords and exclude negation words
stop_words = set(stopwords.words("english"))


def advance_text_cleaning(text: str) -> str:
    """
    Cleans the text using techniques: lowercasing, expanding contractions, 
    removing special characters, handling negations, removing stopwords, 
    lemmatization, and optional emoji removal.
    
    Args:  
        text (str): Input text to be cleaned.
        
    Returns:
        str: Cleaned and preprocessed text
    """

    if not isinstance(text, str):  
        return ""

    # Expand contractions (e.g., "can't" → "cannot")
    text = contractions.fix(text)
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    
    # Remove emojis and special characters
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)  # Removes emojis and other symbols
    
    # Remove special characters and numbers, retain only alphabets and spaces
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Convert to lowercase
    text = text.lower()

    # Tokenize text and handle negations
    words = word_tokenize(text)
    # words = mark_negation(words)  # E.g., ["not", "good"] → ["not_good"]

    # Remove stopwords and lemmatize
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
    ]

    # Convert tokens back to string
    return " ".join(words)


def preprocess_for_lda(documents, no_below=5, no_above=0.4):
    """
    Preprocesses documents for LDA with bigram and trigram detection, 
    and thresholding for word frequency.
    
    Args:
        documents (list of str): List of input documents as raw text.
        no_below (int): Minimum number of documents a word must appear in to be kept.
        no_above (float): Maximum proportion of documents a word can appear in to be kept.
        
    Returns:
        tuple: (cleaned_corpus, dictionary)
    """
    # Step 1: Clean and tokenize all documents
    tokenized_docs = [advance_text_cleaning(doc).split() for doc in documents]

    # Step 2: Detect and retain bigrams and trigrams
    bigram = Phrases(tokenized_docs, min_count=5, threshold=10)
    trigram = Phrases(bigram[tokenized_docs], threshold=10)
    bigram_mod = bigram.freeze()
    trigram_mod = trigram.freeze()

    tokenized_docs = [bigram_mod[doc] for doc in tokenized_docs]
    tokenized_docs = [trigram_mod[doc] for doc in tokenized_docs]

    # Step 3: Create dictionary and filter extremes
    dictionary = Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)

    custom_stopwords = {'good', 'product', 'phone', 'use', 'one', 'also'}
    dictionary.filter_tokens(bad_ids=[dictionary.token2id[word] for word in custom_stopwords if word in dictionary.token2id])


    # Step 4: Create a bag-of-words corpus
    cleaned_corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

    return cleaned_corpus, dictionary, tokenized_docs, bigram, trigram


Create a dictionary and corpus for LDA

In [110]:
corpus, dictionary, tokenised_doc, bigram, trigram= preprocess_for_lda(merged_data["cleaned_text"])

In [99]:
print(dictionary.token2id)



In [100]:
from gensim.corpora import MmCorpus
print(f"Number of documents: {len(corpus)}")
print(f"Number of unique tokens: {len(dictionary)}")

Number of documents: 11839
Number of unique tokens: 7006


In [101]:
num_topics = 3 # Adjust based on your data
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=3, passes=50, alpha='asymmetric', eta=0.01)

In [102]:
from gensim.models import CoherenceModel
coherence_model = CoherenceModel(model=lda_model, texts=tokenised_doc, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f"Coherence Score: {coherence_score}")

Coherence Score: 0.3495904583996747


In [93]:
from gensim.models import CoherenceModel

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=1):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

model_list, coherence_values = compute_coherence_values(dictionary, corpus, tokenised_doc, limit=10)

for i, score in enumerate(coherence_values, start=2):
    print(f"Num Topics: {i}, Coherence Score: {score}")


Num Topics: 2, Coherence Score: 0.2925724496110942
Num Topics: 3, Coherence Score: 0.35410715995777925
Num Topics: 4, Coherence Score: 0.35102687588813586
Num Topics: 5, Coherence Score: 0.3423852935070616
Num Topics: 6, Coherence Score: 0.33958374745871756
Num Topics: 7, Coherence Score: 0.33690774500621895
Num Topics: 8, Coherence Score: 0.3534892333825977
Num Topics: 9, Coherence Score: 0.3457297685084993


In [103]:
topics = lda_model.show_topics(num_topics=8, num_words=10, formatted=False)
for topic_num, words in topics:
    print(f"Topic {topic_num}: {[word[0] for word in words]}")


Topic 0: ['card', 'go', 'sand', 'great', 'work', 'problem', 'bought', 'price', 'tablet', 'fast']
Topic 1: ['cable', 'quality', 'price', 'order', 'site', 'delivery', 'ordered', 'store', 'day', 'service']
Topic 2: ['quality', 'like', 'using', 'time', 'get', 'go', 'used', 'price', 'look', 'best']


In [105]:
topic_names = {0: "Product Performance and Issues",
1: "Ordering and Delivery Experience",
2: "Product Quality and Satisfaction",
}

In [106]:
def get_dominant_topic(lda_model, bow):
    topic_probs = lda_model.get_document_topics(bow)
    return max(topic_probs, key=lambda x: x[1])[0]

new_bow = dictionary.doc2bow(["product", "quality", "excellent"])
dominant_topic = get_dominant_topic(lda_model, new_bow)
print(f"Dominant Topic: {topic_names[dominant_topic]}")


Dominant Topic: Ordering and Delivery Experience


In [112]:
from gensim import corpora

# Save the corpus as an .mm file
corpora.MmCorpus.serialize('../models/lda_corpus.mm', corpus)
dictionary.save_as_text("../models/lda_dictionary.txt")
lda_model.save('../models/lda_model.gensim')
bigram.save("../models/bigram_model.phrases")
trigram.save("../models/trigram_model.phrases")


In [108]:
import json

topic_names = {
    0: "Product Performance and Issues",
    1: "Ordering and Delivery Experience",
    2: "Product Quality and Satisfaction"
}

with open("../models/topic_names.json", "w") as f:
    json.dump(topic_names, f)

### **Optimal Topics**

In [28]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd

# Load your dataset
file_path = '../data/clean/merged-labeled/final_dataset.csv'  # Update with your local path
data = pd.read_csv(file_path)

# Preprocess: Tokenize
data['tokenized_text'] = data['cleaned_text'].apply(advanced_text_cleaning)
texts = data['tokenized_text']

# Create dictionary and corpus
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Function to compute coherence values
def compute_coherence_values(dictionary, corpus, texts, start, limit, step):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

# Compute coherence
start, limit, step = 2, 16, 1
model_list, coherence_values = compute_coherence_values(dictionary, corpus, texts, start, limit, step)

# Find optimal number of topics
optimal_num_topics = start + coherence_values.index(max(coherence_values))
print(f"Optimal number of topics: {optimal_num_topics}")
print(f"Coherence values: {coherence_values}")


Optimal number of topics: 8
Coherence values: [0.3531716243647762, 0.35017579749716354, 0.35537662162921024, 0.3708716246561382, 0.3589572779391721, 0.36934039142974173, 0.37632138300498597, 0.35815194048481475, 0.3705343531179574, 0.3646102185405697, 0.356667956353332, 0.36081374313511827, 0.33984992313461077, 0.3436895281745887]


### **Testing**

In [33]:
import joblib

# Load the trained Logistic Regression model
model = joblib.load('../models/logistic_model.pkl')

# Load the trained TF-IDF vectorizer
vectorizer = joblib.load('../models/tfidf_vectorizer.pkl')

In [34]:
reviews = [
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 26 October 2024", "review_body": "The bottles are good quality. Sturdy lids and spill proof. Just don't put it in deep fridge"},
{"review_rating": "4.0 out of 5 stars", "review_date": "Reviewed in India on 22 August 2024", "review_body": "Looks wise nice and easy to use and clean also. God product. Lid should have been better designed. Lids are fitting properly, kids found it difficult to fit."},
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 14 October 2024", "review_body": "Recently bought a set of 3, 1 ltr bottles. The cap is good and leakproof. The design is elegant and comfortable to use. Looks durable"},
{"review_rating": "1.0 out of 5 stars", "review_date": "Reviewed in India on 28 October 2024", "review_body": "The bottle looks decent and weight wise also looks fine however there is a major design flaw in the bottle which makes it extremely vulnerable to cracks. The bottom base seems to be very loosely joint from the upper body and even if a very small fall is there, we see joints opening causing bottle to leak."},
{"review_rating": "3.0 out of 5 stars", "review_date": "Reviewed in India on 30 July 2024", "review_body": "The bottles are good and relatively cheaper than other similar products from other brands."},
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 13 November 2024", "review_body": "Durable"},
{"review_rating": "4.0 out of 5 stars", "review_date": "Reviewed in India on 12 February 2024", "review_body": "I purchased a set of three water bottles on Amazon, and while the overall product quality seemed satisfactory, I encountered a significant issue with one of the bottles. Unfortunately, upon inspection, I discovered that the third water bottle had a noticeable hole, making it unusable."},
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 15 November 2024", "review_body": "Value for money"}
]

In [35]:
def preprocess_input(text, vectorizer):
    # Transform the input text using the TF-IDF vectorizer
    return vectorizer.transform([text])  # Transform expects a list of strings

In [36]:
def predict_sentiment(text, model, vectorizer):
    # Preprocess the input text
    processed_text = preprocess_input(text, vectorizer)
    # Predict sentiment using the Logistic Regression model
    predicted_class = model.predict(processed_text)[0]
    return predicted_class


In [None]:
sentiment_results = predict_sentiment(input_text, model, vectorizer)

In [32]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.utils import simple_preprocess

def perform_lda(reviews, num_topics=3):
    # Preprocess reviews
    tokenized_reviews = [simple_preprocess(review) for review in reviews]
    # Create a dictionary and corpus
    dictionary = Dictionary(tokenized_reviews)
    corpus = [dictionary.doc2bow(text) for text in tokenized_reviews]
    # Train LDA model
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
    return lda_model

# Group reviews by sentiment
positive_reviews = [result['Review'] for result in sentiment_results if result['Sentiment'] == 'Positive']
negative_reviews = [result['Review'] for result in sentiment_results if result['Sentiment'] == 'Negative']

# Perform LDA for each sentiment group
print("Topics in Positive Reviews:")
positive_lda = perform_lda(positive_reviews)
for idx, topic in positive_lda.print_topics():
    print(f"Topic {idx}: {topic}")

print("\nTopics in Negative Reviews:")
negative_lda = perform_lda(negative_reviews)
for idx, topic in negative_lda.print_topics():
    print(f"Topic {idx}: {topic}")


NameError: name 'sentiment_results' is not defined