## **Using the merged dataset to train the Logistic Model**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import NMF

In [2]:
merged_data = pd.read_csv('../data/clean/merged-labeled/final_dataset.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    merged_data['cleaned_text'], merged_data['sentiment'], test_size=0.2, random_state=42
)

In [4]:
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

Training set size: 9471
Testing set size: 2368


In [5]:
print(X_train.isnull().sum())  # Check for NaN values in the training set
print(X_test.isnull().sum())

2
0


In [6]:
X_train = X_train.dropna()
X_test = X_test.dropna()

In [7]:
print(X_train.isnull().sum())  # Check for NaN values in the training set
print(X_test.isnull().sum())

0
0


### **Extract Topic Distributions as Features**

In [8]:
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize

# Assuming X_train and X_test are your raw text datasets (as a list of documents)
# Tokenize the text (you should use the same tokenizer as for TF-IDF and GloVe)
X_train_tokens = [word_tokenize(doc.lower()) for doc in X_train]
X_test_tokens = [word_tokenize(doc.lower()) for doc in X_test]

# Create a dictionary from the training data
dictionary = Dictionary(X_train_tokens)

# Create the corpus for LDA
corpus_train = [dictionary.doc2bow(doc) for doc in X_train_tokens]
corpus_test = [dictionary.doc2bow(doc) for doc in X_test_tokens]


In [9]:
from gensim.models import LdaMulticore

# Fit the LDA model on the training corpus
lda_model = LdaMulticore(corpus_train, num_topics=10, id2word=dictionary, passes=15)

# Get the topic distributions for the training set
topic_distributions_train = [lda_model.get_document_topics(doc) for doc in corpus_train]

# Get the topic distributions for the test set
topic_distributions_test = [lda_model.get_document_topics(doc) for doc in corpus_test]


In [10]:
# Convert topic distributions to feature vectors for training set
topic_features_train = []
for dist in topic_distributions_train:
    topic_vector = [0] * lda_model.num_topics
    for topic_num, prop in dist:
        topic_vector[topic_num] = prop
    topic_features_train.append(topic_vector)

# Convert topic distributions to feature vectors for test set
topic_features_test = []
for dist in topic_distributions_test:
    topic_vector = [0] * lda_model.num_topics
    for topic_num, prop in dist:
        topic_vector[topic_num] = prop
    topic_features_test.append(topic_vector)


In [11]:
import numpy as np

X_lda_train = np.array(topic_features_train)
X_lda_test = np.array(topic_features_test)

### **Use TF-IDF Vectorization to convert text into numerical features for model training**

In [12]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [13]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

In [14]:
print("TF-IDF matrix shape:", X_train_tfidf.shape)

TF-IDF matrix shape: (9469, 5000)


In [15]:
print(X_train_tfidf)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### **Compute GloVe Embeddings**

In [16]:
import numpy as np
from gensim.models import KeyedVectors

In [17]:
glove_file = "../data/glove/glove.6B.100d.txt"
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [18]:
def get_embedding(text):
    words = text.split()
    word_vectors = [glove_model[word] for word in words if word in glove_model]
    if word_vectors:
        return sum(word_vectors) / len(word_vectors)  # Average vector
    return np.zeros(100)  # Return a zero vector if no words are in GloVe

In [19]:
X_train_glove = np.array([get_embedding(text) for text in X_train])
X_test_glove = np.array([get_embedding(text) for text in X_test])

### **Compute NMF Topics**

In [20]:
num_topics = 10

# Initialize the NMF model
nmf_model = NMF(n_components=num_topics, random_state=42, max_iter=500)

# Fit the NMF model on the TF-IDF matrix
X_train_nmf = nmf_model.fit_transform(X_train_tfidf)
X_test_nmf = nmf_model.transform(X_test_tfidf)

### **Normalize Features**

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# scaler_lda = StandardScaler()
# X_lda_train_scaled = scaler_lda.fit_transform(X_lda_train)
# X_lda_test_scaled = scaler_lda.transform(X_lda_test)

In [23]:
scaler_nmf = StandardScaler()
X_train_nmf_scaled = scaler_nmf.fit_transform(X_train_nmf)
X_test_nmf_scaled = scaler_nmf.transform(X_test_nmf)

In [24]:
# Normalize TF-IDF features
scaler_tfidf = StandardScaler()
X_train_tfidf_scaled = scaler_tfidf.fit_transform(X_train_tfidf)
X_test_tfidf_scaled = scaler_tfidf.transform(X_test_tfidf)

In [25]:
# Normalize GloVe features
scaler_glove = StandardScaler()
X_train_glove_scaled = scaler_glove.fit_transform(X_train_glove)
X_test_glove_scaled = scaler_glove.transform(X_test_glove)

Combine Features with Weights

In [26]:
alpha = 0.4  # Weight for TF-IDF
beta = 0.3   # Weight for GloVe
gamma = 0.3  # Weight for LDA

In [27]:
print(f"Documents in X_train_tfidf_scaled: {X_train_tfidf_scaled.shape[0]}")
print(f"Documents in X_train_glove_scaled: {X_train_glove_scaled.shape[0]}")
print(f"Documents in X_nmf_scaled: {X_train_nmf_scaled.shape[0]}")

Documents in X_train_tfidf_scaled: 9469
Documents in X_train_glove_scaled: 9469
Documents in X_nmf_scaled: 9469


In [28]:
X_train_combined = np.hstack([alpha * X_train_tfidf_scaled, beta * X_train_glove_scaled, gamma * X_train_nmf_scaled])
X_test_combined = np.hstack([alpha * X_test_tfidf_scaled, beta * X_test_glove_scaled, gamma * X_test_nmf_scaled])

### **Use SMOTE to oversample the minority classes in the training data.**

In [29]:
from imblearn.over_sampling import SMOTE

In [30]:
smote = SMOTE(random_state=42)

In [31]:
y_train_aligned = y_train[:X_train_combined.shape[0]]

In [32]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_combined, y_train_aligned)

In [33]:
print("Class distribution after SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

Class distribution after SMOTE:
sentiment
Positive    7817
Negative    7817
Neutral     7817
Name: count, dtype: int64


In [34]:
model = LogisticRegression(max_iter=500)

In [35]:
model.fit(X_train_resampled, y_train_resampled)

In [36]:
y_pred = model.predict(X_test_combined)

In [37]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    Negative       0.81      0.79      0.80       189
     Neutral       0.62      0.70      0.66       214
    Positive       0.96      0.94      0.95      1965

    accuracy                           0.91      2368
   macro avg       0.79      0.81      0.80      2368
weighted avg       0.91      0.91      0.91      2368

Accuracy Score: 0.9087837837837838


In [38]:
import joblib

joblib.dump(model, '../models/logistic_sentiment_model.joblib')  # Save the trained model
joblib.dump(tfidf_vectorizer, '../models/tfidf_vectorizer.joblib')  # Save the TF-IDF vectorizer
joblib.dump(glove_model, '../models/glove_model.joblib')  # Save the GloVe model (optional, but useful for future use)
joblib.dump(nmf_model, '../models/nmf_model.joblib')
# Save the LDA model
# lda_model.save('../models/lda_model_sentiment.gensim')

joblib.dump(scaler_tfidf, "../models/scaler_tfidf.joblib")
joblib.dump(scaler_glove, "../models/scaler_glove.joblib")
joblib.dump(scaler_nmf, "../models/scaler_nmf.joblib")

# Save the dictionary used for training
dictionary.save_as_text('../models/lda_dictionary_sentiment.txt')

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.


### **Fine-Tune the Logistic Regression Model**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced']
}


In [None]:
grid_search = GridSearchCV(LogisticRegression(), param_grid, scoring='f1_macro', cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)

In [None]:
print("Best parameters:", grid_search.best_params_)

In [None]:
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test_combined)

In [None]:
print("Tuned Classification Report:")
print(classification_report(y_test, y_pred_tuned))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# **Test**

In [39]:
def preprocess_input(text, vectorizer):
    # Transform the input text using the TF-IDF vectorizer
    return vectorizer.transform([text])  # Transform expects a list of strings


In [40]:
def compute_glove_embedding(tokens, glove_model):
    """
    Compute the average GloVe embedding for the given tokens.
    """
    embeddings = [glove_model[token] if token in glove_model else np.zeros(300) for token in tokens]

    if embeddings:  return np.mean(embeddings, axis=0)

    else:  return np.zeros(300)

In [41]:
def predict_sentiment(text, model, vectorizer):
    # Preprocess the input text
    processed_text = preprocess_input(text, vectorizer)
    # Predict sentiment using the Logistic Regression model
    predicted_class = model.predict(processed_text)[0]
    return predicted_class


In [42]:
import joblib

# Load the trained Logistic Regression model
model = joblib.load('../models/logistic_sentiment_model.joblib')

# Load the trained TF-IDF vectorizer
vectorizer = joblib.load('../models/tfidf_vectorizer.joblib')


In [43]:
import numpy as np
from joblib import load
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore


# Load models and vectorizers
tfidf_vectoriser = load("../models/tfidf_vectorizer.joblib")
nmf_model = load("../models/nmf_model.joblib")
glove_model = load("../models/glove_model.joblib")
scaler_tfidf = load("../models/scaler_tfidf.joblib")
scaler_glove = load("../models/scaler_glove.joblib")
scaler_nmf = load("../models/scaler_nmf.joblib")


def compute_glove_embedding(tokens, glove_model):
    """
    Compute the average GloVe embedding for the given tokens.
    """
    embeddings = [glove_model[token] for token in tokens if token in glove_model]

    # print(f"Tokens: {tokens}")
    # print(f"Embeddings: {embeddings}")

    if embeddings:  # Ensure there are valid embeddings
        return np.mean(embeddings, axis=0)
    else:
        # Handle cases where no tokens match the GloVe vocabulary
        return np.zeros(glove_model.vector_size)


def get_nmf_features(tfidf_features):
    """
    Extract NMF features using the pre-trained NMF model.
    """
    # Get topic distribution using the NMF model
    nmf_features = nmf_model.transform(tfidf_features)

    return nmf_features.flatten()


def extract_features(reviews: str, tfidf_weight=0.4, glove_weight=0.3, nmf_weight=0.3):
    """
    Extract features from preprocessed text using TF-IDF, GloVe, and LDA.
    """

    # TF-IDF features
    tfidf_features = tfidf_vectoriser.transform([reviews]).toarray()

    # GloVe features
    tokens = reviews.split()  # Tokenize preprocessed text
    glove_features = compute_glove_embedding(tokens, glove_model).reshape(1, -1)

    # LDA features
    nmf_features = get_nmf_features(tfidf_features).reshape(1, -1)  # Ensure NMF features are 2D

    tfidf_scaled = scaler_tfidf.transform(tfidf_features)
    glove_scaled = scaler_glove.transform(glove_features)
    nmf_scaled = scaler_nmf.transform(nmf_features)

    # Weighted sum of features
    # combined_features = np.hstack(
    #     [tfidf_weight * tfidf_scaled, glove_weight * glove_scaled, nmf_weight * nmf_scaled]
    # )
    combined_features = np.hstack(
        [tfidf_scaled, glove_scaled, nmf_scaled]
    )

    return combined_features


In [44]:
reviews_json = [
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 23 October 2024", "review_body": "I have been using this phone for the past one month, and after thoroughly testing it, here\u2019s my detailed review."},
{"review_rating": "4.0 out of 5 stars", "review_date": "Reviewed in India on 29 October 2024", "review_body": "Performance- Very Good\ud83d\udc4d"},
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 21 November 2024", "review_body": "I am writing this review after 1 month of using the Oneplus Nord 4. It is one of the best midrange phone that you can blindly buy without any confusion. I bought this phone during Great Indian festival after so much research and doubts. I was not sure which phone will be the best in 30000 segment. I was looking for a balanced phone which has everything like good performance, good design, good battery and good camera. I don't want to compromise in any one thing. So after lot of research i was having confusion between nord 4 and realme gt 6t. But in realme gt 6t the display is curved which i don't prefer at all. Curved display is waste of money and feels like gimmick so i avoided it and bought nord 4. Now i feel it was the best decision."},
{"review_rating": "5.0 out of 5 stars", "review_date": "Reviewed in India on 4 November 2024", "review_body": "I usually don\u2019t review products after purchasing them, but this time I felt compelled. This might be my first-ever review."}
]

In [45]:
import re
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.util import mark_negation
from nltk.stem import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary


# Initialize resources
lemmatizer = WordNetLemmatizer()

# Load stopwords and exclude negation words
stop_words = set(stopwords.words("english"))


def advance_text_cleaning(text: str) -> str:
    """
    Cleans the text using techniques: lowercasing, expanding contractions, 
    removing special characters, handling negations, removing stopwords, 
    lemmatization, and optional emoji removal.
    
    Args:  
        text (str): Input text to be cleaned.
        
    Returns:
        str: Cleaned and preprocessed text
    """

    if not isinstance(text, str):  
        return ""

    # Expand contractions (e.g., "can't" → "cannot")
    text = contractions.fix(text)
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    
    # Remove emojis and special characters
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)  # Removes emojis and other symbols
    
    # Remove special characters and numbers, retain only alphabets and spaces
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    
    # Remove extra spaces
    text = re.sub(r"\s+", " ", text).strip()

    # Convert to lowercase
    text = text.lower()

    # Tokenize text and handle negations
    words = word_tokenize(text)
    # words = mark_negation(words)  # E.g., ["not", "good"] → ["not_good"]

    # Remove stopwords and lemmatize
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in stop_words
    ]

    # Convert tokens back to string
    return " ".join(words)

In [46]:
def analyse_sentiments(reviews):


    preprocessed_review = advance_text_cleaning(reviews)

    features = extract_features(preprocessed_review)

    sentiment = model.predict(features)
    
    return sentiment[0]  # Return the prediction (usually a single value for one input)
   

In [47]:
reviews = [review["review_body"] for review in reviews_json]

In [None]:
sentiments = [analyse_sentiments(review) for review in reviews]
sentiments

['Negative', 'Neutral', 'Neutral', 'Negative']

: 