# Training COREX model

In [2]:
#Training semi-supervised Anchored COREX model
import numpy as np
import scipy.sparse as ss
import pickle
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from ast import literal_eval
import joblib


data_anchored_corex = pd.read_excel("preprocessed_data.xlsx")
train_topics = pd.read_excel("Training_topics.xlsx")
df_anchored_corex = pd.DataFrame()


# Drop rows with NaN values in 'Keys' column
train_topics = train_topics.dropna(subset=['Keys'])

data_words_anchored_corex = []
for x in data_anchored_corex['Clean']:
    data_words_anchored_corex.append(' '.join(literal_eval(x)))
keywords = []
for x in train_topics['Keys']:
    keywords.append([word.strip() for word in literal_eval(x)])  # Remove leading/trailing spaces

vectorizer_anchored_corex = CountVectorizer(stop_words='english', max_features=20000, binary=True)

doc_word_anchored_corex = vectorizer_anchored_corex.fit_transform(data_words_anchored_corex)
doc_word_anchored_corex = ss.csr_matrix(doc_word_anchored_corex)

words = list(vectorizer_anchored_corex.vocabulary_.keys())  # Accessing feature names directly from vocabulary

#Train the CorEx topic model with 6 topics
topic_model_anchored_corex = ct.Corex(n_hidden=10, words=words, max_iter=1000, verbose=False, seed=2022)
topic_model_anchored_corex.fit(doc_word_anchored_corex, words=words, anchors = keywords, anchor_strength=3)

#Save the model and topics
joblib.dump(topic_model_anchored_corex, open('Trained Model/Anchored_CorEx_Train_model_May24.sav', 'wb'))
topic_list_anchored_corex = topic_model_anchored_corex.get_topics()

df_anchored_corex['Topics'] = topic_list_anchored_corex
df_anchored_corex.to_excel('Anchored_CorEx_topics_final_May24.xlsx')



In [3]:
# Sample code to adjust anchor words based on vocabulary
missing_anchors = []
for anchor_set in keywords:
    for word in anchor_set:
        if word not in words:
            missing_anchors.append(word)
            print(f"WARNING: Anchor word not in word column labels provided to CorEx: {word}")

# Print missing anchors
print("Missing anchor words:", missing_anchors)

# Optionally, remove missing anchor words from keywords (if necessary)
keywords = [[word for word in anchor_set if word in words] for anchor_set in keywords]


Missing anchor words: ['well', 'last', 'less', 'find', 'show', 'give', 'much', 'system', 'system', 'well', 'several', 'please']


In [4]:
# Print all topics from the CorEx topic model
anchored_corex_topics = topic_model_anchored_corex.get_topics()
for n,topic in enumerate(anchored_corex_topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

0: storage, technology, fast, postit, ssd, retail, default, tb, misuse, installing
1: increase, emojis, novice, slowness, cellular, force, issue, pocket, speedy, annoying
2: install, fun, blaze, great, upgrade, use, outer, basically, fake, run
3: itune, tablet, browser, throttle, confident, lose, superior, research, bias, dell
4: connect, store, external, policy, properly, pc, old, launch, cellphone, slow
5: install, rf, differ, negate, subscribing, complicated, camera, run, galaxy, apply
6: heat, pack, seriously, moment, low, sshd, sandisc, league, amazon, handy
7: legit, raspberry, usuable, opportunity, compact, fullsize, pleasantly, consider, bit, issue
8: ethernet, hype, everytime, nonexistent, use, discontinue, short, eye, volume, outstanding
9: glad, regular, mileage, activation, happy, beginning, love, personal, array, satisfied


In [5]:
#label the review with topics
import pandas as pd
import pickle
from ast import literal_eval

data = pd.read_excel("preprocessed_data.xlsx")
aspect_list = [[],[],[],[],[],[],[],[],[],[]]
words_set = []
for x in data['Clean']:
    words_set.append(set(literal_eval(x)))

model = joblib.load(open("Trained Model/Anchored_CorEx_Train_model_May24.sav", 'rb'))
topic_list = []
for i, topic_words in enumerate(model.get_topics()):
  topic_list.append(set([words[0] for words in topic_words if words[1] > 0]))

for words in words_set:
  for i,topic_words in enumerate(topic_list):
      if (words & topic_words):
        aspect_list[i].append(1)
      else :
        aspect_list[i].append(0)
for i in range(10):
  data['Topic ' + str(i)] = aspect_list[i]
data.to_excel('labelled_aspect_May24.xlsx')

In [6]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing

In [7]:
#Text Preprocessing for sentiment analysis
import string
import emoji
import re
import nltk
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.metrics import edit_distance
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import pandas as pd

stopwords = nltk.corpus.stopwords.words('english')

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []

        while i < l:
            word = sent[i]

            if word == 'not' and i+1 < l:
                ant = self.replace(sent[i+1])

                if ant:
                    words.append(ant)
                    i += 2
                    continue

            words.append(word)
            i += 1

        return words

def preprocess(text):
    
    #1. Generating the list of words in the tweet (hastags and other punctuations removed)
    text_blob = TextBlob(text)
    text = ' '.join(text_blob.words)
    
    #2. clean the number 
    text = re.sub(r'[0-9]', '', text)
    
    #3. lower the text
    text = text.lower()
    
    #4. conver the emoji to text form
    text = emoji.demojize(text)
    
    #5. remove punctuation 
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    
    #6. tokenize the text
    text = word_tokenize(text)
    
    #7. remove empty token
    text = [t for t in text if len(t) > 0]
    
    #8. remove non-alphabetical token
    text = [t for t in text if t.isalpha()]
    
    #9. replace the negation token
    replacer  = AntonymReplacer()
    text = replacer.replace_negations(text)
    
    #10. remove the stopwords
    text = [i for i in text if i not in stopwords]
    
    #11. stem the text
    porter_stemmer = PorterStemmer()
    text = [porter_stemmer.stem(w) for w in text]
    
    return text


In [8]:
# Load the pretrained SVM model and TfidfVectorizer
sentiment_model = joblib.load(open("../Sentiment Analzer/SVM_tfidf_model.sav", 'rb'))
vectorizer = joblib.load(open("../Sentiment Analzer/tfidf_vectorizer.sav", 'rb'))

# Preprocess the review texts
review = data['review'].astype(str).values

# Transform the review texts using the loaded TfidfVectorizer
X_test_tfidf = vectorizer.transform(review)

# Predict sentiment scores using the SVM model
score = sentiment_model.predict(X_test_tfidf)

# Convert sentiment scores to numerical values
score = score.astype(float)

# Classify sentiment scores into positive or negative labels
label = ['positive' if s >= 0.5 else 'negative' for s in score]


# Add sentiment labels to the DataFrame
data['Sentiment_Score'] = score
data['Sentiment'] = label


In [9]:
data

Unnamed: 0.1,Unnamed: 0,review_id,product,review,date,Clean,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Sentiment_Score,Sentiment
0,0,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,need extra storage for your ps5 look no further.,"Jun 30, 2023 8:26 PM","['need', 'extra', 'storage', 'look']",1,0,0,0,0,0,0,0,0,0,0.0,negative
1,1,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,caught this on sale for $129.,"Jun 30, 2023 8:26 PM","['catch', 'sale']",0,0,0,0,0,0,0,0,0,0,1.0,positive
2,2,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,well worth the money.,"Jun 30, 2023 8:26 PM","['well', 'worth', 'money']",0,0,0,0,0,0,0,0,0,0,1.0,positive
3,3,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,zero issues.,"Jun 30, 2023 8:26 PM",['issue'],0,1,0,0,0,0,0,1,0,0,0.0,negative
4,4,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,took about 2 mins to install.,"Jun 30, 2023 8:26 PM","['take', 'min', 'install']",0,0,1,0,0,1,0,0,0,0,0.0,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88639,88639,22019,WD - My Passport 1TB External USB Type-C Porta...,"if you are going to use this on a chromebook, ...","Dec 5, 2020 1:40 PM","['go', 'use', 'chromebook', 'mileage', 'vary',...",0,0,1,0,1,0,0,0,1,1,0.0,negative
88640,88640,22019,WD - My Passport 1TB External USB Type-C Porta...,"if you have a windows 10 device, it will do a ...","Dec 5, 2020 1:40 PM","['window', 'device', 'great', 'job', 'transfer...",0,0,1,0,0,0,0,0,0,0,1.0,positive
88641,88641,22019,WD - My Passport 1TB External USB Type-C Porta...,it has 930gb capacity with only two small file...,"Dec 5, 2020 1:40 PM","['gb', 'capacity', 'small', 'file', 'default']",1,0,0,0,0,0,0,0,0,0,0.0,negative
88642,88642,22019,WD - My Passport 1TB External USB Type-C Porta...,"overall, this is a nice compact ssd with fast ...","Dec 5, 2020 1:40 PM","['overall', 'nice', 'compact', 'ssd', 'fast', ...",1,0,0,0,0,0,0,1,0,0,1.0,positive


In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,review_id,product,review,date,Clean,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Sentiment_Score,Sentiment
0,0,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,need extra storage for your ps5 look no further.,"Jun 30, 2023 8:26 PM","['need', 'extra', 'storage', 'look']",1,0,0,0,0,0,0,0,0,0,0.0,negative
1,1,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,caught this on sale for $129.,"Jun 30, 2023 8:26 PM","['catch', 'sale']",0,0,0,0,0,0,0,0,0,0,1.0,positive
2,2,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,well worth the money.,"Jun 30, 2023 8:26 PM","['well', 'worth', 'money']",0,0,0,0,0,0,0,0,0,0,1.0,positive
3,3,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,zero issues.,"Jun 30, 2023 8:26 PM",['issue'],0,1,0,0,0,0,0,1,0,0,0.0,negative
4,4,0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,took about 2 mins to install.,"Jun 30, 2023 8:26 PM","['take', 'min', 'install']",0,0,1,0,0,1,0,0,0,0,0.0,negative


In [11]:
positive_reviews = data[data['Sentiment'] == 'positive']
results = pd.DataFrame()

# Calculate the mean sentiment score for each topic
for topic in ['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9']:
    data[topic] *= data['Sentiment_Score']

sentiment_topics = data.groupby('product')[['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9']].mean().reset_index()

# Rename the columns for better readability
column_mapping = {'product': 'Computer Storage',
                  'Topic 0': 'Storage Performance',
                  'Topic 1': 'Product Reliability',
                  'Topic 2': 'User Experience',
                  'Topic 3': 'Price Affordability',
                  'Topic 4': 'Compatibility Connectivity',
                  'Topic 5': 'Task Performance',
                  'Topic 6': 'Build Quality',
                  'Topic 7': 'Customer Support',
                  'Topic 8': 'Versatility Adaptability',
                  'Topic 9': 'Satisfaction'}
topic = sentiment_topics.rename(columns=column_mapping)
topic
    

Unnamed: 0,Computer Storage,Storage Performance,Product Reliability,User Experience,Price Affordability,Compatibility Connectivity,Task Performance,Build Quality,Customer Support,Versatility Adaptability,Satisfaction
0,CORSAIR - MP600 PRO LPX 2TB Internal SSD PCIe ...,0.209677,0.016129,0.129032,0.0,0.016129,0.048387,0.016129,0.016129,0.048387,0.048387
1,Crucial - MX500 1TB Internal SSD SATA,0.166667,0.030303,0.287879,0.0,0.121212,0.060606,0.0,0.015152,0.090909,0.030303
2,Crucial - P3 1TB Internal SSD PCIe Gen 3 x4 NVMe,0.234043,0.042553,0.255319,0.0,0.042553,0.0,0.0,0.06383,0.06383,0.021277
3,Crucial - X6 SE 1TB External USB-C/USB-A Porta...,0.203125,0.015625,0.15625,0.0,0.078125,0.015625,0.015625,0.078125,0.09375,0.03125
4,"LaCie - Rugged 5TB External USB-C, USB 3.1 Gen...",0.117647,0.044118,0.132353,0.0,0.073529,0.014706,0.0,0.044118,0.073529,0.102941
5,Samsung - 870 EVO 1TB Internal SSD SATA,0.220339,0.025424,0.211864,0.0,0.050847,0.042373,0.008475,0.025424,0.09322,0.008475
6,Samsung - 970 EVO Plus 2TB Internal SSD PCIe G...,0.22449,0.061224,0.122449,0.0,0.061224,0.040816,0.020408,0.081633,0.020408,0.0
7,Samsung - 980 PRO Heatsink 2TB Internal SSD PC...,0.171053,0.013158,0.157895,0.0,0.078947,0.092105,0.0,0.039474,0.013158,0.092105
8,Samsung - 990 PRO 2TB Internal SSD PCle Gen 4x...,0.071429,0.0,0.119048,0.011905,0.0,0.059524,0.011905,0.011905,0.02381,0.059524
9,Samsung - T7 2TB External USB 3.2 Gen 2 Portab...,0.123077,0.061538,0.184615,0.0,0.061538,0.0,0.0,0.092308,0.076923,0.030769


In [12]:
topics = ['Storage Performance', 'Product Reliability',	'User Experience',	'Price Affordability',	'Compatibility Connectivity',	'Task Performance',	'Build Quality', 'Customer Support',	'Versatility Adaptability',	'Satisfaction' ]

result = pd.DataFrame()
for t in topics:
    # Find the 5 largest values in the specified column
    top_5_products = topic.nlargest(5, t).reset_index()

    result[f'{t}'] = top_5_products['Computer Storage']

result

Unnamed: 0,Storage Performance,Product Reliability,User Experience,Price Affordability,Compatibility Connectivity,Task Performance,Build Quality,Customer Support,Versatility Adaptability,Satisfaction
0,WD - Blue SA510 1TB Internal SSD SATA,Samsung - T7 2TB External USB 3.2 Gen 2 Portab...,Seagate - Barracuda 2TB Internal SATA Hard Dri...,WD - Blue SA510 1TB Internal SSD SATA,WD - BLACK D10 8TB External USB 3.2 Gen 1 Port...,Seagate - FireCuda 530 2TB Internal SSD PCIe G...,SanDisk - Ultra Dual Drive Go 256GB USB Type-A...,Samsung - T7 2TB External USB 3.2 Gen 2 Portab...,Seagate - Barracuda 2TB Internal SATA Hard Dri...,"LaCie - Rugged 5TB External USB-C, USB 3.1 Gen..."
1,Crucial - P3 1TB Internal SSD PCIe Gen 3 x4 NVMe,Samsung - 970 EVO Plus 2TB Internal SSD PCIe G...,Crucial - MX500 1TB Internal SSD SATA,WD - easystore 8TB External USB 3.0 Hard Drive...,Crucial - MX500 1TB Internal SSD SATA,WD - BLACK SN770 1TB Internal SSD PCIe Gen 4 x4,WD - easystore 8TB External USB 3.0 Hard Drive...,Samsung - 970 EVO Plus 2TB Internal SSD PCIe G...,SanDisk - Ultra Dual Drive Go 256GB USB Type-A...,SanDisk - Extreme Portable 4TB External USB-C ...
2,WD - easystore 240GB Internal SSD SATA,Seagate - Barracuda 2TB Internal SATA Hard Dri...,Seagate - FireCuda 530 2TB Internal SSD PCIe G...,Seagate - Barracuda 2TB Internal SATA Hard Dri...,WD - easystore 240GB Internal SSD SATA,Samsung - 980 PRO Heatsink 2TB Internal SSD PC...,SanDisk - Ultra 512GB USB 3.0 Flash Drive - Black,Crucial - X6 SE 1TB External USB-C/USB-A Porta...,Seagate - One Touch 2TB External USB 3.0 Porta...,Samsung - 980 PRO Heatsink 2TB Internal SSD PC...
3,Samsung - 970 EVO Plus 2TB Internal SSD PCIe G...,Seagate - One Touch 2TB External USB 3.0 Porta...,Crucial - P3 1TB Internal SSD PCIe Gen 3 x4 NVMe,Samsung - 990 PRO 2TB Internal SSD PCle Gen 4x...,WD - Easystore 5TB External USB 3.0 Portable H...,Seagate - Barracuda 2TB Internal SATA Hard Dri...,Samsung - 970 EVO Plus 2TB Internal SSD PCIe G...,Crucial - P3 1TB Internal SSD PCIe Gen 3 x4 NVMe,Crucial - X6 SE 1TB External USB-C/USB-A Porta...,SanDisk - Ultra Dual Drive Go 256GB USB Type-A...
4,Samsung - 870 EVO 1TB Internal SSD SATA,"LaCie - Rugged 5TB External USB-C, USB 3.1 Gen...",WD - BLACK D10 8TB External USB 3.2 Gen 1 Port...,Seagate - FireCuda 530 2TB Internal SSD PCIe G...,WD - easystore 8TB External USB 3.0 Hard Drive...,WD - easystore 8TB External USB 3.0 Hard Drive...,Seagate - Barracuda 2TB Internal SATA Hard Dri...,Seagate - FireCuda 530 2TB Internal SSD PCIe G...,Samsung - 870 EVO 1TB Internal SSD SATA,Samsung - 990 PRO 2TB Internal SSD PCle Gen 4x...


# Save the top 5 ranking to an Excel file


In [13]:
result.to_excel('Top5_Ranking.xlsx')