In [1]:
#Training semi-supervised Anchored COREX model
import numpy as np
import scipy.sparse as ss
import pickle
from corextopic import corextopic as ct
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from ast import literal_eval

data_anchored_corex = pd.read_excel("preprocessed_data.xlsx")
train_topics = pd.read_excel("Training_topics.xlsx")
df_anchored_corex = pd.DataFrame()


# Drop rows with NaN values in 'Keys' column
train_topics = train_topics.dropna(subset=['Keys'])

data_words_anchored_corex = []
for x in data_anchored_corex['Clean']:
    data_words_anchored_corex.append(' '.join(literal_eval(x)))
keywords = []
for x in train_topics['Keys']:
    keywords.append([word.strip() for word in literal_eval(x)])  # Remove leading/trailing spaces

vectorizer_anchored_corex = CountVectorizer(stop_words='english', max_features=20000, binary=True)

doc_word_anchored_corex = vectorizer_anchored_corex.fit_transform(data_words_anchored_corex)
doc_word_anchored_corex = ss.csr_matrix(doc_word_anchored_corex)

words = list(vectorizer_anchored_corex.vocabulary_.keys())  # Accessing feature names directly from vocabulary

#Train the CorEx topic model with 6 topics
topic_model_anchored_corex = ct.Corex(n_hidden=10, words=words, max_iter=1000, verbose=False, seed=2022)
topic_model_anchored_corex.fit(doc_word_anchored_corex, words=words, anchors = keywords, anchor_strength=3)

#Save the model and topics
pickle.dump(topic_model_anchored_corex, open('Anchored_CorEx_Train_model_May24.sav', 'wb'))
topic_list_anchored_corex = topic_model_anchored_corex.get_topics()

df_anchored_corex['Topics'] = topic_list_anchored_corex
df_anchored_corex.to_excel('Anchored_CorEx_topics_final_May24.xlsx')



In [16]:
# Sample code to adjust anchor words based on vocabulary
missing_anchors = []
for anchor_set in keywords:
    for word in anchor_set:
        if word not in words:
            missing_anchors.append(word)
            print(f"WARNING: Anchor word not in word column labels provided to CorEx: {word}")

# Print missing anchors
print("Missing anchor words:", missing_anchors)

# Optionally, remove missing anchor words from keywords (if necessary)
keywords = [[word for word in anchor_set if word in words] for anchor_set in keywords]


Missing anchor words: ['well', 'last', 'less', 'find', 'show', 'give', 'much', 'system', 'system', 'well', 'several', 'please']


In [17]:
# Print all topics from the CorEx topic model
anchored_corex_topics = topic_model_anchored_corex.get_topics()
for n,topic in enumerate(anchored_corex_topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))

0: storage, technology, fast, postit, ssd, retail, default, tb, misuse, installing
1: increase, emojis, novice, slowness, cellular, force, issue, pocket, speedy, annoying
2: install, fun, blaze, great, upgrade, use, outer, basically, fake, run
3: itune, tablet, browser, throttle, confident, lose, superior, research, bias, dell
4: connect, store, external, policy, properly, pc, old, launch, cellphone, slow
5: install, rf, differ, negate, subscribing, complicated, camera, run, galaxy, apply
6: heat, pack, seriously, moment, low, sshd, sandisc, league, amazon, handy
7: legit, raspberry, usuable, opportunity, compact, fullsize, pleasantly, consider, bit, issue
8: ethernet, hype, everytime, nonexistent, use, discontinue, short, eye, volume, outstanding
9: glad, regular, mileage, activation, happy, beginning, love, personal, array, satisfied


In [2]:
#label the review with topics
import pandas as pd
import pickle
from ast import literal_eval

data = pd.read_excel("preprocessed_data.xlsx")
aspect_list = [[],[],[],[],[],[],[],[],[],[]]
words_set = []
for x in data['Clean']:
    words_set.append(set(literal_eval(x)))

model = pickle.load(open("Anchored_CorEx_Train_model_May24.sav", 'rb'))
topic_list = []
for i, topic_words in enumerate(model.get_topics()):
  topic_list.append(set([words[0] for words in topic_words if words[1] > 0]))

for words in words_set:
  for i,topic_words in enumerate(topic_list):
      if (words & topic_words):
        aspect_list[i].append(1)
      else :
        aspect_list[i].append(0)
for i in range(10):
  data['Topic ' + str(i)] = aspect_list[i]
data.to_excel('labelled_aspect_May24.xlsx')