In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import guidedlda
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter
# Display configuration for DataFrame output in this notebook.
# NOTE(review): -1 is not a documented value for 'display.width'; the pandas
# docs describe None as the auto-detect setting — confirm -1 behaves as intended.
pd.set_option('display.width',-1)

In [None]:
# Run the file containing custom functions. The -i flag executes it inside this
# notebook's namespace, so the names it defines become available to later cells.
# NOTE(review): doc_term_matrix and topic_threshold, called further down, are
# not defined in this notebook — presumably they come from this script.
%run -i '../scripts/helper_functions.py'

In [None]:
# Load the review data from CSV and preview the first rows.
# The 'reviews' column of this frame is what gets vectorized in the next step.
df = pd.read_csv('../data/all_data.csv')
df.head()

In [None]:
# Build the document-term inputs from the 'reviews' column.
# doc_term_matrix is not defined in this notebook (presumably from
# helper_functions.py). As used below: X is the matrix passed to the LDA model,
# vocab is indexed by term id, and word2id maps a word to its term id.
# NOTE(review): exact return types are not visible here — confirm in the helper.
X, vocab, word2id, vectorizer = doc_term_matrix(df, 'reviews')

In [None]:
# Baseline run: plain LDA with no seed words, 12 topics, 100 sampling
# iterations. random_state is fixed so the run is repeatable; refresh=20 is
# the progress-reporting interval passed to the model.
model = guidedlda.GuidedLDA(
    n_topics=12,
    n_iter=100,
    random_state=7,
    refresh=20,
)
model.fit(X)

In [None]:
# Print the n_top_words highest-weighted vocabulary terms of every topic.
topic_word = model.topic_word_
n_top_words = 10
vocab_arr = np.array(vocab)  # built once so it can be fancy-indexed per topic
for topic_id, weights in enumerate(topic_word):
    # argsort is ascending, so reverse it and keep the leading n_top_words ids.
    best_first = np.argsort(weights)[::-1][:n_top_words]
    print('Topic {}: {}'.format(topic_id, ' '.join(vocab_arr[best_first])))


In [None]:
# Guided LDA with seed topics.
# The position of each word list below is the topic id its words are seeded
# into (first list -> topic 0, second -> topic 1, ...).
seed_topic_list = [
    ['pain', 'insomnia', 'depression', 'nausea', 'inflammation', 'anxiety'],
    ['indica', 'sativa', 'hybrid'],
    ['relaxing', 'arousing', 'euphoria', 'focusing', 'sleep', 'energy'],
    ['taste', 'smell', 'flavor', 'aroma', 'color'],
]

model = guidedlda.GuidedLDA(n_topics=10, n_iter=1000, random_state=7, refresh=20)

# term id -> seeded topic id. A seed word missing from word2id raises KeyError.
seed_topics = {
    word2id[seed_word]: topic_id
    for topic_id, seed_words in enumerate(seed_topic_list)
    for seed_word in seed_words
}

model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

In [None]:
# Persist the fitted guided-LDA model to the notebook's working directory so
# later cells can reload it instead of refitting.
joblib.dump(model, 'trained_lda.pkl')

In [None]:
# Reload the model saved above. joblib.load unpickles the file, so only load
# files produced by a trusted source (here: this notebook).
trained_model = joblib.load('trained_lda.pkl')

# Document-topic distributions

In [None]:
# NOTE(review): this repeats the joblib.load of 'trained_lda.pkl' from the
# previous code cell; one of the two loads is redundant on a top-to-bottom run.
trained_model = joblib.load('trained_lda.pkl') # load trained model

In [None]:
# List of keywords: for every document, the 10 vocabulary terms with the
# highest counts, joined as "w1, w2, ...", highest count first.
#
# X exposes .toarray(), so it is handled as a sparse matrix:
#   * the row count comes from X.shape[0] instead of densifying the whole
#     matrix with len(X.toarray());
#   * each row is densified on its own before sorting, because sparse rows
#     (e.g. scipy.sparse) do not offer .argsort();
#   * the vocabulary array is built once rather than once per document.
# Documents with fewer than 10 distinct terms are padded with zero-count terms,
# exactly as a plain argsort-based top-10 would do.
# NOTE(review): assumes X supports integer row indexing (true for CSR) — confirm.
vocab_arr = np.array(vocab)
word_list = [
    ', '.join(vocab_arr[np.argsort(X[i].toarray().ravel())[::-1][:10]])
    for i in range(X.shape[0])
]

In [None]:
# Extract the topic vector: run the reloaded model over the document-term
# matrix. The result is wrapped in a DataFrame with one column per topic in
# the next cell.
doc_topic = trained_model.transform(X)

In [None]:
# One column per topic. The column count is read from doc_topic itself rather
# than hard-coding 10, so the labels stay in sync if the model's n_topics is
# changed. The name `columns10` is kept for any later cell that refers to it.
n_topic_columns = doc_topic.shape[1]
columns10 = ['topic {}'.format(i) for i in range(n_topic_columns)]
topic_vector = pd.DataFrame(doc_topic, columns = columns10)
topic_vector.round(2).head()

In [None]:
# Assign topic if it is more than 70% dominant (threshold=0.7).
# topic_threshold is not defined in this notebook (presumably from
# helper_functions.py).
# NOTE(review): a later cell filters on topics != 'None', which suggests the
# helper marks documents without a dominant topic with the string 'None' —
# confirm against the helper.
num_topic = topic_threshold(doc_topic, topic_vector, threshold =0.7)

In [None]:
# Attach the per-document topic assignment, then drop sentences with no topic.
#
# The original cell read `df_doc_topic`, which no visible cell defines (it only
# worked through leftover kernel state and raised NameError on Restart & Run
# All), while `num_topic` from the previous cell was never used. The frame is
# therefore rebuilt here from `df` plus a 'topics' column.
# NOTE(review): assumes num_topic holds one entry per row of df, in row order,
# with the string 'None' for documents without a dominant topic — confirm
# against topic_threshold in helper_functions.py.
assert len(num_topic) == len(df), (
    'expected one topic assignment per review; '
    're-run from the data-loading cell if df was already filtered'
)
df_doc_topic = df.assign(topics=num_topic)
df = df_doc_topic[df_doc_topic['topics'] != 'None'].reset_index(drop = True)
df.head()

In [None]:
# Human-readable label for each seeded topic id. The ids follow the order of
# seed_topic_list in the guided-LDA cell:
#   0 -> pain / insomnia / depression / ...   (wellness)
#   1 -> indica / sativa / hybrid             (plant)
#   2 -> relaxing / euphoria / sleep / ...    (effect)
#   3 -> taste / smell / flavor / ...         (sensation)
# Labels 0 and 1 were previously swapped relative to that order.
# Topic ids 4-9 are unseeded and intentionally have no label here.
topic_dict ={0: 'wellness',
             1: 'plant',
             2: 'effect',
             3: 'sensation'}

In [None]:
# Replace numeric topic ids with their labels. Series.map yields NaN for any
# id that has no entry in topic_dict.
df['topics'] = df['topics'].map(topic_dict)

In [None]:
# Write the labelled reviews to CSV; index=False leaves the row index out of
# the file.
df.to_csv('../data/customer_topics.csv', index = False)