In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
import pickle
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import guidedlda
import nltk
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from collections import Counter

ModuleNotFoundError: No module named 'guidedlda'

In [61]:
# run file containing custom functions
%run -i '../scripts/helper_functions.py'

  text = re.sub("\d+", " ", text)  # remove digits


In [17]:
# load data
df = pd.read_csv('../data/all_data.csv')
df.head()

Unnamed: 0,reviews,rating,strain_name,year,quarter
0,fire get it why it's hot,5,707 Headband,2016,2
1,Extremely relaxing. Good head space. Does well...,4,3X Crazy,2019,4
2,"I really like the taste of this strain, great ...",5,707 Headband,2016,2
3,Great flavor takes me back to when I first smo...,5,3X Crazy,2019,4
4,This 1 is dankkk\nMy favourite strain at the m...,5,3X Crazy,2019,3


In [39]:
X, vocab, word2id, vectorizer = doc_term_matrix(df, 'reviews')

In [66]:
type(X)

scipy.sparse.csr.csr_matrix

In [40]:
# Normal LDA without seeding
model = guidedlda.GuidedLDA(n_topics=12, n_iter=100, random_state=7, refresh=20)
model.fit(X)

INFO:guidedlda:n_documents: 181055
INFO:guidedlda:vocab_size: 51091
INFO:guidedlda:n_words: 3842312
INFO:guidedlda:n_topics: 12
INFO:guidedlda:n_iter: 100
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -43301302
INFO:guidedlda:<20> log likelihood: -29839662
INFO:guidedlda:<40> log likelihood: -29306956
INFO:guidedlda:<60> log likelihood: -29075396
INFO:guidedlda:<80> log likelihood: -28945957
INFO:guidedlda:<99> log likelihood: -28853992


<guidedlda.guidedlda.GuidedLDA at 0x1a2cb165f8>

In [41]:
topic_word = model.topic_word_
n_top_words = 10
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))


Topic 0: high good strain great taste nice smell smoke really like
Topic 1: strain high great get like good one taste smoke time
Topic 2: strain high taste great smell good bud like nice smoke
Topic 3: strain high pain great good get feel one like make
Topic 4: strain get feel like high make pain time one smoke
Topic 5: strain high smoke like taste get good great smell one
Topic 6: strain high great good feel get smoke taste like one
Topic 7: high strain taste great smell good nice like smoke bud
Topic 8: strain high great pain feel good get make one anxiety
Topic 9: strain like bud smell taste get one high smoke great
Topic 10: strain high like taste feel great get smell good effect
Topic 11: strain high feel get like make smoke good great time


In [46]:
# Guided LDA with seed topics.
seed_topic_list = [['pain','insomnia','depression','nausea', 'inflammation', 'anxiety'], ['indica','sativa','hybrid'], ['relaxed','arouse','euphoria','focus','sleep','energy'], ['taste','smell','flavor','aroma','color']]
                    
model = guidedlda.GuidedLDA(n_topics=4, n_iter=1000, random_state=7, refresh=20)

seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id
        
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

INFO:guidedlda:n_documents: 181055
INFO:guidedlda:vocab_size: 51091
INFO:guidedlda:n_words: 3842312
INFO:guidedlda:n_topics: 10
INFO:guidedlda:n_iter: 1000
  if sparse and not np.issubdtype(doc_word.dtype, int):
INFO:guidedlda:<0> log likelihood: -41752481
INFO:guidedlda:<20> log likelihood: -29432064
INFO:guidedlda:<40> log likelihood: -28953197
INFO:guidedlda:<60> log likelihood: -28779151
INFO:guidedlda:<80> log likelihood: -28678757
INFO:guidedlda:<100> log likelihood: -28624028
INFO:guidedlda:<120> log likelihood: -28578720
INFO:guidedlda:<140> log likelihood: -28541719
INFO:guidedlda:<160> log likelihood: -28512948
INFO:guidedlda:<180> log likelihood: -28488027
INFO:guidedlda:<200> log likelihood: -28468452
INFO:guidedlda:<220> log likelihood: -28450825
INFO:guidedlda:<240> log likelihood: -28437753
INFO:guidedlda:<260> log likelihood: -28426171
INFO:guidedlda:<280> log likelihood: -28420988
INFO:guidedlda:<300> log likelihood: -28414377
INFO:guidedlda:<320> log likelihood: -2840

<guidedlda.guidedlda.GuidedLDA at 0x1a2c4a77f0>

In [47]:
joblib.dump(model, 'trained_lda.pkl')

['trained_lda.pkl']

# Document-topic distributions

In [48]:
trained_model = joblib.load('trained_lda.pkl') # load trained model

In [87]:
# List of keywords
from numpy import matrix
#from matrix import argsort
word_list = [', '.join(np.array(vocab)[list(reversed(X[i,:]np.argsort()))[0:10]]) for i in range(len(X.toarray()))]

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [52]:
# Extract the topic vector
doc_topic = trained_model.transform(X)

  if sparse and not np.issubdtype(doc_word.dtype, int):


In [92]:
columns10 = ['topic {}'.format(i) for i in range(10)]
topic_vector = pd.DataFrame(doc_topic, columns = columns10)
topic_vector.round(2).iloc[50:60,:]

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
50,0.46,0.37,0.01,0.0,0.0,0.0,0.05,0.02,0.0,0.1
51,0.09,0.34,0.32,0.0,0.05,0.01,0.02,0.14,0.0,0.02
52,0.04,0.0,0.0,0.0,0.01,0.02,0.0,0.35,0.0,0.57
53,0.62,0.01,0.21,0.0,0.01,0.14,0.0,0.0,0.0,0.0
54,0.0,0.0,0.02,0.03,0.0,0.0,0.04,0.0,0.9,0.0
55,0.0,0.0,0.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.64,0.0,0.2,0.0,0.0,0.0,0.01,0.0,0.0,0.14
57,0.0,0.0,0.0,0.0,0.99,0.0,0.0,0.0,0.01,0.0
58,0.02,0.02,0.04,0.0,0.02,0.0,0.02,0.06,0.02,0.8
59,0.01,0.01,0.0,0.6,0.0,0.0,0.02,0.0,0.01,0.35


In [62]:
# Assign topic if it is more than 70% dominant
num_topic = topic_threshold(doc_topic, topic_vector, threshold =0.7)

In [63]:
# Drop sentence with no topic
df = df_doc_topic[df_doc_topic['topics'] != 'None'].reset_index(drop = True)
df.head()

NameError: name 'df_doc_topic' is not defined

In [None]:
topic_dict ={0: 'plant',
             1: 'wellness',
             2: 'effect',
             3: 'sensation'}

In [None]:
df.topics = df.topics.map(topic_dict)

In [None]:
df.to_csv('../data/customer_topics.csv', index = False)

In [93]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(trained_model, corpus, dictionary)
vis

NameError: name 'corpus' is not defined