# Overview

In this notebook, I use LDA to model latent topics. Latent models don't always yield useful results and it is difficult to know whether they will be useful or not until we actually start. 

Steps:

1. Group the Data by restauarant and then squish the review text together. 

Extensions:

1. Consider only positive reviews. (Happy in the same say, upset for all different ways)
2. COnsider only negative reviews. 

In [17]:
import pandas as pd
from gensim.models.ldamulticore import LdaModel
import gensim

In [16]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [66]:
combined_data = pd.read_hdf('../data/restaurant_reviews.hdf')

In [67]:
combined_data.columns

Index(['date', 'review_id', 'text', 'user_id', 'city', 'latitude', 'longitude',
       'name', 'neighborhoods', 'stars', 'hours'],
      dtype='object')

In [68]:
g_rest = combined_data.groupby(combined_data.name)

In [69]:
len(g_rest.stars.mean())

15321

In [70]:
text = g_rest.text.apply(lambda x: '\n'.join(x)).reset_index()

In [73]:
text.text.head()

0    So this place isn't spectacular, but it is def...
1    I really like this place. The portions are lar...
2    Delicious Sushi! Very big, fresh, tasty, flavo...
3    Pizza was just plain, nothing special. I have ...
4    I don't like the pizza hear and if you go for ...
Name: text, dtype: object

## Basic Preprocessing

In [75]:
text.text = text.text.apply(lambda x: gensim.utils.simple_preprocess(x))

In [76]:
text.text[:5]

0    [so, this, place, isn, spectacular, but, it, i...
1    [really, like, this, place, the, portions, are...
2    [delicious, sushi, very, big, fresh, tasty, fl...
3    [pizza, was, just, plain, nothing, special, ha...
4    [don, like, the, pizza, hear, and, if, you, go...
Name: text, dtype: object

In [9]:
# import spacy
# import os
# data_dir = '/Users/Will/anaconda/envs/py34/lib/python3.4/site-packages/spacy/en/data'
# nlp = spacy.en.English(data_dir=data_dir)

Now we need to make a gensim dictionary that 

In [77]:
dictionary = gensim.corpora.Dictionary(text.text)

In [78]:
print(dictionary)

Dictionary(242663 unique tokens: ['baghala', 'apes', 'luwan', 'soyons', 'souhaiter']...)


Now we get the Corpus into the format we want and save it to disk.

## Removing Stopwords

To start out with, we use the nltk stopwords. If this turns out to cause problems, we will also start excluding the most common words from the current corpus.

In [81]:
import nltk

In [82]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [83]:
stopwords_ids = list(map(dictionary.token2id.get, stopwords))

In [84]:
len(stopwords_ids)

127

In [85]:
dictionary.filter_tokens(stopwords_ids)

In [86]:
?dictionary.filter_extremes()

In [87]:
dictionary.filter_extremes(no_below=10, no_above=0.1)

In [88]:
len(dictionary)

41555

In [89]:
dictionary.compactify()

## Generating the Corpus

In [90]:
corpus = [dictionary.doc2bow(t) for t in .text]

## Saving

In [92]:
corpora.MmCorpus.serialize('../data/deerwester.mm', corpus)

In [93]:
dictionary.save('../data/words.dict') 

# Modelling

I should probably remove stopwords at some point

In [20]:
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.mmcorpus import MmCorpus 
from gensim.models.ldamulticore import LdaMulticore

In [11]:
dictionary = Dictionary.load('../data/words.dict')

In [14]:
corpus = MmCorpus('../data/deerwester.mm')

I recommend running this on a power EC2 server. Or else it will take too long. 

In [None]:
all_lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20, workers=31,
                       chunksize=50, passes=1)

In [27]:
all_lda.save('../data/ldamodel')

In [33]:
print(all_lda)

LdaModel(num_terms=242663, num_topics=20, decay=0.5, chunksize=100)


In [35]:
all_lda.print_topics()

['0.050*the + 0.033*and + 0.027*to + 0.019*it + 0.016*of + 0.014*was + 0.014*is + 0.013*for + 0.013*in + 0.011*this',
 '0.054*the + 0.037*and + 0.023*to + 0.019*pizza + 0.019*it + 0.019*was + 0.016*of + 0.015*is + 0.013*for + 0.012*in',
 '0.056*the + 0.039*and + 0.023*to + 0.019*was + 0.018*it + 0.017*is + 0.016*of + 0.013*for + 0.011*in + 0.010*food',
 '0.056*the + 0.032*and + 0.024*to + 0.020*was + 0.019*of + 0.017*it + 0.014*for + 0.013*is + 0.012*in + 0.011*buffet',
 '0.053*the + 0.036*and + 0.026*to + 0.017*of + 0.016*it + 0.016*was + 0.014*is + 0.012*for + 0.011*in + 0.010*we',
 '0.055*the + 0.037*and + 0.022*was + 0.022*to + 0.019*it + 0.015*of + 0.013*for + 0.012*we + 0.011*is + 0.010*in',
 '0.057*the + 0.037*and + 0.022*to + 0.019*it + 0.018*was + 0.017*of + 0.015*is + 0.013*in + 0.011*for + 0.010*pho',
 '0.056*the + 0.035*and + 0.023*to + 0.018*it + 0.018*was + 0.016*burger + 0.014*of + 0.012*is + 0.012*in + 0.011*for',
 '0.053*the + 0.035*and + 0.022*to + 0.019*it + 0.019*wa

# Single Core

In [96]:
lda_single = LdaModel(corpus=corpus, id2word=dictionary, num_topics=20,
                     chunksize=300)

In [97]:
lda_single.print_topics()

['0.050*pad + 0.032*pittsburgh + 0.023*tom + 0.011*truck + 0.009*panang + 0.009*drunken + 0.007*ew + 0.006*satay + 0.006*papaya + 0.005*curries',
 '0.006*pepperoni + 0.004*subway + 0.004*zpizza + 0.004*philly + 0.003*factory + 0.003*refund + 0.003*pies + 0.002*hut + 0.002*cheesesteak + 0.002*wich',
 '0.021*brisket + 0.017*spaghetti + 0.012*ravioli + 0.010*lasagna + 0.009*ricotta + 0.009*bruschetta + 0.008*gelato + 0.008*marinara + 0.008*tiramisu + 0.007*pastas',
 '0.307*pho + 0.068*vietnamese + 0.033*unphogettable + 0.013*mi + 0.011*boba + 0.010*vermicelli + 0.010*sprouts + 0.010*brisket + 0.009*bo + 0.008*plaza',
 '0.013*charlotte + 0.006*grits + 0.004*cornbread + 0.004*tots + 0.003*vig + 0.003*uptown + 0.003*ahi + 0.003*soul + 0.003*kale + 0.002*lemonade',
 '0.034*pita + 0.031*hummus + 0.027*gyro + 0.018*falafel + 0.014*edinburgh + 0.013*feta + 0.012*mediterranean + 0.010*gyros + 0.007*kabob + 0.007*eastern',
 '0.019*pub + 0.013*madison + 0.010*shake + 0.008*pickles + 0.007*pretzel +

Saving it

In [98]:
lda_single.save('../data/all_lda')

# Positive Reviews Only

In [111]:
SAVE = False
def generate_corpus(df, grouper='name', text='text'):
    """
    Generates a dictionary with stopwords and most frequent words removed.
    Also tells how many items have been removed.

    :param df: pd.DataFrame() 
    :return:
    """
    # Combine the text on a restaurant level
    grouped_df = df.groupby(grouper)
    all_review_text = grouped_df[text].apply(lambda x: '\n'.join(x)).reset_index()

    # Do some Processing
    all_review_text[text] = all_review_text[text].apply(
        lambda x: gensim.utils.simple_preprocess(x))

    dictionary = gensim.corpora.Dictionary(all_review_text[text])
    original_dict_length = len(dictionary)

    stopwords = set(nltk.corpus.stopwords.words('english'))
    stopwords_ids = list(map(dictionary.token2id.get, stopwords))
    dictionary.filter_tokens(stopwords_ids)
    dictionary.filter_extremes(no_below=10, no_above=0.1)

    dictionary.compactify()
    percent_removed = 1 - (len(dictionary) / original_dict_length)
    logging.log('Removed {} items from our dictionary. {} %'
                .format(original_dict_length - len(dictionary)),
                percent_removed)

    # Generate the corpus
    corpus = [dictionary.doc2bow(t) for t in df[text]]

    if SAVE:
        gensim.corpora.MmCorpus.serialize('../data/deerwester.mm', corpus)
        dictionary.save('../data/words.dict')
        
    return corpus, dictionary

In [108]:
good_reviews = combined_data[combined_data.stars.isin([4,5])]

In [None]:
good_corpus, good_dict = generate_corpus(good_reviews)

## Negative Reviews Only

In [100]:
bad_reviews = combined_data[combined_data.stars.isin([1,2])]

In [None]:
bad_corpus, bad_dict = generate_corpus(bad_reviews)

# References

[Good tutorial](https://radimrehurek.com/gensim/tut1.html#corpus-formats) on how to get Corpora in the right format.  

http://nbviewer.ipython.org/gist/langmore/6820351