In [423]:
import pickle
import re
import string
from collections import defaultdict

import pandas as pd
from gensim import corpora, models, similarities

In [542]:
import sys

# Raise the interpreter's recursion limit to 100000.
# NOTE(review): presumably needed so that pickling the large model objects further
# down does not fail with "maximum recursion depth exceeded" — confirm before removing.
sys.setrecursionlimit(100000)

In [424]:
def remove_punctuation(x):
    """Return ``x`` as a string with every ASCII punctuation character removed.

    Works on both Python 2 and Python 3. The Python 2 code path is unchanged.

    Args:
        x: Any object. It is converted with ``str()`` first; on Python 3 a
            ``bytes`` value is decoded as UTF-8 instead, because ``str(b'ale')``
            would yield the repr ``"b'ale'"``.

    Returns:
        str: The text with all characters in ``string.punctuation`` deleted.
    """
    if isinstance(x, bytes) and not isinstance(x, str):
        # Only reachable on Python 3 (on Python 2 ``bytes`` is ``str``).
        x = x.decode('utf-8')
    x = str(x)
    if hasattr(string, 'maketrans'):
        # Python 2: identity table plus the ``deletechars`` argument.
        return x.translate(string.maketrans('', ''), string.punctuation)
    # Python 3: ``string.maketrans`` no longer exists; build a deletion table.
    return x.translate(str.maketrans('', '', string.punctuation))

In [505]:
# Load the pickled beer-review DataFrame. Pickles execute code on load, so only
# read files produced by this project.
beers = pd.read_pickle('all_beer_reviews.pkl')

In [506]:
# Optional filter, currently disabled: keep only beers with at least 10 reviews.
#beers = beers[beers.num_reviews >= 10]
# Move the existing index into a column and renumber the rows 0..n-1, so that
# row labels can be used as positions into the per-beer lists built below.
beers = beers.reset_index()

In [507]:
# Display (rows, columns) of the table as a quick sanity check.
beers.shape

(20470, 20)

In [512]:
# One entry per row of `beers`, in row order: that row's `reviews` value.
documents = list(beers.reviews)

In [513]:
# Collapse each entry's pieces into a single space-separated string.
documents = [' '.join(review_parts) for review_parts in documents]

In [514]:
# Strip ASCII punctuation from every document.
documents = list(map(remove_punctuation, documents))

In [515]:
# Normalise case so that token matching is case-insensitive.
documents = [text.lower() for text in documents]

In [516]:
# Remove every ASCII digit from the review text. The pattern is compiled once
# and reused for all documents.
digit_pattern = re.compile(r'[0-9]')
documents = [digit_pattern.sub('', doc) for doc in documents]

In [517]:
# Hand-picked filler words to exclude from the review vocabulary.
stoplist = set(
    (
        'thanks an one little just has be up had no with is this it i but '
        'that on not very some as was like from its bit at more into there '
        'my pours for a of the and to in'
    ).split()
)

In [518]:
# Brewery acronyms are stop words too.
brewery_acronyms = set('fff rr br gi pte '.split())
stoplist = stoplist | brewery_acronyms

In [519]:
# Every word that appears in any brewery name (lower-cased, punctuation
# stripped) becomes a stop word.
brewery_words = set(
    remove_punctuation(name_part.encode('utf-8'))
    for brewery_name in beers.brewery_name
    for name_part in brewery_name.lower().split()
)
stoplist = stoplist.union(brewery_words)

In [520]:
# Add rare beer-name words to the stoplist (one-off names such as "stoopid",
# "heady" or "pliny"). A word counts as rare when it occurs fewer than
# BEER_NAME_WORD_MAX_COUNT times across all beer names.
BEER_NAME_WORD_MAX_COUNT = 9

# Count how often each cleaned word occurs across all beer names.
beer_name_frequency = defaultdict(int)
for beer in beers.name:
    for word in beer.lower().split():
        beer_name_frequency[remove_punctuation(word.encode('utf-8'))] += 1

# Keep only the rare words; the common ones stay in the vocabulary.
beer_name_words = set(
    word for word, count in beer_name_frequency.items()
    if count < BEER_NAME_WORD_MAX_COUNT
)
stoplist = stoplist.union(beer_name_words)

In [521]:
# Tokenise each document on whitespace, dropping stop words and any token of
# one or two characters.
texts = [
    [
        token
        for token in document.lower().split()
        if len(token) > 2 and token not in stoplist
    ]
    for document in documents
]

In [522]:
from collections import defaultdict

In [523]:
# Corpus-wide occurrence count of every token.
frequency = defaultdict(int)
for tokens in texts:
    for tok in tokens:
        frequency[tok] += 1

# Keep only tokens that occur more than 50 times across the whole corpus.
texts = [
    [tok for tok in tokens if frequency[tok] > 50]
    for tokens in texts
]

In [524]:
# Build the gensim token <-> integer-id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)

In [525]:
# Convert each token list to a sparse bag-of-words vector via the dictionary.
corpus = [dictionary.doc2bow(text) for text in texts]

In [526]:
# Fit a tf-idf weighting model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [527]:
# Apply the tf-idf weighting to the whole corpus.
corpus_tfidf = tfidf[corpus]

In [528]:
# Fit a 500-topic LSI (latent semantic indexing) model on the tf-idf corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=500)

In [529]:
# Build the similarity index over the LSI projection of every document.
# NOTE(review): the LSI model was trained on `corpus_tfidf`, but the raw
# bag-of-words `corpus` is projected here (the query cell below also projects a raw
# bag-of-words vector, so the two sides agree with each other). A model trained on
# tf-idf vectors is normally applied to tf-idf vectors (`lsi[corpus_tfidf]`).
# Changing this requires changing every query site, including whatever loads
# index.pkl, at the same time — confirm the intent.
index = similarities.MatrixSimilarity(lsi[corpus])

In [545]:
# Query: the beer name to find recommendations for.
text_input = 'Heady Topper'

In [546]:
# Fetch the review text of the requested beer. If no beer has exactly that name,
# fall back to using the input itself as the query text.
# beer_name_inputted doubles as the slice offset below: 1 skips the best hit
# (presumably the queried beer itself), 0 keeps it.
beer_name_inputted = 1
try:
    query_row = beers[beers.name == text_input].index[0]
    doc = documents[query_row]
except IndexError:
    print('Beer Name Not Inputted')
    doc = text_input
    beer_name_inputted = 0

# Project the query into LSI space and score it against every indexed beer.
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
sims = index[vec_lsi]

# Rank all beers by similarity, best first, and keep five of them.
ranked = sorted(enumerate(sims), key=lambda pair: -pair[1])
top_hits = ranked[beer_name_inputted:beer_name_inputted + 5]

similar_beers = []
for row, score in top_hits:
    similar_beers.append(row)
    print(beers.name.iloc[row] + '\t:\t%.2f' % (score * 100))
similar_beers = beers.iloc[similar_beers, :]

Abrasive Ale	:	96.61
Ephraim	:	96.26
Abner	:	96.07
Otter Creek Brewing / Lawson's Double Dose IPA	:	95.97
Stone Enjoy By IPA	:	95.97


In [547]:
# Top tf-idf terms of the query, and which of them each recommended beer shares.
NUM_KEYWORDS = 5

name_matches = beers[beers.name == text_input].index
if len(name_matches) > 0:
    input_tfidf = corpus_tfidf[name_matches[0]]
else:
    # Free-text query (no beer with this exact name): weight the query text
    # itself, so this cell no longer raises IndexError in that case.
    input_tfidf = tfidf[dictionary.doc2bow(text_input.lower().split())]

# Every dictionary token already passed the corpus-frequency filter, so no
# further frequency check is needed here.
input_beer_keywords = [
    dictionary[term_id]
    for term_id, _weight in sorted(input_tfidf, key=lambda x: -x[1])[:NUM_KEYWORDS]
]

# For each recommended beer: those of its own top terms that are also keywords
# of the query, in the recommended beer's ranking order.
similar_beer_words = [
    [
        dictionary[term_id]
        for term_id, _weight in sorted(corpus_tfidf[beer], key=lambda x: -x[1])[:NUM_KEYWORDS]
        if dictionary[term_id] in input_beer_keywords
    ]
    for beer in list(similar_beers.index)
]

In [548]:
# Show the query's keywords, then the shared keywords of each recommendation.
print(input_beer_keywords)
print(similar_beer_words)

[u'grapefruit', u'dipa', u'tropical', u'pineapple', u'mango']
[[u'dipa', u'tropical', u'grapefruit', u'pineapple'], [u'dipa', u'grapefruit', u'tropical', u'mango', u'pineapple'], [u'dipa', u'grapefruit', u'mango', u'tropical'], [u'dipa', u'tropical', u'grapefruit', u'mango'], [u'tropical', u'grapefruit', u'pineapple', u'mango']]


In [549]:
def _top_tfidf_terms(row_id, num_keywords):
    """Return the ``num_keywords`` highest tf-idf terms of corpus row ``row_id``."""
    ranked = sorted(corpus_tfidf[row_id], key=lambda x: -x[1])
    return [dictionary[term_id] for term_id, _weight in ranked[:num_keywords]]


def get_beer_keywords(text_input, similar_beer_ids=None, num_keywords=5):
    """Find a beer's top tf-idf keywords and the ones its recommendations share.

    Reads the module-level ``beers``, ``corpus_tfidf`` and ``dictionary``.

    Args:
        text_input: Exact beer name, matched against ``beers.name``.
        similar_beer_ids: Row ids of the recommended beers. Defaults to the
            index of the module-level ``similar_beers`` frame, which is what
            the original implementation always used.
        num_keywords: How many top-weighted terms to consider per beer.

    Returns:
        tuple: ``(input_beer_keywords, similar_beer_words)``. The first is the
        list of the query beer's top terms. The second holds, for each
        recommended beer, those of its own top terms that also appear in the
        first list.

    Raises:
        IndexError: If no beer is named ``text_input``.
    """
    matches = beers[beers.name == text_input].index
    if len(matches) == 0:
        # Same exception type as the original lookup, with a readable message.
        raise IndexError('no beer named %r' % (text_input,))
    if similar_beer_ids is None:
        similar_beer_ids = list(similar_beers.index)

    input_beer_keywords = _top_tfidf_terms(matches[0], num_keywords)
    similar_beer_words = [
        [term for term in _top_tfidf_terms(beer_id, num_keywords)
         if term in input_beer_keywords]
        for beer_id in similar_beer_ids
    ]
    return (input_beer_keywords, similar_beer_words)

# Display the keyword summary for the current query.
get_beer_keywords(text_input)


([u'grapefruit', u'dipa', u'tropical', u'pineapple', u'mango'],
 [[u'dipa', u'tropical', u'grapefruit', u'pineapple'],
  [u'dipa', u'grapefruit', u'tropical', u'mango', u'pineapple'],
  [u'dipa', u'grapefruit', u'mango', u'tropical'],
  [u'dipa', u'tropical', u'grapefruit', u'mango'],
  [u'tropical', u'grapefruit', u'pineapple', u'mango']])

In [535]:
# Persist the cleaned review texts. The `with` block closes the file as soon
# as the dump finishes.
with open('flask/app/models/documents.pkl', 'wb') as fh:
    pickle.dump(documents, fh)

In [536]:
# Persist the token <-> id dictionary. The `with` block closes the file as soon
# as the dump finishes.
with open('flask/app/models/dictionary.pkl', 'wb') as fh:
    pickle.dump(dictionary, fh)

In [537]:
# Persist the fitted LSI model. The `with` block closes the file as soon as the
# dump finishes.
with open('flask/app/models/lsi.pkl', 'wb') as fh:
    pickle.dump(lsi, fh)

In [538]:
# Persist the bag-of-words corpus. The `with` block closes the file as soon as
# the dump finishes.
with open('flask/app/models/corpus.pkl', 'wb') as fh:
    pickle.dump(corpus, fh)

In [539]:
# Persist the similarity index. The `with` block closes the file as soon as the
# dump finishes.
with open('flask/app/models/index.pkl', 'wb') as fh:
    pickle.dump(index, fh)

In [543]:
# Save the re-indexed review DataFrame alongside the model artifacts.
beers.to_pickle('flask/app/models/beer_review_df.pkl')

In [544]:
# Persist the tf-idf weighted corpus. The `with` block closes the file as soon
# as the dump finishes.
with open('flask/app/models/tfidf.pkl', 'wb') as fh:
    pickle.dump(corpus_tfidf, fh)

In [457]:
from sklearn.cluster import KMeans

In [458]:
import textblob
# Experimental section: a lemmatizing tokenizer (Ken's lemmatizer).

# Tokens treated as punctuation/noise by tokenizeText: every ASCII punctuation
# character plus a few multi-character runs and quote variants.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'ve", '..']
# Stop words for tokenizeText: a library English list, extra contractions and
# filler tokens, and a second STOPWORDS collection.
# NOTE(review): `stopwords` and `STOPWORDS` are never imported or defined in the
# visible notebook, so this line raises NameError (see the cell output).
# `stopwords.words('english')` looks like NLTK's stopword corpus — confirm the
# intended source of both names and import them at the top of the notebook.
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca", "’m", '..', "'re", 'n/a', '%hesitation', 'nnj', 'dnmt', 'think', 'yeah'] + list(STOPWORDS)) 

def tokenizeText(sample):
    """Tokenize ``sample`` into lowercase lemmas, dropping noise tokens.

    Relies on three module-level names: ``parser`` (called on the raw text and
    expected to yield tokens exposing ``lemma_`` and ``lower_``), ``STOPLIST``
    and ``SYMBOLS``.
    NOTE(review): ``parser`` is not defined in the visible notebook. The
    original comment says the tokens come from spaCy — confirm and create it
    before calling this function.

    Args:
        sample: The text to tokenize.

    Returns:
        list: The lemmas in document order, excluding stop words, symbols and
        empty or whitespace-only tokens.
    """
    blank_tokens = ("", " ", "\n", "\n\n")
    kept = []
    for tok in parser(sample):
        # "-PRON-" is a placeholder lemma, not a word: use the lowercased
        # surface form of the token instead.
        if tok.lemma_ != "-PRON-":
            lemma = tok.lemma_.lower().strip()
        else:
            lemma = tok.lower_
        # Single pass instead of repeated list.remove() scans.
        if lemma in STOPLIST or lemma in SYMBOLS or lemma in blank_tokens:
            continue
        kept.append(lemma)
    return kept

# Tf-idf vectorizer that tokenizes with tokenizeText and keeps at most 1000 features.
# NOTE(review): `TfidfVectorizer` is not imported in the visible notebook (scikit-learn
# provides it in sklearn.feature_extraction.text) — TODO add the import before running.
vectorizer = TfidfVectorizer(tokenizer=tokenizeText, max_features= 1000, use_idf=True)

NameError: name 'stopwords' is not defined