## Init ##

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Once mounted, Drive files can be opened like ordinary files using paths of the form
# /content/drive/My Drive/location_of_the_file


Mounted at /content/drive


In [None]:
import pandas as pd

# Root folder of the project inside the mounted Google Drive.
DRIVE_INITIAL_PATH = r"/content/drive/My Drive/Data Mining"

def adapt_path_to_drive(original_path):
  """Prefix a project-relative path with the Drive project root.

  Args:
    original_path: path string relative to DRIVE_INITIAL_PATH.

  Returns:
    DRIVE_INITIAL_PATH and original_path joined by a single "/".
  """
  return DRIVE_INITIAL_PATH + "/" + original_path



# Locations of the two CSV files, resolved against the Drive project root.
OUR_PP_RECIPES_PATH = adapt_path_to_drive(r"data/our_pp_recipes.csv")
OUR_PP_INTERACTIONS_PATH = adapt_path_to_drive(r"data/our_pp_interactions.csv")

# Load both CSV files into the module-level DataFrames used by the later cells.
PP_INTERACTIONS_DF = pd.read_csv(OUR_PP_INTERACTIONS_PATH)
PP_RECIPES_DF = pd.read_csv(OUR_PP_RECIPES_PATH)



In [None]:
# Lower bounds (inclusive) of the consecutive review periods; the last period
# is closed by end_point (exclusive).
date_ranges_start_points = ['2000-01-01','2003-03-01', '2006-05-01', '2009-07-01', '2012-09-01','2015-11-01']
end_point = '2019-01-01'

def get_review_divides(recipe_filtered_df):
  """Concatenate the review texts of the given recipes, one string per period.

  The recipes are inner-joined with PP_INTERACTIONS_DF on "recipe_id", the
  joined rows are sorted by "date", and the rows are split into the periods
  [start_i, start_{i+1}) defined by date_ranges_start_points, with end_point
  closing the last period.

  Args:
    recipe_filtered_df: DataFrame of recipes holding a "recipe_id" column.

  Returns:
    A list with one string per entry of date_ranges_start_points; each string
    is the period's "review" values joined by single spaces.
  """
  interactions_by_date = recipe_filtered_df.merge(
      PP_INTERACTIONS_DF, how="inner", on="recipe_id"
  ).sort_values("date")
  review_dates = interactions_by_date["date"]

  # Pair every start point with its exclusive upper bound.
  # NOTE(review): the comparisons are done against date strings, so "date" is
  # presumably stored as ISO-formatted text -- confirm.
  upper_bounds = date_ranges_start_points[1:] + [end_point]

  joined_reviews = []
  for period_start, period_end in zip(date_ranges_start_points, upper_bounds):
    in_period = (review_dates >= period_start) & (review_dates < period_end)
    period_reviews = interactions_by_date.loc[in_period, "review"]
    joined_reviews.append(period_reviews.str.cat(sep=' '))
  return joined_reviews





In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from collections import Counter
# Fetch the NLTK resources this notebook relies on: tokenizer models,
# the stop-word corpus and the POS tagger.
for nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
  nltk.download(nltk_resource)

# Lower-cased English stop words, de-duplicated through a set.
STOP_WORDS = list(map(str.lower, set(stopwords.words('english'))))

# Punctuation marks and markup leftovers that should never count as words.
PUNCTUATION_TO_REMOVE = [
    ",", ".", ";", "!", '"', "?",
    '“', '”', "‘", '’',
    "[", "]", "{", "}", ')', '(',
    '...', "n't", "'s", "''", "-",
    ':', '``', 'br', 'gt',
]

# Tokens that carry no meaning for this analysis (HTML residue, fillers, contraction tails).
NO_MEANING_TO_REMOVE = [
    "lt", "quot", "next", "thanks", "thank", "also",
    "use", "used", "'ve", "add", "added", "br/",
    "039", "zwt8", "ca", "'m", "'ll", "--",
]

# Reminder: nltk.bigrams(tokens) produces the bigrams of a token sequence.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
def tokenizer(data):
  """Split text into lower-cased word tokens with noise tokens removed.

  A token is dropped when it is a single character, or when its lower-cased
  form appears in STOP_WORDS, PUNCTUATION_TO_REMOVE or NO_MEANING_TO_REMOVE.

  Args:
    data: the text to tokenize.

  Returns:
    List of the surviving tokens, lower-cased, in their original order.
  """
  # A set gives constant-time membership tests instead of scanning a list per token.
  excluded = set(STOP_WORDS) | set(PUNCTUATION_TO_REMOVE) | set(NO_MEANING_TO_REMOVE)
  # The exclusion lists are lower-case, so compare the lower-cased token;
  # otherwise capitalised stop words such as "The" pass the filter and are
  # then lower-cased into the output.
  tokens = [
      raw.lower()
      for raw in word_tokenize(data)
      if len(raw) > 1 and raw.lower() not in excluded
  ]
  print("done tokenizing")
  return tokens


In [None]:
def filter_adjectives_adverbs_only(cleaned_tokens):
  """Keep only the tokens that pos_tag labels as adjectives or adverbs.

  Tagging is done on the token sequence exactly as it is passed in.

  Args:
    cleaned_tokens: sequence of word tokens.

  Returns:
    List of the tokens tagged JJ, JJR, JJS, RB, RBR or RBS, in input order.
  """
  wanted_tags = {'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
  return [word for word, tag in pos_tag(cleaned_tokens) if tag in wanted_tags]

## Check Params ##

### Check vegetarian ###

In [None]:
# Keep the recipes selected by the "is_vegetarian_tags" column
# (presumably a boolean flag column -- confirm against the CSV).
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["is_vegetarian_tags"]]
review_divides = get_review_divides(filtered_df)
print("hi")
# get_review_divides returns one text per entry of date_ranges_start_points
# (six as configured), so a fixed five-name unpack raises ValueError. The first
# five periods keep the names the following cells read; any later period is
# kept in tokens_later_periods instead of crashing the cell.
tokens1, tokens2, tokens3, tokens4, tokens5, *tokens_later_periods = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: the 50 most frequent tokens of each period, one list per line.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common()[:50])

[('recipe', 5061), ('make', 2792), ('made', 2735), ('good', 2699), ('great', 2649), ('easy', 1941), ('really', 1743), ('time', 1678), ('like', 1655), ('one', 1498), ('would', 1395), ('delicious', 1251), ('dish', 1190), ('little', 1190), ('much', 1144), ('wonderful', 1137), ('taste', 1092), ('loved', 1027), ('well', 972), ('instead', 944), ('flavor', 943), ('nice', 942), ('try', 879), ('think', 861), ('fresh', 851), ('love', 841), ('garlic', 809), ('cheese', 804), ('bit', 802), ('bread', 777), ('sauce', 755), ('served', 750), ('way', 746), ('even', 730), ('could', 712), ('making', 701), ('tasty', 690), ('salad', 683), ('butter', 617), ('sugar', 607), ('sharing', 591), ('family', 577), ('enjoyed', 554), ('yummy', 546), ('perfect', 546), ('minutes', 542), ('sweet', 541), ('put', 540), ('excellent', 529), ('thought', 527)]
[('recipe', 30044), ('made', 18497), ('make', 16508), ('great', 16197), ('good', 14918), ('time', 10648), ('easy', 10598), ('really', 10259), ('like', 9682), ('would', 8

In [None]:
# Bigrams: the 50 most frequent adjacent token pairs of each period.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common()[:50])

[(('easy', 'make'), 604), (('great', 'recipe'), 505), (('really', 'good'), 255), (('side', 'dish'), 247), (('olive', 'oil'), 176), (('recipe', 'made'), 175), (('really', 'enjoyed'), 170), (('followed', 'recipe'), 165), (('quick', 'easy'), 161), (('last', 'night'), 159), (('peanut', 'butter'), 158), (('sour', 'cream'), 131), (('good', 'recipe'), 129), (('first', 'time'), 129), (('sharing', 'recipe'), 117), (('easy', 'prepare'), 114), (('lemon', 'juice'), 113), (('made', 'recipe'), 112), (('red', 'pepper'), 111), (('recipe', 'great'), 111), (('1/2', 'cup'), 107), (('recipe', 'easy'), 107), (('ice', 'cream'), 104), (('even', 'better'), 104), (('good', 'easy'), 103), (('definitely', 'make'), 103), (('cream', 'cheese'), 98), (('1/4', 'cup'), 98), (('brown', 'sugar'), 96), (('wonderful', 'recipe'), 96), (('easy', 'recipe'), 92), (('would', 'make'), 92), (('recipe', 'exactly'), 92), (('think', 'would'), 91), (('halved', 'recipe'), 91), (('turned', 'great'), 90), (('recipe', 'good'), 88), (('r

In [None]:
# Trigrams: the 50 most frequent runs of three adjacent tokens of each period.
# nltk.trigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common()[:50])

[(('followed', 'recipe', 'exactly'), 68), (('made', 'last', 'night'), 52), (('red', 'pepper', 'flakes'), 50), (('cut', 'recipe', 'half'), 44), (('easy', 'put', 'together'), 42), (('another', 'great', 'recipe'), 41), (('vanilla', 'ice', 'cream'), 35), (('good', 'easy', 'make'), 33), (('recipe', 'easy', 'make'), 30), (('great', 'side', 'dish'), 30), (('red', 'chilli', 'powder'), 28), (('delicious', 'easy', 'make'), 26), (('whole', 'wheat', 'flour'), 26), (('quick', 'easy', 'make'), 25), (('first', 'time', 'made'), 25), (('easy', 'make', 'great'), 23), (('yum', 'yum', 'yum'), 22), (('followed', 'directions', 'exactly'), 20), (('great', 'recipe', 'made'), 19), (('dinner', 'last', 'night'), 19), (('ground', 'black', 'pepper'), 19), (('would', 'change', 'thing'), 19), (('great', 'recipe', 'great'), 19), (('tsp', 'red', 'chilli'), 18), (('really', 'really', 'good'), 18), (('whole', 'family', 'loved'), 18), (('family', 'really', 'enjoyed'), 17), (('sharing', 'great', 'recipe'), 17), (('easy', 

In [None]:
# Adjectives and adverbs only: reduce each period's tokens to those two
# word classes, then print the 50 most frequent ones per period.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only,
    (tokens1, tokens2, tokens3, tokens4, tokens5),
)

# unigrams
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_adj_adv).most_common()[:50])

[('great', 2649), ('good', 2598), ('easy', 1867), ('really', 1743), ('recipe', 1696), ('delicious', 1233), ('little', 1166), ('much', 1141), ('wonderful', 1045), ('well', 961), ('instead', 944), ('nice', 914), ('fresh', 816), ('dish', 730), ('even', 730), ('garlic', 672), ('tasty', 528), ('cheese', 521), ('first', 509), ('better', 480), ('excellent', 451), ('still', 444), ('definitely', 420), ('sweet', 413), ('green', 412), ('right', 387), ('hot', 384), ('whole', 378), ('perfect', 377), ('enough', 369), ('red', 368), ('together', 366), ('never', 361), ('last', 361), ('best', 355), ('quick', 355), ('quite', 349), ('different', 347), ('simple', 338), ('sure', 328), ('always', 313), ('top', 306), ('oven', 302), ('ever', 281), ('exactly', 277), ('salad', 267), ('extra', 261), ('fantastic', 258), ('big', 258), ('pretty', 254)]
[('great', 16197), ('good', 14390), ('really', 10259), ('easy', 10158), ('recipe', 10083), ('delicious', 7575), ('little', 7444), ('much', 6970), ('nice', 5651), ('in

In [None]:
# Adjectives and adverbs only -- bigrams: the 50 most frequent adjacent pairs
# within each period's adjective/adverb tokens.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
(bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
 bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
                       bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common()[:50])

[(('great', 'recipe'), 287), (('really', 'good'), 271), (('quick', 'easy'), 149), (('good', 'easy'), 138), (('great', 'easy'), 107), (('even', 'better'), 102), (('easy', 'delicious'), 101), (('easy', 'great'), 99), (('best', 'ever'), 92), (('recipe', 'great'), 91), (('easy', 'good'), 85), (('good', 'really'), 82), (('good', 'great'), 81), (('much', 'better'), 80), (('delicious', 'easy'), 79), (('pretty', 'good'), 78), (('absolutely', 'delicious'), 75), (('great', 'great'), 73), (('great', 'good'), 69), (('recipe', 'easy'), 69), (('easy', 'together'), 69), (('great', 'really'), 69), (('really', 'great'), 68), (('recipe', 'good'), 66), (('easy', 'tasty'), 66), (('really', 'nice'), 66), (('side', 'dish'), 64), (('good', 'good'), 62), (('good', 'little'), 58), (('recipe', 'exactly'), 58), (('really', 'really'), 56), (('really', 'easy'), 56), (('good', 'recipe'), 55), (('good', 'nice'), 54), (('delicious', 'great'), 53), (('much', 'great'), 52), (('great', 'dish'), 51), (('easy', 'recipe'),

### Check vegan ###

In [None]:
# Keep the recipes selected by the "is_vegan_tag" column
# (presumably a boolean flag column -- confirm against the CSV).
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["is_vegan_tag"]]
review_divides = get_review_divides(filtered_df)
print("hi")
# get_review_divides returns one text per entry of date_ranges_start_points
# (six as configured), so a fixed five-name unpack raises ValueError. The first
# five periods keep the names the following cells read; any later period is
# kept in tokens_later_periods instead of crashing the cell.
tokens1, tokens2, tokens3, tokens4, tokens5, *tokens_later_periods = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: the 50 most frequent tokens of each period, one list per line.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common()[:50])

[('recipe', 1199), ('made', 676), ('good', 673), ('great', 666), ('make', 627), ('really', 491), ('like', 437), ('easy', 396), ('time', 392), ('one', 367), ('little', 323), ('would', 318), ('wonderful', 314), ('taste', 313), ('dish', 311), ('much', 310), ('nice', 290), ('delicious', 285), ('rice', 282), ('fresh', 278), ('well', 267), ('flavor', 262), ('garlic', 253), ('instead', 252), ('sauce', 239), ('love', 238), ('way', 235), ('bread', 234), ('bit', 225), ('try', 220), ('loved', 215), ('think', 212), ('salad', 205), ('pepper', 203), ('served', 197), ('oil', 195), ('even', 184), ('tasty', 183), ('tomatoes', 179), ('could', 178), ('salt', 173), ('water', 172), ('making', 163), ('enjoyed', 161), ('excellent', 154), ('lemon', 153), ('1/2', 151), ('perfect', 148), ('first', 147), ('sugar', 146)]
[('recipe', 7052), ('made', 4268), ('great', 4041), ('make', 3703), ('good', 3579), ('really', 2550), ('time', 2509), ('like', 2459), ('easy', 2452), ('would', 2134), ('one', 1888), ('delicious',

In [None]:
# Bigrams: the 50 most frequent adjacent token pairs of each period.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common()[:50])

[(('easy', 'make'), 112), (('great', 'recipe'), 93), (('side', 'dish'), 76), (('really', 'good'), 69), (('olive', 'oil'), 59), (('peanut', 'butter'), 55), (('last', 'night'), 50), (('red', 'pepper'), 47), (('recipe', 'made'), 45), (('lemon', 'juice'), 42), (('really', 'enjoyed'), 37), (('1/2', 'tsp'), 35), (('first', 'time'), 35), (('followed', 'recipe'), 35), (('quick', 'easy'), 32), (('halved', 'recipe'), 31), (('good', 'recipe'), 30), (('1/4', 'cup'), 29), (('great', 'way'), 29), (('easy', 'prepare'), 28), (('turned', 'great'), 28), (('bread', 'machine'), 28), (('even', 'better'), 27), (('1/2', 'cup'), 26), (('yum', 'yum'), 26), (('definitely', 'make'), 26), (('much', 'better'), 26), (('recipe', 'good'), 25), (('whole', 'wheat'), 25), (('recipe', 'easy'), 25), (('absolutely', 'delicious'), 24), (('pepper', 'flakes'), 24), (('nice', 'change'), 24), (('little', 'bit'), 24), (('everyone', 'loved'), 23), (('really', 'liked'), 23), (('salt', 'pepper'), 23), (('taste', 'good'), 23), (('th

In [None]:
# Trigrams: the 50 most frequent runs of three adjacent tokens of each period.
# nltk.trigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common()[:50])

[(('red', 'pepper', 'flakes'), 19), (('made', 'last', 'night'), 18), (('red', 'chilli', 'powder'), 18), (('cut', 'recipe', 'half'), 15), (('followed', 'recipe', 'exactly'), 15), (('yum', 'yum', 'yum'), 13), (('tsp', 'red', 'chilli'), 11), (('dinner', 'last', 'night'), 10), (('whole', 'wheat', 'flour'), 9), (('easy', 'put', 'together'), 9), (('halved', 'recipe', 'made'), 9), (('still', 'turned', 'great'), 8), (('another', 'great', 'recipe'), 8), (('ground', 'black', 'pepper'), 8), (('served', 'side', 'dish'), 8), (('made', 'lunch', 'afternoon'), 8), (('crushed', 'red', 'pepper'), 8), (('delicious', 'easy', 'make'), 7), (('red', 'bell', 'pepper'), 7), (('first', 'time', 'made'), 7), (('followed', 'directions', 'exactly'), 7), (('1/2', 'tsp', 'salt'), 6), (('made', 'side', 'dish'), 6), (('fresh', 'lemon', 'juice'), 6), (('much', 'posting', 'recipe'), 6), (('hot', 'summer', 'day'), 5), (('easy', 'make', 'great'), 5), (('always', 'looking', 'new'), 5), (('tsp', 'turmeric', 'powder'), 5), ((

In [None]:
# Adjectives and adverbs only: reduce each period's tokens to those two
# word classes, then print the 50 most frequent ones per period.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only,
    (tokens1, tokens2, tokens3, tokens4, tokens5),
)

# unigrams
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_adj_adv).most_common()[:50])

[('great', 666), ('good', 652), ('really', 491), ('easy', 378), ('recipe', 376), ('little', 317), ('much', 310), ('wonderful', 288), ('delicious', 284), ('nice', 279), ('fresh', 269), ('well', 267), ('instead', 252), ('garlic', 209), ('even', 184), ('dish', 172), ('tasty', 147), ('first', 147), ('hot', 136), ('red', 128), ('excellent', 125), ('green', 114), ('whole', 113), ('better', 111), ('perfect', 109), ('sweet', 107), ('still', 107), ('quite', 104), ('right', 100), ('definitely', 100), ('never', 99), ('sure', 92), ('best', 92), ('last', 91), ('always', 86), ('enough', 83), ('salad', 80), ('extra', 79), ('quick', 79), ('together', 78), ('simple', 78), ('lemon', 74), ('fantastic', 71), ('new', 71), ('healthy', 71), ('often', 70), ('oven', 69), ('olive', 69), ('different', 69), ('regular', 68)]
[('great', 4041), ('good', 3427), ('really', 2550), ('easy', 2371), ('recipe', 2305), ('delicious', 1743), ('little', 1712), ('much', 1707), ('nice', 1521), ('well', 1392), ('fresh', 1333), ('

In [None]:
# Adjectives and adverbs only -- bigrams: the 50 most frequent adjacent pairs
# within each period's adjective/adverb tokens.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
(bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
 bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
                       bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common()[:50])

[(('really', 'good'), 73), (('great', 'recipe'), 46), (('great', 'easy'), 32), (('even', 'better'), 28), (('quick', 'easy'), 27), (('good', 'easy'), 26), (('much', 'better'), 26), (('good', 'really'), 25), (('easy', 'great'), 25), (('absolutely', 'delicious'), 24), (('good', 'nice'), 24), (('really', 'nice'), 23), (('really', 'great'), 21), (('good', 'great'), 20), (('side', 'dish'), 20), (('really', 'well'), 20), (('great', 'good'), 19), (('best', 'ever'), 18), (('easy', 'tasty'), 17), (('pretty', 'good'), 17), (('little', 'less'), 16), (('great', 'little'), 16), (('easy', 'delicious'), 15), (('much', 'good'), 15), (('recipe', 'great'), 15), (('really', 'really'), 15), (('recipe', 'really'), 15), (('easy', 'together'), 15), (('great', 'great'), 15), (('great', 'really'), 15), (('good', 'much'), 14), (('delicious', 'really'), 14), (('good', 'little'), 14), (('recipe', 'good'), 13), (('quite', 'tasty'), 13), (('really', 'tasty'), 13), (('nice', 'great'), 13), (('easy', 'good'), 13), (('

### Check recipes taking up to 20 minutes ###

In [None]:
# Keep the recipes whose "minutes" value is at most 20.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["minutes"] <= 20]
review_divides = get_review_divides(filtered_df)
print("hi")
# get_review_divides returns one text per entry of date_ranges_start_points
# (six as configured), so a fixed five-name unpack raises ValueError. The first
# five periods keep the names the following cells read; any later period is
# kept in tokens_later_periods instead of crashing the cell.
tokens1, tokens2, tokens3, tokens4, tokens5, *tokens_later_periods = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: the 50 most frequent tokens of each period, one list per line.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common()[:50])

[('recipe', 6393), ('make', 3892), ('good', 3788), ('made', 3695), ('great', 3681), ('easy', 3022), ('really', 2148), ('like', 2096), ('one', 1962), ('time', 1954), ('would', 1721), ('delicious', 1616), ('little', 1612), ('taste', 1456), ('much', 1439), ('loved', 1420), ('wonderful', 1415), ('sauce', 1378), ('flavor', 1350), ('salad', 1306), ('nice', 1222), ('love', 1192), ('try', 1155), ('well', 1148), ('served', 1103), ('instead', 1066), ('cheese', 1034), ('think', 1030), ('fresh', 994), ('garlic', 962), ('dish', 937), ('could', 924), ('tasty', 901), ('bit', 892), ('chicken', 878), ('making', 863), ('way', 863), ('sharing', 855), ('cream', 833), ('sugar', 809), ('dressing', 807), ('butter', 806), ('even', 806), ('put', 797), ('quick', 782), ('yummy', 773), ('excellent', 729), ('simple', 729), ('thought', 698), ('sweet', 697)]
[('recipe', 41919), ('made', 27490), ('great', 24369), ('make', 23826), ('good', 22660), ('easy', 17678), ('really', 14333), ('like', 14143), ('time', 13272), (

In [None]:
# Bigrams: the 50 most frequent adjacent token pairs of each period.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common()[:50])

[(('easy', 'make'), 897), (('great', 'recipe'), 638), (('really', 'good'), 363), (('quick', 'easy'), 340), (('peanut', 'butter'), 254), (('last', 'night'), 230), (('really', 'enjoyed'), 217), (('recipe', 'made'), 214), (('sour', 'cream'), 205), (('side', 'dish'), 203), (('cream', 'cheese'), 201), (('olive', 'oil'), 196), (('good', 'recipe'), 177), (('easy', 'prepare'), 173), (('sharing', 'recipe'), 170), (('lemon', 'juice'), 167), (('followed', 'recipe'), 166), (('recipe', 'great'), 158), (('recipe', 'easy'), 156), (('ice', 'cream'), 154), (('first', 'time'), 151), (('everyone', 'loved'), 143), (('easy', 'recipe'), 137), (('really', 'liked'), 137), (('put', 'together'), 132), (('1/2', 'cup'), 130), (('made', 'recipe'), 130), (('definitely', 'make'), 129), (('wonderful', 'recipe'), 126), (('good', 'easy'), 125), (('time', 'make'), 124), (('would', 'make'), 116), (('much', 'better'), 115), (('big', 'hit'), 114), (('think', 'would'), 113), (('simple', 'make'), 112), (('red', 'pepper'), 11

In [None]:
# Trigrams: the 50 most frequent runs of three adjacent tokens of each period.
# nltk.trigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common()[:50])

[(('made', 'last', 'night'), 71), (('followed', 'recipe', 'exactly'), 71), (('easy', 'put', 'together'), 65), (('recipe', 'easy', 'make'), 61), (('quick', 'easy', 'make'), 58), (('another', 'great', 'recipe'), 57), (('red', 'pepper', 'flakes'), 50), (('cut', 'recipe', 'half'), 43), (('good', 'easy', 'make'), 42), (('would', 'change', 'thing'), 38), (('first', 'time', 'made'), 33), (('delicious', 'easy', 'make'), 32), (('great', 'recipe', 'easy'), 32), (('dinner', 'last', 'night'), 31), (('easy', 'make', 'great'), 29), (('easy', 'make', 'delicious'), 29), (('whole', 'family', 'loved'), 29), (('vanilla', 'ice', 'cream'), 29), (('great', 'recipe', 'made'), 28), (('easy', 'make', 'good'), 28), (('really', 'easy', 'make'), 27), (('sharing', 'great', 'recipe'), 25), (('easy', 'make', 'tastes'), 24), (('recipe', 'quick', 'easy'), 24), (('red', 'wine', 'vinegar'), 24), (('followed', 'directions', 'exactly'), 23), (('hot', 'summer', 'day'), 22), (('great', 'side', 'dish'), 22), (('recipe', 'gre

In [None]:
# Adjectives and adverbs only: reduce each period's tokens to those two
# word classes, then print the 50 most frequent ones per period.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only,
    (tokens1, tokens2, tokens3, tokens4, tokens5),
)

# unigrams
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_adj_adv).most_common()[:50])

[('great', 3681), ('good', 3670), ('easy', 2915), ('really', 2148), ('recipe', 2110), ('delicious', 1593), ('little', 1585), ('much', 1437), ('wonderful', 1279), ('nice', 1188), ('well', 1134), ('instead', 1066), ('fresh', 958), ('even', 806), ('garlic', 795), ('quick', 716), ('tasty', 685), ('cheese', 630), ('first', 623), ('better', 618), ('hot', 595), ('excellent', 580), ('simple', 566), ('dish', 562), ('sweet', 557), ('best', 538), ('still', 531), ('definitely', 528), ('right', 512), ('last', 498), ('green', 496), ('salad', 482), ('together', 481), ('perfect', 478), ('sure', 471), ('never', 467), ('always', 464), ('different', 445), ('enough', 441), ('whole', 430), ('red', 424), ('ever', 383), ('quite', 378), ('big', 363), ('new', 340), ('real', 339), ('exactly', 332), ('top', 321), ('yummy', 317), ('extra', 317)]
[('great', 24369), ('good', 21793), ('easy', 16957), ('recipe', 14434), ('really', 14333), ('delicious', 10559), ('little', 10553), ('much', 10005), ('nice', 8268), ('won

In [None]:
# Adjectives and adverbs only -- bigrams: the 50 most frequent adjacent pairs
# within each period's adjective/adverb tokens.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
(bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
 bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3,
                       bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common()[:50])

[(('really', 'good'), 396), (('great', 'recipe'), 366), (('quick', 'easy'), 307), (('good', 'easy'), 214), (('great', 'easy'), 174), (('easy', 'good'), 151), (('easy', 'great'), 145), (('easy', 'delicious'), 144), (('good', 'great'), 142), (('best', 'ever'), 131), (('good', 'really'), 128), (('good', 'good'), 125), (('much', 'better'), 120), (('great', 'great'), 116), (('recipe', 'great'), 114), (('delicious', 'easy'), 110), (('easy', 'quick'), 110), (('great', 'good'), 107), (('easy', 'tasty'), 107), (('pretty', 'good'), 102), (('easy', 'together'), 102), (('even', 'better'), 96), (('recipe', 'easy'), 93), (('really', 'easy'), 85), (('really', 'great'), 85), (('wonderful', 'easy'), 83), (('absolutely', 'delicious'), 81), (('recipe', 'good'), 79), (('good', 'nice'), 76), (('good', 'recipe'), 75), (('good', 'little'), 74), (('easy', 'recipe'), 74), (('really', 'nice'), 73), (('easy', 'really'), 71), (('recipe', 'exactly'), 68), (('fast', 'easy'), 67), (('much', 'great'), 65), (('fat', '

### Check recipes taking 20 to 40 minutes ###

In [None]:
# Keep the recipes whose "minutes" value lies in [20, 40], both ends included.
# NOTE(review): recipes with exactly 20 minutes also match the earlier
# `minutes <= 20` selection, so the two buckets overlap at 20 -- confirm
# whether this lower bound should be strict.
filtered_df = PP_RECIPES_DF.loc[(PP_RECIPES_DF["minutes"] >= 20) & (PP_RECIPES_DF["minutes"] <= 40)]
review_divides = get_review_divides(filtered_df)
print("hi")
# get_review_divides returns one text per entry of date_ranges_start_points
# (six as configured), so a fixed five-name unpack raises ValueError. The first
# five periods keep the names the following cells read; any later period is
# kept in tokens_later_periods instead of crashing the cell.
tokens1, tokens2, tokens3, tokens4, tokens5, *tokens_later_periods = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: the 50 most frequent tokens of each period, one list per line.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common()[:50])

[('recipe', 10591), ('make', 5915), ('good', 5508), ('made', 5440), ('great', 5352), ('easy', 4818), ('time', 3315), ('really', 3239), ('like', 2950), ('one', 2920), ('would', 2645), ('sauce', 2636), ('little', 2525), ('delicious', 2519), ('loved', 2402), ('dish', 2317), ('wonderful', 2262), ('flavor', 2131), ('much', 2013), ('taste', 1994), ('chicken', 1924), ('instead', 1793), ('nice', 1770), ('served', 1755), ('well', 1731), ('try', 1625), ('think', 1620), ('tasty', 1607), ('cheese', 1586), ('garlic', 1540), ('family', 1476), ('love', 1472), ('bit', 1459), ('could', 1382), ('fresh', 1377), ('making', 1344), ('even', 1298), ('salad', 1236), ('sharing', 1222), ('way', 1217), ('quick', 1202), ('minutes', 1192), ('excellent', 1162), ('butter', 1149), ('cream', 1143), ('husband', 1123), ('enjoyed', 1119), ('put', 1084), ('pepper', 1069), ('dinner', 1057)]
[('recipe', 69186), ('made', 40305), ('make', 37667), ('great', 35296), ('good', 33380), ('easy', 27896), ('time', 22687), ('really', 

In [None]:
# Bigrams: the 50 most frequent adjacent token pairs of each period.
# nltk.bigrams hands back an iterator, so each name below is consumed by the
# Counter that tallies it.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common()[:50])

[(('easy', 'make'), 1458), (('great', 'recipe'), 1055), (('really', 'good'), 523), (('quick', 'easy'), 520), (('followed', 'recipe'), 382), (('really', 'enjoyed'), 379), (('recipe', 'made'), 346), (('last', 'night'), 342), (('sour', 'cream'), 333), (('side', 'dish'), 321), (('easy', 'prepare'), 315), (('good', 'recipe'), 306), (('olive', 'oil'), 267), (('good', 'easy'), 267), (('recipe', 'easy'), 255), (('made', 'recipe'), 253), (('definitely', 'make'), 249), (('family', 'loved'), 247), (('easy', 'recipe'), 247), (('recipe', 'exactly'), 245), (('sharing', 'recipe'), 241), (('1/2', 'cup'), 232), (('recipe', 'great'), 224), (('cream', 'cheese'), 224), (('really', 'liked'), 223), (('wonderful', 'recipe'), 221), (('red', 'pepper'), 217), (('chocolate', 'chips'), 210), (('everyone', 'loved'), 201), (('first', 'time'), 196), (('peanut', 'butter'), 196), (('put', 'together'), 190), (('green', 'beans'), 189), (('easy', 'tasty'), 185), (('turned', 'great'), 185), (('brown', 'sugar'), 182), (('e

In [None]:
# Trigrams: group every run of three consecutive tokens within each review period.
# nltk.trigrams returns an iterator (per the NLTK docs), so each trigram_tokensN
# name is used up by the counting step below and cannot be counted a second time.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = [
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent trigrams of each period, one list per period.
# most_common(50) yields the same entries as most_common()[:50] without fully
# sorting every distinct trigram first.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 179), (('made', 'last', 'night'), 112), (('easy', 'put', 'together'), 95), (('red', 'pepper', 'flakes'), 93), (('good', 'easy', 'make'), 88), (('cut', 'recipe', 'half'), 85), (('recipe', 'easy', 'make'), 83), (('another', 'great', 'recipe'), 82), (('quick', 'easy', 'make'), 73), (('would', 'change', 'thing'), 73), (('whole', 'family', 'loved'), 59), (('dinner', 'last', 'night'), 55), (('delicious', 'easy', 'make'), 52), (('recipe', 'quick', 'easy'), 46), (('really', 'enjoyed', 'recipe'), 45), (('great', 'recipe', 'made'), 44), (('great', 'side', 'dish'), 41), (('easy', 'make', 'tasty'), 39), (('really', 'easy', 'make'), 39), (('easy', 'quick', 'make'), 39), (('family', 'really', 'enjoyed'), 39), (('made', 'recipe', 'exactly'), 39), (('chocolate', 'chip', 'cookies'), 38), (('easy', 'make', 'great'), 38), (('followed', 'directions', 'exactly'), 38), (('crushed', 'red', 'pepper'), 37), (('whole', 'wheat', 'flour'), 36), (('easy', 'make', 'good'), 36), 

In [None]:
# Adjectives and adverbs only: unigram frequencies.
# Each period's token list is passed through filter_adjectives_adverbs_only
# (defined elsewhere in this notebook) before counting.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = [
    filter_adjectives_adverbs_only(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent filtered tokens of each period, one list per period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 5352), ('good', 5331), ('easy', 4615), ('recipe', 3483), ('really', 3239), ('delicious', 2490), ('little', 2473), ('wonderful', 2029), ('much', 2009), ('instead', 1793), ('well', 1706), ('nice', 1706), ('dish', 1456), ('fresh', 1327), ('even', 1298), ('garlic', 1264), ('tasty', 1224), ('quick', 1091), ('cheese', 1015), ('green', 958), ('excellent', 954), ('definitely', 928), ('still', 925), ('first', 874), ('better', 848), ('hot', 812), ('together', 805), ('enough', 783), ('best', 781), ('last', 745), ('simple', 740), ('sweet', 738), ('sure', 728), ('different', 711), ('red', 693), ('never', 684), ('right', 670), ('whole', 660), ('perfect', 653), ('quite', 637), ('exactly', 610), ('top', 608), ('ever', 604), ('oven', 594), ('always', 554), ('extra', 553), ('big', 533), ('salad', 527), ('absolutely', 515), ('pretty', 514)]
[('great', 35296), ('good', 32219), ('easy', 26641), ('recipe', 23679), ('really', 21146), ('little', 16268), ('delicious', 16193), ('much', 14489), ('inst

In [None]:
# Adjectives and adverbs only: bigram frequencies.
# The pairs are built from the already-filtered token streams.
# NOTE(review): if the filter drops tokens, the two words of a pair were not
# necessarily adjacent in the original review text - confirm this is intended.
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = [
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
]

# Print the 50 most frequent pairs of each period; counting uses up each iterator.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 607), (('really', 'good'), 572), (('quick', 'easy'), 477), (('good', 'easy'), 386), (('best', 'ever'), 262), (('great', 'easy'), 246), (('easy', 'good'), 236), (('easy', 'delicious'), 235), (('easy', 'tasty'), 220), (('easy', 'great'), 212), (('delicious', 'easy'), 197), (('recipe', 'great'), 191), (('good', 'really'), 183), (('easy', 'quick'), 181), (('even', 'better'), 178), (('good', 'great'), 170), (('good', 'good'), 163), (('absolutely', 'delicious'), 157), (('pretty', 'good'), 156), (('recipe', 'exactly'), 156), (('recipe', 'easy'), 154), (('easy', 'together'), 151), (('really', 'easy'), 148), (('great', 'great'), 145), (('good', 'little'), 139), (('really', 'great'), 138), (('great', 'good'), 137), (('easy', 'recipe'), 135), (('much', 'better'), 132), (('good', 'recipe'), 124), (('great', 'really'), 121), (('really', 'enjoyed'), 120), (('wonderful', 'easy'), 118), (('recipe', 'good'), 118), (('easy', 'really'), 115), (('still', 'great'), 113), (('tasty', '

### Check minutes 40 to 90 ###

In [None]:
# Keep recipes whose "minutes" value lies in the closed range [40, 90]
# (Series.between includes both endpoints by default).
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["minutes"].between(40, 90)]

# One concatenated review text per date range, restricted to the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# Tokenize each period's text; the unpacking requires exactly five periods.
tokens1, tokens2, tokens3, tokens4, tokens5 = [
    tokenizer(period_text) for period_text in review_divides
]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: print the 50 most frequent tokens of each review period,
# one list per period, in period order.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 12813), ('make', 7033), ('made', 6810), ('good', 6438), ('great', 6155), ('easy', 5095), ('time', 4228), ('one', 3670), ('really', 3625), ('like', 3487), ('would', 3266), ('delicious', 3148), ('sauce', 2983), ('dish', 2894), ('wonderful', 2814), ('little', 2751), ('loved', 2733), ('chicken', 2682), ('flavor', 2471), ('cake', 2333), ('much', 2252), ('taste', 2170), ('cheese', 2117), ('instead', 2116), ('family', 1994), ('nice', 1924), ('think', 1891), ('well', 1868), ('try', 1831), ('served', 1815), ('cream', 1757), ('bit', 1679), ('making', 1647), ('tasty', 1633), ('could', 1598), ('even', 1592), ('garlic', 1572), ('minutes', 1525), ('love', 1506), ('soup', 1442), ('sharing', 1416), ('excellent', 1415), ('husband', 1412), ('bread', 1407), ('put', 1400), ('potatoes', 1357), ('half', 1348), ('fresh', 1328), ('dinner', 1316), ('way', 1286)]
[('recipe', 77183), ('made', 45623), ('make', 40880), ('great', 36352), ('good', 35654), ('easy', 26818), ('time', 26805), ('really', 2292

In [None]:
# Bigrams: pair up consecutive tokens within each review period.
# nltk.bigrams returns an iterator (per the NLTK docs), so each bigram_tokensN
# name is used up by the counting step below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = [
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent bigrams of each period, one list per period.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('easy', 'make'), 1638), (('great', 'recipe'), 1330), (('really', 'good'), 555), (('sour', 'cream'), 502), (('followed', 'recipe'), 482), (('recipe', 'made'), 425), (('last', 'night'), 416), (('really', 'enjoyed'), 407), (('good', 'recipe'), 378), (('quick', 'easy'), 369), (('easy', 'prepare'), 358), (('recipe', 'easy'), 355), (('wonderful', 'recipe'), 349), (('recipe', 'exactly'), 323), (('family', 'loved'), 319), (('made', 'recipe'), 319), (('sharing', 'recipe'), 316), (('side', 'dish'), 315), (('definitely', 'make'), 308), (('first', 'time'), 295), (('good', 'easy'), 280), (('1/2', 'cup'), 275), (('cream', 'cheese'), 269), (('time', 'make'), 267), (('put', 'together'), 260), (('really', 'liked'), 260), (('comfort', 'food'), 256), (('recipe', 'good'), 249), (('turned', 'great'), 248), (('recipe', 'great'), 248), (('easy', 'recipe'), 247), (('ice', 'cream'), 242), (('brown', 'sugar'), 240), (('everyone', 'loved'), 239), (('excellent', 'recipe'), 218), (('olive', 'oil'), 217), (('chi

In [None]:
# Trigrams: group every run of three consecutive tokens within each review period.
# nltk.trigrams returns an iterator (per the NLTK docs), so each trigram_tokensN
# name is used up by the counting step below and cannot be counted a second time.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = [
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent trigrams of each period, one list per period.
# most_common(50) yields the same entries as most_common()[:50] without fully
# sorting every distinct trigram first.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 248), (('easy', 'put', 'together'), 152), (('made', 'last', 'night'), 140), (('recipe', 'easy', 'make'), 126), (('cut', 'recipe', 'half'), 117), (('good', 'easy', 'make'), 110), (('vanilla', 'ice', 'cream'), 105), (('would', 'change', 'thing'), 91), (('another', 'great', 'recipe'), 81), (('whole', 'family', 'loved'), 77), (('great', 'recipe', 'easy'), 63), (('red', 'pepper', 'flakes'), 63), (('dinner', 'last', 'night'), 62), (('sharing', 'great', 'recipe'), 58), (('family', 'really', 'enjoyed'), 57), (('quick', 'easy', 'make'), 57), (('delicious', 'easy', 'make'), 56), (('followed', 'directions', 'exactly'), 53), (('great', 'recipe', 'made'), 53), (('yum', 'yum', 'yum'), 52), (('first', 'time', 'made'), 51), (('got', 'rave', 'reviews'), 49), (('easy', 'make', 'good'), 48), (('cream', 'chicken', 'soup'), 48), (('cream', 'mushroom', 'soup'), 45), (('made', 'several', 'times'), 45), (('really', 'easy', 'make'), 43), (('great', 'easy', 'make'), 42), (('

In [None]:
# Adjectives and adverbs only: unigram frequencies.
# Each period's token list is passed through filter_adjectives_adverbs_only
# (defined elsewhere in this notebook) before counting.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = [
    filter_adjectives_adverbs_only(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent filtered tokens of each period, one list per period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('good', 6252), ('great', 6155), ('easy', 4908), ('recipe', 4324), ('really', 3625), ('delicious', 3112), ('little', 2700), ('wonderful', 2529), ('much', 2250), ('instead', 2116), ('nice', 1857), ('dish', 1851), ('well', 1841), ('even', 1592), ('cheese', 1371), ('garlic', 1319), ('fresh', 1288), ('tasty', 1243), ('excellent', 1149), ('still', 1130), ('definitely', 1128), ('first', 1119), ('best', 1013), ('oven', 970), ('sure', 966), ('better', 939), ('together', 931), ('top', 927), ('enough', 906), ('sweet', 891), ('last', 889), ('green', 869), ('right', 813), ('whole', 803), ('hot', 801), ('perfect', 797), ('never', 794), ('exactly', 789), ('quick', 788), ('different', 778), ('ever', 753), ('simple', 700), ('quite', 698), ('extra', 658), ('absolutely', 636), ('big', 630), ('red', 627), ('chicken', 626), ('always', 587), ('rich', 575)]
[('great', 36352), ('good', 34435), ('recipe', 26153), ('easy', 25723), ('really', 22926), ('delicious', 18187), ('little', 16553), ('much', 15788), ('

In [None]:
# Adjectives and adverbs only: bigram frequencies.
# The pairs are built from the already-filtered token streams.
# NOTE(review): if the filter drops tokens, the two words of a pair were not
# necessarily adjacent in the original review text - confirm this is intended.
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = [
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
]

# Print the 50 most frequent pairs of each period; counting uses up each iterator.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 845), (('really', 'good'), 593), (('good', 'easy'), 395), (('best', 'ever'), 331), (('quick', 'easy'), 319), (('great', 'easy'), 285), (('easy', 'good'), 276), (('easy', 'great'), 244), (('easy', 'delicious'), 243), (('recipe', 'great'), 228), (('recipe', 'exactly'), 225), (('good', 'great'), 217), (('recipe', 'easy'), 215), (('delicious', 'easy'), 208), (('even', 'better'), 207), (('easy', 'together'), 203), (('good', 'good'), 196), (('easy', 'tasty'), 194), (('absolutely', 'delicious'), 191), (('good', 'really'), 180), (('pretty', 'good'), 173), (('good', 'little'), 165), (('great', 'great'), 163), (('really', 'great'), 154), (('great', 'really'), 154), (('easy', 'recipe'), 153), (('really', 'easy'), 151), (('recipe', 'good'), 150), (('wonderful', 'easy'), 147), (('good', 'recipe'), 146), (('great', 'good'), 139), (('really', 'enjoyed'), 128), (('great', 'dish'), 120), (('much', 'better'), 120), (('still', 'great'), 118), (('good', 'nice'), 114), (('easy', 'qui

### Check minutes 120 to 180 ###

In [None]:
# Keep recipes whose "minutes" value lies in the closed range [120, 180]
# (Series.between includes both endpoints by default).
# NOTE(review): recipes at exactly 180 minutes are also selected by the
# 180-300 section further down - confirm the overlap is intended.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["minutes"].between(120, 180)]

# One concatenated review text per date range, restricted to the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# Tokenize each period's text; the unpacking requires exactly five periods.
tokens1, tokens2, tokens3, tokens4, tokens5 = [
    tokenizer(period_text) for period_text in review_divides
]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: print the 50 most frequent tokens of each review period,
# one list per period, in period order.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 1377), ('made', 762), ('make', 708), ('great', 657), ('good', 635), ('time', 482), ('easy', 408), ('sauce', 395), ('one', 373), ('like', 366), ('would', 348), ('wonderful', 339), ('really', 338), ('chicken', 326), ('flavor', 289), ('delicious', 284), ('much', 256), ('little', 248), ('bread', 244), ('taste', 238), ('loved', 235), ('dish', 223), ('soup', 210), ('served', 207), ('family', 207), ('try', 204), ('instead', 198), ('well', 195), ('nice', 189), ('garlic', 180), ('bit', 174), ('even', 168), ('meat', 168), ('cheese', 165), ('tender', 163), ('sharing', 163), ('first', 162), ('love', 161), ('making', 161), ('think', 160), ('could', 159), ('tasty', 158), ('best', 155), ('excellent', 155), ('put', 153), ('fresh', 153), ('way', 152), ('cream', 148), ('cooked', 147), ('dinner', 146)]
[('recipe', 9704), ('made', 5668), ('make', 4704), ('great', 4441), ('good', 3936), ('time', 3385), ('easy', 2660), ('one', 2627), ('really', 2489), ('like', 2474), ('would', 2288), ('sauce', 2

In [None]:
# Bigrams: pair up consecutive tokens within each review period.
# nltk.bigrams returns an iterator (per the NLTK docs), so each bigram_tokensN
# name is used up by the counting step below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = [
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent bigrams of each period, one list per period.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 144), (('easy', 'make'), 135), (('followed', 'recipe'), 60), (('mashed', 'potatoes'), 53), (('really', 'good'), 52), (('recipe', 'made'), 46), (('sharing', 'recipe'), 40), (('wonderful', 'recipe'), 39), (('first', 'time'), 39), (('recipe', 'exactly'), 39), (('good', 'recipe'), 37), (('last', 'night'), 37), (('time', 'make'), 37), (('pork', 'chops'), 33), (('made', 'recipe'), 33), (('green', 'beans'), 32), (('really', 'enjoyed'), 32), (('definitely', 'make'), 31), (('everyone', 'loved'), 30), (('sour', 'cream'), 30), (('cooking', 'time'), 29), (('1/2', 'cup'), 28), (('ever', 'made'), 28), (('recipe', 'easy'), 28), (('recipe', 'great'), 28), (('make', 'sure'), 27), (('excellent', 'recipe'), 27), (('even', 'better'), 27), (('family', 'loved'), 27), (('much', 'sharing'), 26), (('bread', 'machine'), 26), (('followed', 'directions'), 26), (('good', 'made'), 25), (('cream', 'cheese'), 25), (('comfort', 'food'), 25), (('olive', 'oil'), 24), (('would', 'make'), 24), (('ea

In [None]:
# Trigrams: group every run of three consecutive tokens within each review period.
# nltk.trigrams returns an iterator (per the NLTK docs), so each trigram_tokensN
# name is used up by the counting step below and cannot be counted a second time.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = [
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent trigrams of each period, one list per period.
# most_common(50) yields the same entries as most_common()[:50] without fully
# sorting every distinct trigram first.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 27), (('made', 'last', 'night'), 15), (('followed', 'directions', 'exactly'), 14), (('served', 'mashed', 'potatoes'), 12), (('would', 'change', 'thing'), 10), (('red', 'pepper', 'flakes'), 9), (('whole', 'family', 'loved'), 8), (('great', 'recipe', 'easy'), 8), (('recipe', 'easy', 'make'), 8), (('cut', 'recipe', 'half'), 8), (('sharing', 'wonderful', 'recipe'), 8), (('boneless', 'skinless', 'chicken'), 8), (('first', 'time', 'ever'), 7), (('followed', 'recipe', 'letter'), 7), (('another', 'great', 'recipe'), 7), (('first', 'time', 'made'), 7), (('second', 'time', 'made'), 6), (('easy', 'make', 'good'), 6), (('blue', 'cheese', 'dressing'), 6), (('doo', 'doo', 'doo'), 6), (('split', 'pea', 'soup'), 6), (('got', 'rave', 'reviews'), 6), (('cooked', 'low', 'hours'), 6), (('onion', 'soup', 'mix'), 6), (('recipe', 'exactly', 'written'), 6), (('directions', 'easy', 'follow'), 5), (('loved', 'easy', 'make'), 5), (('easy', 'make', 'tasty'), 5), (('still', 'ta

In [None]:
# Adjectives and adverbs only: unigram frequencies.
# Each period's token list is passed through filter_adjectives_adverbs_only
# (defined elsewhere in this notebook) before counting.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = [
    filter_adjectives_adverbs_only(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent filtered tokens of each period, one list per period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 657), ('good', 621), ('recipe', 459), ('easy', 393), ('really', 338), ('wonderful', 311), ('delicious', 281), ('much', 256), ('little', 239), ('instead', 198), ('well', 190), ('nice', 181), ('even', 168), ('first', 161), ('garlic', 157), ('best', 154), ('fresh', 152), ('dish', 146), ('excellent', 131), ('ever', 116), ('tasty', 109), ('right', 109), ('oven', 109), ('last', 108), ('cheese', 108), ('definitely', 102), ('never', 101), ('sure', 100), ('exactly', 98), ('green', 97), ('perfect', 97), ('still', 97), ('better', 95), ('whole', 92), ('enough', 92), ('hot', 85), ('fantastic', 83), ('together', 83), ('top', 80), ('different', 79), ('sweet', 78), ('quite', 78), ('red', 75), ('chicken', 72), ('bread', 71), ('maybe', 69), ('many', 68), ('extra', 68), ('absolutely', 66), ('always', 61)]
[('great', 4441), ('good', 3816), ('recipe', 3252), ('easy', 2552), ('really', 2489), ('much', 1996), ('delicious', 1984), ('wonderful', 1814), ('little', 1809), ('instead', 1406), ('well', 1

In [None]:
# Adjectives and adverbs only: bigram frequencies.
# The pairs are built from the already-filtered token streams.
# NOTE(review): if the filter drops tokens, the two words of a pair were not
# necessarily adjacent in the original review text - confirm this is intended.
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = [
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
]

# Print the 50 most frequent pairs of each period; counting uses up each iterator.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 82), (('best', 'ever'), 56), (('really', 'good'), 56), (('even', 'better'), 29), (('good', 'good'), 27), (('good', 'easy'), 26), (('recipe', 'exactly'), 25), (('really', 'great'), 24), (('great', 'easy'), 23), (('great', 'great'), 21), (('recipe', 'great'), 20), (('good', 'really'), 19), (('good', 'great'), 19), (('good', 'recipe'), 18), (('easy', 'good'), 18), (('recipe', 'good'), 18), (('absolutely', 'delicious'), 17), (('easy', 'great'), 17), (('really', 'easy'), 17), (('easy', 'tasty'), 17), (('easy', 'delicious'), 17), (('much', 'better'), 15), (('great', 'good'), 15), (('great', 'really'), 14), (('pretty', 'good'), 14), (('recipe', 'easy'), 14), (('still', 'good'), 14), (('wonderful', 'recipe'), 14), (('nice', 'great'), 13), (('good', 'nice'), 13), (('tasty', 'easy'), 12), (('first', 'ever'), 12), (('easy', 'together'), 12), (('much', 'wonderful'), 12), (('well', 'worth'), 12), (('much', 'great'), 12), (('recipe', 'really'), 11), (('great', 'first'), 11), (

### Check minutes 180 to 300 ###

In [None]:
# Keep recipes whose "minutes" value lies in the closed range [180, 300]
# (Series.between includes both endpoints by default).
# NOTE(review): recipes at exactly 180 minutes are also selected by the
# 120-180 section, and those at exactly 300 by the "300 and above" section -
# confirm the overlap is intended.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["minutes"].between(180, 300)]

# One concatenated review text per date range, restricted to the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# Tokenize each period's text; the unpacking requires exactly five periods.
tokens1, tokens2, tokens3, tokens4, tokens5 = [
    tokenizer(period_text) for period_text in review_divides
]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: print the 50 most frequent tokens of each review period,
# one list per period, in period order.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 1371), ('made', 760), ('great', 728), ('make', 688), ('good', 658), ('bread', 520), ('time', 448), ('easy', 439), ('one', 385), ('wonderful', 373), ('like', 346), ('really', 341), ('would', 322), ('flavor', 305), ('delicious', 287), ('sauce', 268), ('loved', 260), ('much', 257), ('little', 255), ('taste', 254), ('chicken', 235), ('try', 221), ('well', 208), ('family', 200), ('put', 198), ('making', 197), ('served', 196), ('instead', 189), ('cooked', 187), ('think', 184), ('could', 182), ('soup', 176), ('nice', 172), ('bit', 170), ('dish', 166), ('roast', 166), ('sharing', 166), ('excellent', 163), ('cheese', 162), ('meat', 161), ('even', 161), ('love', 159), ('perfect', 158), ('hours', 157), ('water', 153), ('turned', 150), ('way', 147), ('crockpot', 139), ('husband', 138), ('oven', 136)]
[('recipe', 8527), ('made', 5068), ('make', 4122), ('great', 3997), ('good', 3434), ('bread', 3226), ('time', 3042), ('easy', 2392), ('one', 2294), ('like', 2161), ('sauce', 2138), ('reall

In [None]:
# Bigrams: pair up consecutive tokens within each review period.
# nltk.bigrams returns an iterator (per the NLTK docs), so each bigram_tokensN
# name is used up by the counting step below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = [
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent bigrams of each period, one list per period.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 160), (('easy', 'make'), 139), (('bread', 'machine'), 67), (('good', 'recipe'), 53), (('recipe', 'made'), 49), (('really', 'good'), 49), (('last', 'night'), 44), (('wonderful', 'recipe'), 38), (('crock', 'pot'), 36), (('followed', 'recipe'), 36), (('cream', 'cheese'), 36), (('definitely', 'make'), 35), (('ice', 'cream'), 34), (('turned', 'great'), 34), (('made', 'recipe'), 33), (('great', 'flavor'), 33), (('1/2', 'cup'), 33), (('put', 'together'), 33), (('easy', 'prepare'), 32), (('sharing', 'recipe'), 32), (('first', 'time'), 31), (('family', 'loved'), 29), (('mashed', 'potatoes'), 28), (('even', 'better'), 28), (('onion', 'soup'), 27), (('recipe', 'great'), 27), (('brown', 'sugar'), 27), (('recipe', 'easy'), 26), (('recipe', 'exactly'), 26), (('would', 'make'), 26), (('make', 'sure'), 26), (('really', 'enjoyed'), 26), (('excellent', 'recipe'), 24), (('absolutely', 'delicious'), 24), (('change', 'made'), 24), (('ever', 'made'), 24), (('everyone', 'loved'), 24), 

In [None]:
# Trigrams: group every run of three consecutive tokens within each review period.
# nltk.trigrams returns an iterator (per the NLTK docs), so each trigram_tokensN
# name is used up by the counting step below and cannot be counted a second time.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = [
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent trigrams of each period, one list per period.
# most_common(50) yields the same entries as most_common()[:50] without fully
# sorting every distinct trigram first.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('made', 'last', 'night'), 18), (('easy', 'put', 'together'), 15), (('followed', 'recipe', 'exactly'), 15), (('onion', 'soup', 'mix'), 11), (('vanilla', 'ice', 'cream'), 10), (('good', 'easy', 'make'), 10), (('french', 'onion', 'soup'), 9), (('recipe', 'easy', 'make'), 9), (('another', 'great', 'recipe'), 9), (('served', 'mashed', 'potatoes'), 8), (('easy', 'make', 'great'), 8), (('cut', 'recipe', 'half'), 7), (('would', 'change', 'thing'), 7), (('editor', 'note', 'recipe'), 7), (('bread', 'machine', 'recipe'), 7), (('sharing', 'great', 'recipe'), 7), (('really', 'good', 'recipe'), 6), (('great', 'recipe', 'made'), 6), (('recipe', 'great', 'recipe'), 6), (('whole', 'family', 'loved'), 6), (('made', 'house', 'smell'), 6), (('fat', 'free', 'cream'), 6), (('whole', 'wheat', 'flour'), 6), (('followed', 'directions', 'exactly'), 6), (('worked', 'like', 'charm'), 6), (('first', 'time', 'ever'), 6), (('great', 'recipe', 'love'), 6), (('gravy', 'mashed', 'potatoes'), 5), (('dinner', 'last', 

In [None]:
# Adjectives and adverbs only: unigram frequencies.
# Each period's token list is passed through filter_adjectives_adverbs_only
# (defined elsewhere in this notebook) before counting.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = [
    filter_adjectives_adverbs_only(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent filtered tokens of each period, one list per period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 728), ('good', 639), ('recipe', 436), ('easy', 419), ('wonderful', 347), ('really', 341), ('delicious', 285), ('much', 257), ('little', 251), ('well', 207), ('instead', 189), ('nice', 168), ('even', 161), ('first', 136), ('bread', 136), ('excellent', 133), ('oven', 126), ('still', 123), ('best', 118), ('dish', 118), ('fresh', 118), ('sure', 115), ('definitely', 113), ('garlic', 107), ('perfect', 106), ('right', 104), ('never', 103), ('better', 102), ('cheese', 102), ('whole', 102), ('sweet', 102), ('ever', 100), ('tasty', 99), ('last', 97), ('extra', 92), ('enough', 92), ('absolutely', 88), ('exactly', 87), ('hot', 80), ('fantastic', 77), ('together', 75), ('green', 74), ('top', 71), ('white', 70), ('low', 70), ('gravy', 69), ('big', 68), ('quite', 67), ('different', 67), ('new', 65)]
[('great', 3997), ('good', 3331), ('recipe', 2836), ('easy', 2296), ('really', 2127), ('much', 1756), ('delicious', 1718), ('wonderful', 1639), ('little', 1562), ('instead', 1162), ('well', 114

In [None]:
# Adjectives and adverbs only: bigram frequencies.
# The pairs are built from the already-filtered token streams.
# NOTE(review): if the filter drops tokens, the two words of a pair were not
# necessarily adjacent in the original review text - confirm this is intended.
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = [
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
]

# Print the 50 most frequent pairs of each period; counting uses up each iterator.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 87), (('really', 'good'), 54), (('best', 'ever'), 42), (('great', 'easy'), 29), (('even', 'better'), 29), (('great', 'great'), 27), (('wonderful', 'easy'), 26), (('good', 'good'), 26), (('good', 'great'), 25), (('good', 'easy'), 24), (('absolutely', 'delicious'), 24), (('easy', 'great'), 24), (('good', 'recipe'), 23), (('recipe', 'great'), 23), (('easy', 'good'), 23), (('pretty', 'good'), 23), (('easy', 'delicious'), 21), (('wonderful', 'great'), 21), (('great', 'really'), 20), (('absolutely', 'wonderful'), 19), (('really', 'great'), 19), (('good', 'little'), 18), (('recipe', 'exactly'), 18), (('easy', 'together'), 18), (('delicious', 'easy'), 17), (('much', 'great'), 17), (('recipe', 'good'), 17), (('good', 'really'), 15), (('great', 'good'), 15), (('really', 'well'), 15), (('much', 'better'), 15), (('still', 'great'), 15), (('recipe', 'easy'), 14), (('quick', 'easy'), 14), (('delicious', 'great'), 14), (('delicious', 'really'), 13), (('easy', 'tasty'), 12), (('

### Check minutes 300 and above ###

In [None]:
# Keep recipes whose "minutes" value is at least 300 (no upper bound).
# NOTE(review): recipes at exactly 300 minutes are also selected by the
# 180-300 section above - confirm the overlap is intended.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["minutes"].ge(300)]

# One concatenated review text per date range, restricted to the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# Tokenize each period's text; the unpacking requires exactly five periods.
tokens1, tokens2, tokens3, tokens4, tokens5 = [
    tokenizer(period_text) for period_text in review_divides
]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigrams: print the 50 most frequent tokens of each review period,
# one list per period, in period order.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 2516), ('great', 1285), ('good', 1241), ('made', 1221), ('make', 1151), ('chicken', 978), ('easy', 941), ('time', 939), ('sauce', 861), ('one', 719), ('really', 646), ('would', 610), ('crockpot', 599), ('wonderful', 595), ('like', 579), ('hours', 554), ('flavor', 540), ('roast', 538), ('loved', 537), ('tender', 531), ('served', 514), ('delicious', 512), ('cooked', 502), ('meat', 491), ('little', 480), ('taste', 446), ('much', 427), ('family', 406), ('cooking', 399), ('think', 396), ('put', 395), ('dish', 381), ('try', 380), ('could', 355), ('well', 348), ('potatoes', 347), ('soup', 343), ('instead', 338), ('rice', 336), ('pork', 332), ('bit', 329), ('beef', 328), ('gravy', 328), ('husband', 316), ('tasty', 309), ('turned', 306), ('cut', 302), ('even', 300), ('meal', 294), ('day', 288)]
[('recipe', 11047), ('made', 6034), ('great', 5451), ('good', 5259), ('make', 5196), ('time', 4206), ('easy', 4150), ('chicken', 3670), ('sauce', 3498), ('really', 3011), ('one', 2930), ('lik

In [None]:
# Bigrams: pair up consecutive tokens within each review period.
# nltk.bigrams returns an iterator (per the NLTK docs), so each bigram_tokensN
# name is used up by the counting step below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = [
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
]

# Print the 50 most frequent bigrams of each period, one list per period.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 274), (('easy', 'make'), 226), (('crock', 'pot'), 119), (('really', 'good'), 118), (('mashed', 'potatoes'), 115), (('followed', 'recipe'), 99), (('chicken', 'breasts'), 90), (('brown', 'sugar'), 83), (('family', 'loved'), 81), (('pork', 'chops'), 80), (('made', 'recipe'), 78), (('good', 'recipe'), 75), (('served', 'rice'), 75), (('recipe', 'made'), 73), (('definitely', 'make'), 72), (('easy', 'prepare'), 71), (('last', 'night'), 70), (('cooking', 'time'), 70), (('meat', 'tender'), 67), (('recipe', 'exactly'), 67), (('turned', 'great'), 65), (('time', 'make'), 65), (('recipe', 'easy'), 63), (('really', 'enjoyed'), 63), (('first', 'time'), 62), (('cup', 'water'), 58), (('onion', 'soup'), 57), (('put', 'together'), 57), (('hours', 'low'), 56), (('sour', 'cream'), 56), (('great', 'flavor'), 55), (('good', 'easy'), 54), (('1/2', 'cup'), 54), (('ranch', 'dressing'), 54), (('green', 'beans'), 52), (('husband', 'loved'), 51), (('mushroom', 'soup'), 51), (('cooked', 'low'

In [None]:
# Trigram frequencies: the 50 most common runs of three consecutive tokens
# in each time period.
# Each trigram_tokensN is a one-shot iterator that is used up by its Counter below.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 43), (('served', 'mashed', 'potatoes'), 39), (('onion', 'soup', 'mix'), 36), (('easy', 'put', 'together'), 32), (('cream', 'mushroom', 'soup'), 22), (('whole', 'family', 'loved'), 21), (('cooked', 'low', 'hours'), 21), (('recipe', 'easy', 'make'), 21), (('ranch', 'dressing', 'mix'), 19), (('boneless', 'skinless', 'chicken'), 18), (('made', 'last', 'night'), 17), (('would', 'change', 'thing'), 17), (('cooked', 'high', 'hours'), 16), (('skinless', 'chicken', 'breasts'), 16), (('brown', 'gravy', 'mix'), 16), (('good', 'easy', 'make'), 15), (('cream', 'chicken', 'soup'), 15), (('followed', 'directions', 'exactly'), 15), (('made', 'recipe', 'exactly'), 15), (('french', 'onion', 'soup'), 14), (('red', 'pepper', 'flakes'), 13), (('1/2', 'cup', 'water'), 13), (('great', 'recipe', 'made'), 13), (('pork', 'chops', 'tender'), 12), (('house', 'smelled', 'wonderful'), 12), (('sharing', 'great', 'recipe'), 12), (('another', 'great', 'recipe'), 12), (('served', 'e

In [None]:
# Adjectives and adverbs only: filter each period's tokens before counting.
(
    tokens_adj_adv_1,
    tokens_adj_adv_2,
    tokens_adj_adv_3,
    tokens_adj_adv_4,
    tokens_adj_adv_5,
) = map(filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5))

# Unigrams: the 50 most frequent remaining tokens in each time period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 1285), ('good', 1201), ('easy', 896), ('recipe', 828), ('really', 646), ('wonderful', 559), ('delicious', 509), ('little', 471), ('much', 427), ('well', 347), ('instead', 338), ('even', 300), ('first', 269), ('low', 262), ('nice', 260), ('dish', 249), ('garlic', 242), ('best', 241), ('definitely', 226), ('excellent', 223), ('tasty', 215), ('still', 210), ('last', 207), ('gravy', 206), ('chicken', 200), ('ever', 194), ('whole', 189), ('never', 179), ('enough', 177), ('sure', 171), ('exactly', 171), ('better', 170), ('hot', 170), ('fresh', 170), ('oven', 169), ('green', 167), ('high', 163), ('together', 149), ('dry', 149), ('sweet', 147), ('long', 143), ('perfect', 140), ('red', 138), ('quite', 137), ('right', 131), ('absolutely', 130), ('big', 121), ('extra', 120), ('fantastic', 119), ('top', 119)]
[('great', 5451), ('good', 5089), ('easy', 3976), ('recipe', 3664), ('really', 3011), ('delicious', 2457), ('much', 2243), ('little', 2150), ('wonderful', 2117), ('instead', 1738),

In [None]:
# Adjectives and adverbs only: bigram frequencies per time period.
# The pairs are built over the already-filtered token lists, so two words count
# as adjacent when they follow each other in the filtered sequence.
(
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

for period_bigrams in (
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 153), (('really', 'good'), 128), (('best', 'ever'), 99), (('good', 'easy'), 80), (('easy', 'good'), 58), (('great', 'easy'), 56), (('good', 'great'), 53), (('easy', 'great'), 53), (('even', 'better'), 50), (('good', 'good'), 49), (('recipe', 'great'), 46), (('delicious', 'easy'), 44), (('recipe', 'easy'), 42), (('recipe', 'exactly'), 40), (('easy', 'together'), 39), (('really', 'great'), 38), (('easy', 'delicious'), 38), (('absolutely', 'delicious'), 37), (('pretty', 'good'), 37), (('good', 'really'), 36), (('great', 'great'), 35), (('great', 'good'), 33), (('good', 'little'), 33), (('high', 'low'), 33), (('recipe', 'good'), 32), (('great', 'really'), 30), (('wonderful', 'great'), 29), (('quick', 'easy'), 29), (('easy', 'tasty'), 28), (('easy', 'wonderful'), 28), (('wonderful', 'easy'), 27), (('easy', 'really'), 26), (('excellent', 'easy'), 26), (('easy', 'recipe'), 25), (('still', 'great'), 25), (('well', 'worth'), 24), (('definitely', 'keeper'), 24), (('really'

### Check number of steps under 8 ###

In [None]:
# Recipes with fewer than 8 steps. The comparison is strict so that recipes with
# exactly 8 steps are counted only once, in the "8 till 18" bucket, instead of
# appearing in both buckets.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_steps"] < 8]

# One concatenated review string per date range for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")  # progress marker: the merge/split finished, tokenizing starts next

# NOTE(review): this unpacking assumes get_review_divides returns exactly five
# period strings -- confirm against the length of date_ranges_start_points.
tokens1, tokens2, tokens3, tokens4, tokens5 = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigram frequencies: the 50 most common tokens in each time period.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 17567), ('good', 9826), ('make', 9769), ('great', 9477), ('made', 9397), ('easy', 8031), ('time', 5734), ('really', 5377), ('like', 5167), ('one', 5143), ('would', 4477), ('delicious', 4239), ('little', 4147), ('loved', 4055), ('sauce', 4041), ('wonderful', 4037), ('flavor', 3831), ('chicken', 3705), ('taste', 3602), ('dish', 3455), ('much', 3371), ('served', 3054), ('nice', 2959), ('instead', 2825), ('try', 2784), ('well', 2771), ('think', 2762), ('cheese', 2657), ('garlic', 2604), ('tasty', 2566), ('family', 2533), ('love', 2515), ('bit', 2462), ('could', 2327), ('fresh', 2306), ('salad', 2273), ('making', 2257), ('even', 2242), ('cream', 2186), ('sharing', 2157), ('put', 2151), ('way', 2082), ('excellent', 2045), ('bread', 1931), ('husband', 1878), ('dinner', 1851), ('thought', 1845), ('enjoyed', 1818), ('rice', 1802), ('sugar', 1795)]
[('recipe', 96749), ('made', 57962), ('great', 52023), ('make', 51748), ('good', 49631), ('easy', 39971), ('time', 32248), ('really', 306

In [None]:
# Bigram frequencies: the 50 most common adjacent token pairs in each time period.
# Each bigram_tokensN is a one-shot iterator that is used up by its Counter below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('easy', 'make'), 2311), (('great', 'recipe'), 1837), (('really', 'good'), 878), (('quick', 'easy'), 748), (('sour', 'cream'), 638), (('last', 'night'), 608), (('followed', 'recipe'), 589), (('recipe', 'made'), 585), (('really', 'enjoyed'), 569), (('easy', 'prepare'), 566), (('side', 'dish'), 544), (('good', 'recipe'), 527), (('recipe', 'easy'), 445), (('family', 'loved'), 438), (('definitely', 'make'), 419), (('made', 'recipe'), 416), (('wonderful', 'recipe'), 408), (('sharing', 'recipe'), 407), (('good', 'easy'), 405), (('olive', 'oil'), 404), (('1/2', 'cup'), 399), (('cream', 'cheese'), 399), (('recipe', 'exactly'), 398), (('first', 'time'), 390), (('easy', 'recipe'), 385), (('really', 'liked'), 384), (('put', 'together'), 382), (('recipe', 'great'), 376), (('everyone', 'loved'), 366), (('time', 'make'), 355), (('mashed', 'potatoes'), 337), (('brown', 'sugar'), 336), (('recipe', 'good'), 333), (('ice', 'cream'), 325), (('chicken', 'breasts'), 322), (('even', 'better'), 313), (('le

In [None]:
# Trigram frequencies: the 50 most common runs of three consecutive tokens
# in each time period.
# Each trigram_tokensN is a one-shot iterator that is used up by its Counter below.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 291), (('easy', 'put', 'together'), 216), (('made', 'last', 'night'), 180), (('recipe', 'easy', 'make'), 154), (('another', 'great', 'recipe'), 140), (('good', 'easy', 'make'), 139), (('cut', 'recipe', 'half'), 131), (('red', 'pepper', 'flakes'), 127), (('would', 'change', 'thing'), 112), (('quick', 'easy', 'make'), 106), (('whole', 'family', 'loved'), 106), (('vanilla', 'ice', 'cream'), 93), (('dinner', 'last', 'night'), 93), (('delicious', 'easy', 'make'), 79), (('great', 'recipe', 'easy'), 78), (('sharing', 'great', 'recipe'), 78), (('onion', 'soup', 'mix'), 77), (('easy', 'make', 'great'), 74), (('first', 'time', 'made'), 74), (('followed', 'directions', 'exactly'), 73), (('great', 'recipe', 'made'), 71), (('cream', 'mushroom', 'soup'), 69), (('recipe', 'quick', 'easy'), 68), (('easy', 'make', 'good'), 63), (('red', 'wine', 'vinegar'), 63), (('great', 'side', 'dish'), 62), (('easy', 'make', 'tasty'), 61), (('easy', 'make', 'made'), 61), (('serve

In [None]:
# Adjectives and adverbs only: filter each period's tokens before counting.
(
    tokens_adj_adv_1,
    tokens_adj_adv_2,
    tokens_adj_adv_3,
    tokens_adj_adv_4,
    tokens_adj_adv_5,
) = map(filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5))

# Unigrams: the 50 most frequent remaining tokens in each time period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('good', 9534), ('great', 9477), ('easy', 7712), ('recipe', 5892), ('really', 5377), ('delicious', 4190), ('little', 4068), ('wonderful', 3688), ('much', 3366), ('nice', 2867), ('instead', 2825), ('well', 2738), ('even', 2242), ('fresh', 2235), ('garlic', 2168), ('dish', 2156), ('tasty', 1908), ('excellent', 1676), ('cheese', 1665), ('first', 1573), ('quick', 1556), ('definitely', 1551), ('still', 1541), ('better', 1454), ('green', 1427), ('hot', 1409), ('best', 1395), ('last', 1370), ('sweet', 1362), ('together', 1309), ('simple', 1307), ('sure', 1247), ('enough', 1230), ('perfect', 1165), ('right', 1156), ('never', 1150), ('whole', 1136), ('different', 1125), ('red', 1073), ('ever', 1036), ('top', 1034), ('exactly', 1019), ('oven', 1015), ('quite', 1003), ('always', 963), ('extra', 932), ('big', 910), ('absolutely', 865), ('salad', 864), ('chicken', 846)]
[('great', 52023), ('good', 47863), ('easy', 38345), ('recipe', 32964), ('really', 30629), ('delicious', 23200), ('little', 22771

In [None]:
# Adjectives and adverbs only: bigram frequencies per time period.
# The pairs are built over the already-filtered token lists, so two words count
# as adjacent when they follow each other in the filtered sequence.
(
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

for period_bigrams in (
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 1084), (('really', 'good'), 950), (('quick', 'easy'), 674), (('good', 'easy'), 619), (('great', 'easy'), 454), (('easy', 'good'), 440), (('best', 'ever'), 420), (('easy', 'great'), 400), (('easy', 'delicious'), 380), (('good', 'great'), 377), (('easy', 'tasty'), 330), (('delicious', 'easy'), 317), (('recipe', 'great'), 313), (('even', 'better'), 313), (('easy', 'together'), 307), (('good', 'good'), 302), (('good', 'really'), 285), (('pretty', 'good'), 279), (('great', 'great'), 278), (('recipe', 'easy'), 274), (('recipe', 'exactly'), 273), (('absolutely', 'delicious'), 252), (('easy', 'quick'), 245), (('really', 'great'), 244), (('wonderful', 'easy'), 233), (('recipe', 'good'), 230), (('much', 'better'), 228), (('really', 'easy'), 226), (('easy', 'recipe'), 223), (('great', 'good'), 221), (('good', 'little'), 221), (('good', 'recipe'), 217), (('great', 'really'), 209), (('wonderful', 'great'), 182), (('tasty', 'easy'), 176), (('much', 'great'), 175), (('really', 

### Check number of steps 8 till 18 ###

In [None]:
# Recipes whose step count is between 8 and 18, both ends included.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_steps"].between(8, 18)]

# One concatenated review string per date range for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# NOTE(review): this unpacking assumes get_review_divides returns exactly five
# period strings -- confirm against the length of date_ranges_start_points.
tokens1, tokens2, tokens3, tokens4, tokens5 = map(tokenizer, review_divides)



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigram frequencies: the 50 most common tokens in each time period.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 16289), ('make', 8977), ('made', 8630), ('good', 8064), ('great', 7886), ('easy', 6227), ('time', 5289), ('really', 4674), ('one', 4619), ('like', 4397), ('sauce', 4229), ('would', 4178), ('delicious', 3955), ('wonderful', 3523), ('little', 3509), ('chicken', 3333), ('loved', 3331), ('dish', 3246), ('flavor', 3216), ('much', 2970), ('taste', 2800), ('instead', 2673), ('cheese', 2467), ('well', 2448), ('think', 2436), ('nice', 2410), ('served', 2392), ('family', 2387), ('try', 2382), ('could', 2097), ('bit', 2092), ('making', 2083), ('love', 2061), ('tasty', 2011), ('garlic', 1953), ('even', 1940), ('cream', 1925), ('cake', 1885), ('put', 1812), ('excellent', 1796), ('sharing', 1781), ('way', 1755), ('husband', 1747), ('fresh', 1745), ('minutes', 1735), ('bread', 1716), ('half', 1636), ('dinner', 1591), ('butter', 1582), ('thought', 1569)]
[('recipe', 106382), ('made', 63178), ('make', 56534), ('great', 51419), ('good', 49141), ('easy', 37382), ('time', 36576), ('really', 31

In [None]:
# Bigram frequencies: the 50 most common adjacent token pairs in each time period.
# Each bigram_tokensN is a one-shot iterator that is used up by its Counter below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('easy', 'make'), 2067), (('great', 'recipe'), 1687), (('really', 'good'), 742), (('followed', 'recipe'), 619), (('last', 'night'), 532), (('really', 'enjoyed'), 528), (('quick', 'easy'), 488), (('sour', 'cream'), 486), (('recipe', 'made'), 481), (('good', 'recipe'), 456), (('sharing', 'recipe'), 419), (('recipe', 'easy'), 410), (('made', 'recipe'), 402), (('definitely', 'make'), 398), (('recipe', 'exactly'), 387), (('family', 'loved'), 380), (('wonderful', 'recipe'), 377), (('easy', 'prepare'), 375), (('first', 'time'), 371), (('1/2', 'cup'), 350), (('cream', 'cheese'), 346), (('turned', 'great'), 332), (('good', 'easy'), 323), (('side', 'dish'), 319), (('time', 'make'), 318), (('really', 'liked'), 316), (('recipe', 'great'), 314), (('easy', 'recipe'), 306), (('brown', 'sugar'), 301), (('recipe', 'good'), 293), (('olive', 'oil'), 292), (('even', 'better'), 288), (('red', 'pepper'), 283), (('everyone', 'loved'), 282), (('put', 'together'), 279), (('excellent', 'recipe'), 275), (('wou

In [None]:
# Trigram frequencies: the 50 most common runs of three consecutive tokens
# in each time period.
# Each trigram_tokensN is a one-shot iterator that is used up by its Counter below.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 286), (('made', 'last', 'night'), 182), (('recipe', 'easy', 'make'), 143), (('easy', 'put', 'together'), 143), (('good', 'easy', 'make'), 136), (('cut', 'recipe', 'half'), 135), (('another', 'great', 'recipe'), 116), (('red', 'pepper', 'flakes'), 107), (('would', 'change', 'thing'), 106), (('vanilla', 'ice', 'cream'), 96), (('quick', 'easy', 'make'), 88), (('whole', 'family', 'loved'), 87), (('dinner', 'last', 'night'), 79), (('followed', 'directions', 'exactly'), 73), (('family', 'really', 'enjoyed'), 72), (('great', 'recipe', 'made'), 71), (('delicious', 'easy', 'make'), 71), (('great', 'recipe', 'easy'), 65), (('first', 'time', 'made'), 64), (('sharing', 'great', 'recipe'), 58), (('whole', 'wheat', 'flour'), 58), (('easy', 'make', 'good'), 57), (('otherwise', 'followed', 'recipe'), 57), (('easy', 'make', 'tasted'), 57), (('served', 'mashed', 'potatoes'), 55), (('easy', 'make', 'great'), 54), (('got', 'rave', 'reviews'), 52), (('made', 'recipe', '

In [None]:
# Adjectives and adverbs only: filter each period's tokens before counting.
(
    tokens_adj_adv_1,
    tokens_adj_adv_2,
    tokens_adj_adv_3,
    tokens_adj_adv_4,
    tokens_adj_adv_5,
) = map(filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5))

# Unigrams: the 50 most frequent remaining tokens in each time period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 7886), ('good', 7815), ('easy', 5981), ('recipe', 5410), ('really', 4674), ('delicious', 3910), ('little', 3437), ('wonderful', 3189), ('much', 2968), ('instead', 2673), ('well', 2416), ('nice', 2330), ('dish', 2075), ('even', 1940), ('fresh', 1690), ('garlic', 1622), ('cheese', 1620), ('tasty', 1535), ('first', 1492), ('excellent', 1443), ('definitely', 1428), ('still', 1409), ('best', 1314), ('sure', 1270), ('better', 1268), ('enough', 1214), ('oven', 1180), ('last', 1166), ('together', 1121), ('sweet', 1118), ('green', 1106), ('whole', 1091), ('never', 1076), ('quick', 1063), ('hot', 1054), ('top', 1050), ('right', 1046), ('exactly', 1033), ('ever', 998), ('perfect', 986), ('different', 967), ('quite', 929), ('red', 894), ('extra', 843), ('absolutely', 842), ('big', 831), ('simple', 826), ('chicken', 805), ('always', 803), ('fantastic', 718)]
[('great', 51419), ('good', 47451), ('recipe', 35903), ('easy', 35792), ('really', 31399), ('delicious', 24911), ('little', 23115),

In [None]:
# Adjectives and adverbs only: bigram frequencies per time period.
# The pairs are built over the already-filtered token lists, so two words count
# as adjacent when they follow each other in the filtered sequence.
(
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

for period_bigrams in (
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 1017), (('really', 'good'), 793), (('good', 'easy'), 478), (('best', 'ever'), 453), (('quick', 'easy'), 444), (('great', 'easy'), 326), (('easy', 'delicious'), 319), (('easy', 'good'), 312), (('easy', 'great'), 299), (('even', 'better'), 289), (('delicious', 'easy'), 276), (('good', 'great'), 266), (('good', 'good'), 261), (('recipe', 'great'), 259), (('good', 'really'), 257), (('recipe', 'exactly'), 245), (('absolutely', 'delicious'), 235), (('recipe', 'easy'), 234), (('great', 'great'), 226), (('easy', 'tasty'), 216), (('easy', 'together'), 209), (('pretty', 'good'), 207), (('recipe', 'good'), 206), (('good', 'little'), 203), (('great', 'really'), 203), (('great', 'good'), 197), (('really', 'great'), 197), (('really', 'easy'), 188), (('much', 'better'), 186), (('good', 'recipe'), 172), (('easy', 'recipe'), 170), (('really', 'enjoyed'), 161), (('still', 'great'), 158), (('wonderful', 'easy'), 157), (('easy', 'quick'), 155), (('delicious', 'great'), 144), (('grea

### Check number of steps larger than 18 ###

In [None]:
# Recipes with more than 18 steps. The comparison is strict so that recipes with
# exactly 18 steps are counted only once, in the "8 till 18" bucket, instead of
# appearing in both buckets.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_steps"] > 18]

# One concatenated review string per date range for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")  # progress marker: the merge/split finished, tokenizing starts next

# NOTE(review): this unpacking assumes get_review_divides returns exactly five
# period strings -- confirm against the length of date_ranges_start_points.
tokens1, tokens2, tokens3, tokens4, tokens5 = [tokenizer(x) for x in review_divides]



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigram frequencies: the 50 most common tokens in each time period.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 2202), ('made', 1254), ('make', 1125), ('great', 932), ('good', 867), ('time', 786), ('one', 597), ('cake', 597), ('easy', 567), ('like', 562), ('would', 542), ('sauce', 541), ('really', 503), ('wonderful', 476), ('much', 476), ('delicious', 454), ('little', 423), ('chicken', 420), ('well', 377), ('loved', 368), ('taste', 359), ('dish', 334), ('flavor', 328), ('try', 326), ('making', 316), ('think', 313), ('instead', 305), ('could', 293), ('even', 286), ('bit', 283), ('best', 271), ('family', 270), ('chocolate', 257), ('minutes', 252), ('nice', 250), ('love', 240), ('cream', 237), ('bread', 233), ('first', 233), ('perfect', 230), ('half', 225), ('served', 222), ('excellent', 220), ('way', 218), ('cheese', 217), ('dinner', 214), ('get', 210), ('pan', 209), ('butter', 209), ('soup', 208)]
[('recipe', 17120), ('made', 10502), ('make', 8913), ('great', 7177), ('good', 6788), ('time', 6280), ('one', 4770), ('really', 4731), ('sauce', 4506), ('like', 4444), ('would', 4252), ('eas

In [None]:
# Bigram frequencies: the 50 most common adjacent token pairs in each time period.
# Each bigram_tokensN is a one-shot iterator that is used up by its Counter below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 212), (('easy', 'make'), 177), (('recipe', 'made'), 86), (('followed', 'recipe'), 71), (('really', 'good'), 63), (('first', 'time'), 62), (('well', 'worth'), 60), (('wonderful', 'recipe'), 59), (('last', 'night'), 55), (('good', 'recipe'), 54), (('really', 'enjoyed'), 54), (('made', 'recipe'), 53), (('sour', 'cream'), 52), (('turned', 'great'), 47), (('time', 'make'), 46), (('recipe', 'exactly'), 46), (('easy', 'follow'), 45), (('cream', 'cheese'), 44), (('absolutely', 'delicious'), 44), (('recipe', 'easy'), 42), (('olive', 'oil'), 41), (('change', 'thing'), 40), (('definitely', 'make'), 40), (('time', 'consuming'), 39), (('ever', 'made'), 38), (('recipe', 'great'), 38), (('sharing', 'recipe'), 38), (('recipe', 'good'), 37), (('worth', 'effort'), 36), (('1/2', 'cup'), 35), (('mean', 'chef'), 35), (('would', 'make'), 34), (('lemon', 'juice'), 34), (('even', 'though'), 32), (('red', 'pepper'), 32), (('much', 'posting'), 32), (('cake', 'mix'), 32), (('chicken', 'bro

In [None]:
# Trigram frequencies: the 50 most common runs of three consecutive tokens
# in each time period.
# Each trigram_tokensN is a one-shot iterator that is used up by its Counter below.
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
    print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 30), (('well', 'worth', 'effort'), 17), (('made', 'last', 'night'), 17), (('would', 'change', 'thing'), 14), (('recipe', 'easy', 'make'), 12), (('easy', 'put', 'together'), 12), (('vanilla', 'ice', 'cream'), 12), (('cut', 'recipe', 'half'), 11), (('first', 'time', 'made'), 11), (('great', 'recipe', 'made'), 11), (('dough', 'easy', 'work'), 11), (('recipe', 'easy', 'follow'), 10), (('could', 'give', 'stars'), 10), (('little', 'time', 'consuming'), 10), (('good', 'easy', 'make'), 10), (('sharing', 'great', 'recipe'), 10), (('really', 'easy', 'make'), 10), (('made', 'recipe', 'exactly'), 10), (('got', 'rave', 'reviews'), 10), (('whole', 'family', 'loved'), 9), (('wish', 'could', 'give'), 9), (('followed', 'directions', 'exactly'), 9), (('much', 'posting', 'recipe'), 9), (('change', 'anything', 'recipe'), 9), (('sharing', 'wonderful', 'recipe'), 9), (('red', 'pepper', 'flakes'), 8), (('easy', 'make', 'made'), 8), (('recipe', 'great', 'recipe'), 8), (('b

In [None]:
# Adjectives and adverbs only: filter each period's tokens before counting.
(
    tokens_adj_adv_1,
    tokens_adj_adv_2,
    tokens_adj_adv_3,
    tokens_adj_adv_4,
    tokens_adj_adv_5,
) = map(filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5))

# Unigrams: the 50 most frequent remaining tokens in each time period.
for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
    print(Counter(period_adj_adv).most_common(50))

[('great', 932), ('good', 843), ('recipe', 701), ('easy', 538), ('really', 503), ('much', 476), ('delicious', 451), ('wonderful', 428), ('little', 420), ('well', 371), ('instead', 305), ('even', 286), ('best', 266), ('nice', 236), ('first', 231), ('dish', 208), ('ever', 205), ('never', 204), ('excellent', 182), ('still', 182), ('oven', 171), ('fresh', 170), ('perfect', 170), ('definitely', 170), ('enough', 169), ('garlic', 163), ('sure', 151), ('cheese', 147), ('right', 146), ('last', 144), ('better', 143), ('absolutely', 138), ('tasty', 137), ('exactly', 137), ('top', 133), ('quite', 130), ('worth', 129), ('whole', 128), ('sweet', 127), ('rich', 120), ('different', 119), ('together', 118), ('green', 115), ('red', 115), ('fantastic', 111), ('many', 108), ('extra', 107), ('hot', 104), ('always', 101), ('long', 100)]
[('great', 7177), ('good', 6568), ('recipe', 5742), ('really', 4731), ('easy', 3939), ('much', 3685), ('delicious', 3503), ('little', 3461), ('wonderful', 3114), ('well', 28

In [None]:
# Adjectives and adverbs only: bigram frequencies per time period.
# The pairs are built over the already-filtered token lists, so two words count
# as adjacent when they follow each other in the filtered sequence.
(
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
) = (
    nltk.bigrams(period_adj_adv)
    for period_adj_adv in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

for period_bigrams in (
    bigram_tokens_adj_adv_1,
    bigram_tokens_adj_adv_2,
    bigram_tokens_adj_adv_3,
    bigram_tokens_adj_adv_4,
    bigram_tokens_adj_adv_5,
):
    print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 125), (('best', 'ever'), 101), (('really', 'good'), 73), (('well', 'worth'), 46), (('absolutely', 'delicious'), 46), (('recipe', 'great'), 42), (('good', 'easy'), 38), (('easy', 'follow'), 32), (('recipe', 'exactly'), 32), (('good', 'good'), 31), (('good', 'really'), 29), (('good', 'little'), 27), (('easy', 'good'), 25), (('quick', 'easy'), 25), (('even', 'better'), 24), (('great', 'easy'), 24), (('recipe', 'easy'), 23), (('great', 'great'), 23), (('great', 'good'), 22), (('great', 'really'), 22), (('really', 'great'), 22), (('pretty', 'good'), 22), (('much', 'better'), 22), (('delicious', 'easy'), 21), (('really', 'easy'), 20), (('recipe', 'wonderful'), 20), (('wonderful', 'recipe'), 20), (('good', 'great'), 20), (('easy', 'recipe'), 19), (('much', 'recipe'), 19), (('easy', 'together'), 18), (('really', 'really'), 18), (('wonderful', 'great'), 17), (('easy', 'great'), 17), (('recipe', 'good'), 17), (('good', 'first'), 17), (('easy', 'delicious'), 16), (('easy', 

### Check number of ingredients till 5 ###


In [None]:
# Recipes that use at most 5 ingredients (5 itself included).
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_ingredients"].le(5)]

# One concatenated review string per date range for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")

# NOTE(review): this unpacking assumes get_review_divides returns exactly five
# period strings -- confirm against the length of date_ranges_start_points.
tokens1, tokens2, tokens3, tokens4, tokens5 = map(tokenizer, review_divides)



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# Unigram frequencies: the 50 most common tokens in each time period.
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
    print(Counter(period_tokens).most_common(50))

[('recipe', 4879), ('make', 3023), ('made', 2827), ('great', 2826), ('good', 2824), ('easy', 2813), ('like', 1639), ('time', 1631), ('really', 1569), ('one', 1473), ('would', 1300), ('little', 1240), ('loved', 1171), ('delicious', 1144), ('wonderful', 1075), ('taste', 1066), ('much', 1061), ('try', 998), ('flavor', 929), ('love', 849), ('nice', 789), ('well', 785), ('way', 784), ('cream', 777), ('sauce', 771), ('chicken', 764), ('think', 753), ('could', 730), ('tasty', 718), ('cheese', 715), ('served', 713), ('simple', 709), ('dish', 706), ('butter', 684), ('making', 675), ('bit', 672), ('garlic', 671), ('sharing', 666), ('sugar', 664), ('chocolate', 663), ('family', 659), ('yummy', 659), ('put', 655), ('even', 632), ('instead', 608), ('quick', 578), ('potatoes', 577), ('get', 571), ('sweet', 561), ('mix', 557)]
[('recipe', 29899), ('made', 19315), ('make', 18117), ('great', 17319), ('good', 16115), ('easy', 16067), ('time', 10279), ('like', 10145), ('really', 9607), ('one', 8392), ('w

In [None]:
# Bigram frequencies: the 50 most common adjacent token pairs in each time period.
# Each bigram_tokensN is a one-shot iterator that is used up by its Counter below.
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
    print(Counter(period_bigrams).most_common(50))

[(('easy', 'make'), 762), (('great', 'recipe'), 485), (('quick', 'easy'), 261), (('really', 'good'), 256), (('peanut', 'butter'), 204), (('recipe', 'made'), 187), (('ice', 'cream'), 173), (('last', 'night'), 171), (('sour', 'cream'), 162), (('cream', 'cheese'), 160), (('recipe', 'easy'), 153), (('side', 'dish'), 149), (('easy', 'recipe'), 148), (('good', 'recipe'), 147), (('good', 'easy'), 125), (('everyone', 'loved'), 124), (('easy', 'prepare'), 121), (('sharing', 'recipe'), 121), (('recipe', 'great'), 115), (('first', 'time'), 113), (('olive', 'oil'), 112), (('mashed', 'potatoes'), 108), (('family', 'loved'), 106), (('definitely', 'make'), 106), (('followed', 'recipe'), 106), (('made', 'recipe'), 104), (('1/2', 'cup'), 104), (('really', 'enjoyed'), 104), (('easy', 'tasty'), 102), (('wonderful', 'recipe'), 101), (('simple', 'make'), 101), (('put', 'together'), 98), (('easy', 'delicious'), 95), (('year', 'old'), 95), (('ranch', 'dressing'), 93), (('chocolate', 'chips'), 91), (('much', 

In [None]:
# trigrams: 50 most frequent runs of three consecutive tokens for each time period
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common(50))

[(('recipe', 'easy', 'make'), 54), (('followed', 'recipe', 'exactly'), 53), (('easy', 'put', 'together'), 52), (('made', 'last', 'night'), 52), (('vanilla', 'ice', 'cream'), 41), (('good', 'easy', 'make'), 38), (('onion', 'soup', 'mix'), 38), (('red', 'pepper', 'flakes'), 32), (('quick', 'easy', 'make'), 30), (('another', 'great', 'recipe'), 29), (('whole', 'family', 'loved'), 27), (('easy', 'make', 'great'), 26), (('first', 'time', 'made'), 26), (('sharing', 'great', 'recipe'), 26), (('cut', 'recipe', 'half'), 25), (('delicious', 'easy', 'make'), 25), (('easy', 'make', 'good'), 23), (('great', 'recipe', 'made'), 23), (('ranch', 'dressing', 'mix'), 23), (('made', 'several', 'times'), 22), (('dinner', 'last', 'night'), 21), (('would', 'change', 'thing'), 21), (('served', 'mashed', 'potatoes'), 21), (('great', 'side', 'dish'), 21), (('easy', 'make', 'tastes'), 20), (('cream', 'mushroom', 'soup'), 20), (('great', 'easy', 'recipe'), 20), (('sweetened', 'condensed', 'milk'), 19), (('halved'

In [None]:
# # #  adjectives and adverbs only # # #

# Run filter_adjectives_adverbs_only over every period's token list.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# unigrams: 50 most frequent filtered tokens, one printed line per period
for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_tokens).most_common(50))

[('great', 2826), ('good', 2739), ('easy', 2704), ('recipe', 1664), ('really', 1569), ('little', 1222), ('delicious', 1124), ('much', 1060), ('wonderful', 987), ('well', 779), ('nice', 768), ('even', 632), ('instead', 608), ('garlic', 565), ('tasty', 558), ('simple', 530), ('quick', 526), ('first', 494), ('fresh', 477), ('better', 459), ('dish', 447), ('cheese', 443), ('sweet', 440), ('still', 435), ('never', 423), ('definitely', 395), ('best', 384), ('sure', 383), ('hot', 373), ('perfect', 373), ('last', 367), ('always', 361), ('together', 358), ('excellent', 356), ('right', 356), ('enough', 353), ('top', 338), ('different', 332), ('oven', 304), ('green', 304), ('ever', 303), ('whole', 294), ('quite', 280), ('new', 279), ('low', 274), ('yummy', 268), ('big', 268), ('pretty', 265), ('maybe', 252), ('old', 251)]
[('great', 17319), ('good', 15551), ('easy', 15393), ('recipe', 10150), ('really', 9607), ('little', 7739), ('delicious', 7380), ('much', 6964), ('nice', 5057), ('wonderful', 49

In [None]:
# # #  adjectives and adverbs only # # #

# bigrams: pairs of tokens that are adjacent in the filtered token lists
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

# Print the 50 most frequent pairs, one line per period.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 295), (('really', 'good'), 272), (('quick', 'easy'), 233), (('good', 'easy'), 200), (('great', 'easy'), 176), (('easy', 'good'), 174), (('easy', 'great'), 161), (('easy', 'delicious'), 123), (('good', 'great'), 116), (('easy', 'tasty'), 109), (('great', 'great'), 102), (('recipe', 'easy'), 101), (('delicious', 'easy'), 98), (('much', 'better'), 96), (('good', 'really'), 90), (('recipe', 'great'), 88), (('best', 'ever'), 87), (('even', 'better'), 85), (('wonderful', 'easy'), 84), (('really', 'easy'), 73), (('good', 'little'), 72), (('easy', 'quick'), 71), (('pretty', 'good'), 70), (('easy', 'together'), 70), (('recipe', 'good'), 69), (('easy', 'recipe'), 69), (('good', 'good'), 67), (('absolutely', 'delicious'), 66), (('easy', 'really'), 63), (('good', 'recipe'), 61), (('much', 'great'), 57), (('great', 'good'), 56), (('simple', 'easy'), 56), (('great', 'little'), 55), (('easy', 'wonderful'), 54), (('really', 'nice'), 54), (('really', 'great'), 54), (('nice', 'eas

### Check number of ingredients 5 till 10 ###


In [None]:
# Recipes with 5 to 10 ingredients (both bounds inclusive).
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_ingredients"].between(5, 10)]
# One concatenated review text per time period for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")
# NOTE(review): get_review_divides returns one text per entry of
# date_ranges_start_points; unpacking into five names assumes five periods — confirm.
tokens1, tokens2, tokens3, tokens4, tokens5 = map(tokenizer, review_divides)



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# unigrams: 50 most frequent tokens, one printed line per time period
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common(50))

[('recipe', 20445), ('make', 11396), ('made', 11080), ('good', 10767), ('great', 10619), ('easy', 9230), ('time', 6646), ('really', 5966), ('one', 5800), ('like', 5651), ('would', 5064), ('delicious', 4976), ('little', 4599), ('wonderful', 4565), ('loved', 4439), ('sauce', 4289), ('chicken', 4263), ('flavor', 4191), ('dish', 3989), ('taste', 3903), ('much', 3750), ('nice', 3318), ('served', 3262), ('cheese', 3192), ('try', 3171), ('think', 3153), ('instead', 3152), ('well', 3119), ('garlic', 3009), ('family', 2938), ('tasty', 2811), ('bit', 2760), ('love', 2702), ('making', 2678), ('could', 2653), ('cream', 2523), ('even', 2517), ('put', 2425), ('sharing', 2419), ('fresh', 2395), ('way', 2321), ('excellent', 2269), ('bread', 2246), ('butter', 2199), ('minutes', 2149), ('husband', 2126), ('potatoes', 2065), ('dinner', 2052), ('sugar', 2038), ('thought', 2005)]
[('recipe', 119698), ('made', 70707), ('make', 64242), ('great', 60490), ('good', 57518), ('easy', 47935), ('time', 40376), ('re

In [None]:
# bigrams: 50 most frequent adjacent token pairs for each time period
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common(50))

[(('easy', 'make'), 2855), (('great', 'recipe'), 2169), (('really', 'good'), 986), (('quick', 'easy'), 811), (('followed', 'recipe'), 718), (('last', 'night'), 690), (('recipe', 'made'), 682), (('sour', 'cream'), 658), (('really', 'enjoyed'), 639), (('good', 'recipe'), 615), (('easy', 'prepare'), 599), (('side', 'dish'), 574), (('recipe', 'easy'), 553), (('made', 'recipe'), 510), (('sharing', 'recipe'), 502), (('recipe', 'exactly'), 500), (('family', 'loved'), 491), (('wonderful', 'recipe'), 482), (('good', 'easy'), 480), (('definitely', 'make'), 471), (('cream', 'cheese'), 466), (('olive', 'oil'), 463), (('first', 'time'), 461), (('1/2', 'cup'), 441), (('easy', 'recipe'), 439), (('put', 'together'), 435), (('really', 'liked'), 422), (('recipe', 'great'), 417), (('everyone', 'loved'), 403), (('mashed', 'potatoes'), 398), (('time', 'make'), 390), (('turned', 'great'), 384), (('ice', 'cream'), 361), (('brown', 'sugar'), 361), (('lemon', 'juice'), 344), (('even', 'better'), 342), (('peanu

In [None]:
# trigrams: 50 most frequent runs of three consecutive tokens for each time period
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 361), (('easy', 'put', 'together'), 238), (('made', 'last', 'night'), 225), (('recipe', 'easy', 'make'), 188), (('good', 'easy', 'make'), 175), (('another', 'great', 'recipe'), 157), (('cut', 'recipe', 'half'), 147), (('would', 'change', 'thing'), 138), (('quick', 'easy', 'make'), 132), (('whole', 'family', 'loved'), 120), (('red', 'pepper', 'flakes'), 120), (('vanilla', 'ice', 'cream'), 112), (('delicious', 'easy', 'make'), 99), (('followed', 'directions', 'exactly'), 99), (('great', 'recipe', 'made'), 98), (('great', 'recipe', 'easy'), 91), (('easy', 'make', 'great'), 89), (('first', 'time', 'made'), 89), (('dinner', 'last', 'night'), 88), (('easy', 'make', 'good'), 86), (('sharing', 'great', 'recipe'), 83), (('family', 'really', 'enjoyed'), 77), (('recipe', 'quick', 'easy'), 75), (('easy', 'make', 'made'), 73), (('served', 'mashed', 'potatoes'), 73), (('onion', 'soup', 'mix'), 69), (('made', 'recipe', 'exactly'), 69), (('got', 'rave', 'reviews'),

In [None]:
# # #  adjectives and adverbs only # # #

# Run filter_adjectives_adverbs_only over every period's token list.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# unigrams: 50 most frequent filtered tokens, one printed line per period
for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_tokens).most_common(50))

[('great', 10619), ('good', 10455), ('easy', 8852), ('recipe', 6834), ('really', 5966), ('delicious', 4925), ('little', 4501), ('wonderful', 4152), ('much', 3744), ('nice', 3205), ('instead', 3152), ('well', 3080), ('even', 2517), ('garlic', 2511), ('dish', 2453), ('fresh', 2322), ('tasty', 2136), ('cheese', 2028), ('excellent', 1854), ('first', 1843), ('definitely', 1782), ('still', 1740), ('quick', 1708), ('best', 1604), ('better', 1590), ('last', 1506), ('sweet', 1505), ('green', 1474), ('together', 1445), ('sure', 1431), ('enough', 1426), ('never', 1385), ('oven', 1377), ('simple', 1370), ('whole', 1325), ('perfect', 1320), ('right', 1314), ('hot', 1310), ('top', 1293), ('exactly', 1257), ('ever', 1228), ('different', 1224), ('quite', 1136), ('always', 1091), ('big', 1076), ('red', 1064), ('extra', 1060), ('absolutely', 1030), ('chicken', 964), ('pretty', 955)]
[('great', 60490), ('good', 55572), ('easy', 45940), ('recipe', 40580), ('really', 35813), ('delicious', 28480), ('little'

In [None]:
# # #  adjectives and adverbs only # # #

# bigrams: pairs of tokens that are adjacent in the filtered token lists
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

# Print the 50 most frequent pairs, one line per period.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 1260), (('really', 'good'), 1082), (('quick', 'easy'), 714), (('good', 'easy'), 697), (('great', 'easy'), 529), (('best', 'ever'), 505), (('easy', 'good'), 497), (('easy', 'great'), 479), (('easy', 'delicious'), 446), (('good', 'great'), 403), (('delicious', 'easy'), 357), (('recipe', 'great'), 345), (('even', 'better'), 343), (('easy', 'tasty'), 342), (('good', 'really'), 339), (('recipe', 'easy'), 339), (('good', 'good'), 333), (('easy', 'together'), 331), (('recipe', 'exactly'), 323), (('absolutely', 'delicious'), 316), (('pretty', 'good'), 311), (('great', 'great'), 297), (('easy', 'quick'), 274), (('great', 'good'), 272), (('really', 'easy'), 268), (('recipe', 'good'), 264), (('really', 'great'), 256), (('good', 'little'), 254), (('wonderful', 'easy'), 253), (('much', 'better'), 251), (('great', 'really'), 248), (('good', 'recipe'), 243), (('easy', 'recipe'), 237), (('wonderful', 'great'), 204), (('much', 'great'), 203), (('good', 'nice'), 200), (('easy', 'r

### Check number of ingredients 15 till 20 ###


In [None]:
# Recipes with 15 to 20 ingredients (both bounds inclusive).
# NOTE(review): 20-ingredient recipes are also selected by the ">= 20" bucket
# further down — confirm the overlap is intended.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_ingredients"].between(15, 20)]
# One concatenated review text per time period for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")
# NOTE(review): get_review_divides returns one text per entry of
# date_ranges_start_points; unpacking into five names assumes five periods — confirm.
tokens1, tokens2, tokens3, tokens4, tokens5 = map(tokenizer, review_divides)



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# unigrams: 50 most frequent tokens, one printed line per time period
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common(50))

[('recipe', 2331), ('make', 1183), ('made', 1157), ('good', 1029), ('sauce', 1023), ('great', 982), ('time', 816), ('one', 644), ('like', 605), ('would', 603), ('really', 592), ('easy', 536), ('chicken', 496), ('delicious', 491), ('flavor', 486), ('dish', 480), ('wonderful', 476), ('loved', 456), ('much', 452), ('little', 446), ('instead', 386), ('taste', 378), ('well', 374), ('think', 373), ('served', 363), ('soup', 349), ('family', 321), ('try', 307), ('nice', 302), ('bit', 302), ('could', 296), ('even', 292), ('cheese', 289), ('meat', 283), ('love', 278), ('excellent', 277), ('making', 274), ('rice', 272), ('half', 271), ('cooked', 270), ('pepper', 255), ('1/2', 250), ('fresh', 250), ('tasty', 250), ('beef', 249), ('dinner', 246), ('red', 242), ('husband', 241), ('cut', 236), ('ingredients', 235)]
[('recipe', 14508), ('made', 8531), ('make', 6971), ('great', 6641), ('good', 6269), ('sauce', 5749), ('time', 5117), ('really', 4430), ('would', 3960), ('one', 3952), ('like', 3950), ('lo

In [None]:
# bigrams: 50 most frequent adjacent token pairs for each time period
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 227), (('easy', 'make'), 174), (('followed', 'recipe'), 91), (('italian', 'sausage'), 81), (('recipe', 'made'), 80), (('ground', 'beef'), 78), (('sour', 'cream'), 76), (('really', 'good'), 76), (('red', 'pepper'), 75), (('really', 'enjoyed'), 74), (('last', 'night'), 68), (('good', 'recipe'), 62), (('wonderful', 'recipe'), 60), (('definitely', 'make'), 59), (('made', 'recipe'), 59), (('time', 'make'), 58), (('1/2', 'cup'), 57), (('family', 'loved'), 55), (('brown', 'sugar'), 54), (('spaghetti', 'sauce'), 53), (('turned', 'great'), 51), (('recipe', 'good'), 49), (('really', 'liked'), 49), (('recipe', 'exactly'), 47), (('first', 'time'), 46), (('easy', 'prepare'), 44), (('would', 'make'), 43), (('excellent', 'recipe'), 43), (('great', 'flavor'), 42), (('recipe', 'easy'), 42), (('red', 'wine'), 42), (('recipe', 'great'), 41), (('tomato', 'sauce'), 40), (('quick', 'easy'), 40), (('think', 'would'), 39), (('everyone', 'loved'), 38), (('even', 'better'), 38), (('ever',

In [None]:
# trigrams: 50 most frequent runs of three consecutive tokens for each time period
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common(50))

[(('followed', 'recipe', 'exactly'), 36), (('red', 'pepper', 'flakes'), 26), (('cut', 'recipe', 'half'), 24), (('made', 'last', 'night'), 21), (('whole', 'family', 'loved'), 17), (('hot', 'italian', 'sausage'), 16), (('would', 'change', 'thing'), 16), (('another', 'great', 'recipe'), 15), (('recipe', 'easy', 'make'), 14), (('great', 'recipe', 'made'), 14), (('dinner', 'last', 'night'), 14), (('good', 'luck', 'contest'), 14), (('crushed', 'red', 'pepper'), 13), (('lb', 'ground', 'beef'), 13), (('easy', 'put', 'together'), 13), (('good', 'easy', 'make'), 11), (('cup', 'red', 'wine'), 11), (('lean', 'ground', 'beef'), 10), (('well', 'worth', 'effort'), 10), (('otherwise', 'followed', 'recipe'), 10), (('mild', 'italian', 'sausage'), 10), (('served', 'mashed', 'potatoes'), 10), (('second', 'time', 'made'), 9), (('got', 'rave', 'reviews'), 9), (('little', 'time', 'consuming'), 9), (('easy', 'make', 'made'), 9), (('red', 'bell', 'pepper'), 9), (('followed', 'directions', 'exactly'), 9), (('yu

In [None]:
# # #  adjectives and adverbs only # # #

# Run filter_adjectives_adverbs_only over every period's token list.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# unigrams: 50 most frequent filtered tokens, one printed line per period
for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_tokens).most_common(50))

[('good', 1005), ('great', 982), ('recipe', 787), ('really', 592), ('easy', 512), ('delicious', 481), ('much', 452), ('little', 437), ('wonderful', 426), ('instead', 386), ('well', 368), ('dish', 311), ('even', 292), ('nice', 291), ('fresh', 243), ('hot', 229), ('definitely', 227), ('excellent', 224), ('best', 219), ('red', 218), ('still', 211), ('first', 198), ('cheese', 195), ('tasty', 195), ('garlic', 194), ('enough', 193), ('green', 182), ('better', 180), ('ever', 173), ('sure', 170), ('last', 163), ('different', 162), ('together', 162), ('never', 157), ('italian', 156), ('quite', 152), ('sweet', 145), ('whole', 141), ('right', 140), ('extra', 139), ('chicken', 134), ('perfect', 134), ('exactly', 131), ('absolutely', 130), ('many', 119), ('fantastic', 114), ('top', 112), ('always', 104), ('salad', 102), ('white', 102)]
[('great', 6641), ('good', 6037), ('recipe', 4889), ('really', 4430), ('much', 3190), ('delicious', 3088), ('instead', 3052), ('easy', 2932), ('little', 2923), ('won

In [None]:
# # #  adjectives and adverbs only # # #

# bigrams: pairs of tokens that are adjacent in the filtered token lists
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

# Print the 50 most frequent pairs, one line per period.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 125), (('best', 'ever'), 98), (('really', 'good'), 79), (('good', 'easy'), 42), (('even', 'better'), 40), (('quick', 'easy'), 36), (('recipe', 'exactly'), 36), (('recipe', 'great'), 35), (('absolutely', 'delicious'), 33), (('good', 'good'), 33), (('good', 'really'), 31), (('pretty', 'good'), 31), (('easy', 'good'), 29), (('recipe', 'good'), 28), (('great', 'great'), 27), (('well', 'worth'), 27), (('good', 'great'), 26), (('really', 'easy'), 26), (('good', 'little'), 26), (('still', 'great'), 25), (('really', 'nice'), 25), (('much', 'better'), 24), (('great', 'dish'), 24), (('easy', 'delicious'), 23), (('great', 'really'), 23), (('great', 'easy'), 22), (('easy', 'tasty'), 22), (('good', 'recipe'), 22), (('wonderful', 'recipe'), 21), (('really', 'enjoyed'), 21), (('easy', 'great'), 20), (('easy', 'follow'), 20), (('much', 'good'), 20), (('recipe', 'easy'), 20), (('great', 'good'), 20), (('still', 'good'), 20), (('dish', 'great'), 20), (('recipe', 'really'), 20), ((

### Check number of ingredients 20 or larger ###


In [None]:
# Recipes with 20 or more ingredients.
filtered_df = PP_RECIPES_DF.loc[PP_RECIPES_DF["n_ingredients"].ge(20)]
# One concatenated review text per time period for the selected recipes.
review_divides = get_review_divides(filtered_df)
print("hi")
# NOTE(review): get_review_divides returns one text per entry of
# date_ranges_start_points; unpacking into five names assumes five periods — confirm.
tokens1, tokens2, tokens3, tokens4, tokens5 = map(tokenizer, review_divides)



hi
done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


In [None]:
# unigrams: 50 most frequent tokens, one printed line per time period
for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5):
  print(Counter(period_tokens).most_common(50))

[('recipe', 280), ('made', 156), ('make', 129), ('great', 120), ('good', 117), ('sauce', 100), ('chili', 100), ('time', 98), ('like', 87), ('one', 68), ('would', 65), ('wonderful', 64), ('really', 57), ('delicious', 57), ('dish', 57), ('little', 53), ('much', 50), ('best', 50), ('chicken', 48), ('ingredients', 46), ('flavor', 45), ('family', 44), ('cheese', 44), ('instead', 43), ('meat', 42), ('beans', 42), ('well', 42), ('think', 40), ('could', 40), ('1/2', 40), ('loved', 40), ('hot', 39), ('taste', 38), ('pepper', 38), ('put', 37), ('easy', 37), ('served', 37), ('salad', 37), ('even', 36), ('left', 35), ('bit', 35), ('nice', 34), ('excellent', 33), ('soup', 31), ('beef', 31), ('spices', 31), ('love', 31), ('perfect', 31), ('thought', 30), ('day', 30)]
[('recipe', 2394), ('made', 1366), ('make', 1050), ('great', 971), ('good', 969), ('sauce', 910), ('time', 879), ('really', 689), ('like', 681), ('one', 650), ('chili', 622), ('would', 554), ('chicken', 550), ('loved', 538), ('much', 49

In [None]:
# bigrams: 50 most frequent adjacent token pairs for each time period
bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_bigrams in (bigram_tokens1, bigram_tokens2, bigram_tokens3, bigram_tokens4, bigram_tokens5):
  print(Counter(period_bigrams).most_common(50))

[(('great', 'recipe'), 31), (('sour', 'cream'), 14), (('chili', 'powder'), 13), (('easy', 'make'), 11), (('made', 'recipe'), 11), (('wonderful', 'recipe'), 10), (('many', 'times'), 9), (('ever', 'made'), 9), (('well', 'worth'), 9), (('last', 'night'), 9), (('ground', 'beef'), 8), (('definitely', 'make'), 8), (('change', 'thing'), 8), (('really', 'enjoyed'), 8), (('followed', 'recipe'), 8), (('best', 'chili'), 8), (('really', 'good'), 7), (('1/2', 'cup'), 7), (('hot', 'sauce'), 7), (('change', 'made'), 7), (('followed', 'directions'), 7), (('time', 'make'), 7), (('kidney', 'beans'), 7), (('chili', 'recipe'), 7), (('brown', 'sugar'), 7), (('could', 'find'), 6), (('recipe', 'really'), 6), (('red', 'pepper'), 6), (('put', 'together'), 6), (('family', 'loved'), 6), (('cheese', 'top'), 6), (('recipe', 'made'), 6), (('chili', 'ever'), 6), (('worth', 'effort'), 6), (('made', 'changes'), 6), (('first', 'time'), 6), (('cups', 'water'), 6), (('good', 'recipe'), 6), (('going', 'make'), 6), (('grea

In [None]:
# trigrams: 50 most frequent runs of three consecutive tokens for each time period
trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5 = (
    nltk.trigrams(period_tokens)
    for period_tokens in (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# One printed line per period, in chronological order.
for period_trigrams in (trigram_tokens1, trigram_tokens2, trigram_tokens3, trigram_tokens4, trigram_tokens5):
  print(Counter(period_trigrams).most_common(50))

[(('best', 'chili', 'ever'), 6), (('well', 'worth', 'effort'), 4), (('worth', 'time', 'effort'), 3), (('cans', 'kidney', 'beans'), 3), (('made', 'last', 'night'), 3), (('left', 'browning', 'sauce'), 3), (('red', 'pepper', 'flakes'), 3), (('cut', 'recipe', 'half'), 3), (('sour', 'cream', 'top'), 3), (('recipe', 'many', 'times'), 2), (('lean', 'ground', 'beef'), 2), (('long', 'list', 'ingredients'), 2), (('soup', 'absolutely', 'wonderful'), 2), (('even', 'better', 'day'), 2), (('family', 'loved', 'husband'), 2), (('like', 'refried', 'beans'), 2), (('cooked', '1/2', 'hours'), 2), (('easy', 'put', 'together'), 2), (('would', 'change', 'thing'), 2), (('whole', 'family', 'loved'), 2), (('followed', 'directions', 'exactly'), 2), (('jelly', 'roll', 'pan'), 2), (('spaghetti', 'chili', 'cheese'), 2), (('chili', 'cheese', 'onions'), 2), (('ds', 'said', 'best'), 2), (('black', 'pepper', 'powder'), 2), (('red', 'chilli', 'powder'), 2), (('green', 'bell', 'pepper'), 2), (('tastes', 'much', 'like'), 

In [None]:
# # #  adjectives and adverbs only # # #

# Run filter_adjectives_adverbs_only over every period's token list.
tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5 = map(
    filter_adjectives_adverbs_only, (tokens1, tokens2, tokens3, tokens4, tokens5)
)

# unigrams: 50 most frequent filtered tokens, one printed line per period
for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5):
  print(Counter(period_tokens).most_common(50))

[('great', 120), ('good', 114), ('recipe', 94), ('really', 57), ('wonderful', 56), ('delicious', 56), ('little', 51), ('much', 50), ('best', 48), ('instead', 43), ('well', 41), ('dish', 39), ('hot', 37), ('even', 36), ('easy', 34), ('nice', 33), ('cheese', 30), ('ever', 29), ('excellent', 28), ('definitely', 27), ('red', 26), ('sure', 24), ('garlic', 24), ('better', 24), ('last', 23), ('many', 23), ('absolutely', 23), ('first', 22), ('whole', 21), ('enough', 21), ('perfect', 20), ('worth', 20), ('never', 19), ('right', 19), ('still', 18), ('different', 17), ('tasty', 17), ('sweet', 17), ('fabulous', 17), ('favorite', 16), ('long', 16), ('oven', 16), ('sour', 16), ('green', 16), ('salad', 16), ('fresh', 15), ('exactly', 15), ('together', 14), ('top', 14), ('chili', 14)]
[('great', 971), ('good', 937), ('recipe', 776), ('really', 689), ('much', 492), ('delicious', 440), ('instead', 437), ('well', 412), ('little', 409), ('wonderful', 388), ('even', 341), ('easy', 312), ('first', 272), ('b

In [None]:
# # #  adjectives and adverbs only # # #

# bigrams: pairs of tokens that are adjacent in the filtered token lists
bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5 = (
    nltk.bigrams(period_tokens)
    for period_tokens in (tokens_adj_adv_1, tokens_adj_adv_2, tokens_adj_adv_3, tokens_adj_adv_4, tokens_adj_adv_5)
)

# Print the 50 most frequent pairs, one line per period.
for period_bigrams in (bigram_tokens_adj_adv_1, bigram_tokens_adj_adv_2, bigram_tokens_adj_adv_3, bigram_tokens_adj_adv_4, bigram_tokens_adj_adv_5):
  print(Counter(period_bigrams).most_common(50))

[(('best', 'ever'), 18), (('great', 'recipe'), 15), (('really', 'good'), 8), (('well', 'worth'), 8), (('good', 'great'), 7), (('pretty', 'good'), 6), (('recipe', 'great'), 6), (('absolutely', 'delicious'), 5), (('absolutely', 'fabulous'), 5), (('cheese', 'top'), 5), (('good', 'little'), 5), (('good', 'recipe'), 4), (('garlic', 'garlic'), 4), (('little', 'sweet'), 4), (('really', 'nice'), 4), (('absolutely', 'wonderful'), 4), (('great', 'great'), 4), (('really', 'really'), 4), (('recipe', 'wonderful'), 4), (('wonderful', 'recipe'), 4), (('good', 'best'), 4), (('recipe', 'good'), 4), (('really', 'well'), 4), (('half', 'instead'), 3), (('great', 'dish'), 3), (('much', 'good'), 3), (('overall', 'good'), 3), (('still', 'wonderful'), 3), (('wonderful', 'even'), 3), (('even', 'better'), 3), (('tasty', 'recipe'), 3), (('definitely', 'keeper'), 3), (('good', 'good'), 3), (('much', 'better'), 3), (('tsp', 'black'), 3), (('good', 'better'), 3), (('good', 'certainly'), 3), (('dark', 'brown'), 3), 

## Check Sugar ##

### sugar freq in reviews ###

In [None]:
date_ranges_start_points = ['2000-01-01','2003-03-01', '2006-05-01', '2009-07-01', '2012-09-01','2015-11-01']
end_point = '2019-01-01'


### stats per divide ####
# Split all interactions into time periods, then keep, per period, only the
# reviews whose lower-cased text contains "sugar".

filtered_interactions_sorted = PP_INTERACTIONS_DF.sort_values("date")
# Each period runs from its start point up to (excluding) the next start point;
# the final period is closed by end_point.
period_end_points = date_ranges_start_points[1:] + [end_point]
array_of_divides = []
for period_start, period_end in zip(date_ranges_start_points, period_end_points):
  review_dates = filtered_interactions_sorted["date"]
  in_period = (review_dates >= period_start) & (review_dates < period_end)
  # Missing values become "" — presumably so the .str filter below yields a clean boolean mask.
  cur_df = filtered_interactions_sorted.loc[in_period].fillna("")
  array_of_divides.append(cur_df)

array_of_divides_of_sugar = [
    period_df.loc[period_df['review'].str.lower().str.contains("sugar")]
    for period_df in array_of_divides
]


here


In [None]:
# Per-period row counts: reviews mentioning sugar, and all reviews.
sugar_amount = list(map(len, array_of_divides_of_sugar))
total_amount = list(map(len, array_of_divides))
# Share of each period's reviews that mention sugar (a fraction between 0 and 1).
sugar_percentage = [with_sugar / total_amount[idx] for idx, with_sugar in enumerate(sugar_amount)]
print("absolute amount of reviews contains sugar per period: {0}".format(sugar_amount))
print("absolute amount of reviews per period: {0}".format(total_amount))
print("percentage of reviews which contains sugar per period: {0}".format(sugar_percentage))





absolute amount of reviews contains sugar per period: [1283, 9757, 29506, 20675, 8779, 5915]
absolute amount of reviews per period: [28980, 166699, 455054, 294278, 118060, 69296]
percentage of reviews which contains sugar per period: [0.044271911663216014, 0.05853064505485936, 0.06484065627376091, 0.07025669604931391, 0.0743604946637303, 0.08535846224890326]


### dessert freq in comments ###

In [None]:
# Split all interactions into time periods and measure how many reviews in each
# period mention "dessert".
filtered_interactions_sorted = PP_INTERACTIONS_DF.sort_values("date")
# Each period runs from its start point up to (excluding) the next start point;
# the final period is closed by end_point.
period_end_points = date_ranges_start_points[1:] + [end_point]
array_of_divides = []
for period_start, period_end in zip(date_ranges_start_points, period_end_points):
  review_dates = filtered_interactions_sorted["date"]
  in_period = (review_dates >= period_start) & (review_dates < period_end)
  # Missing values become "" — presumably so the .str filter below yields a clean boolean mask.
  cur_df = filtered_interactions_sorted.loc[in_period].fillna("")
  array_of_divides.append(cur_df)

# Any review containing "desserts" also contains "dessert", so one substring
# test selects exactly the same rows as testing both spellings.
array_of_divides_of_dessert = [df.loc[df['review'].str.lower().str.contains("dessert")] for df in array_of_divides]

# Per-period row counts and the resulting share (a fraction between 0 and 1).
dessert_amount = [len(x) for x in array_of_divides_of_dessert]
total_amount = [len(x) for x in array_of_divides]
dessert_percentage = [matched / total for matched, total in zip(dessert_amount, total_amount)]
print("absolute amount of reviews contains dessert per period: {0}".format(dessert_amount))
print("absolute amount of reviews per period: {0}".format(total_amount))
print("percentage of reviews which contains dessert per period: {0}".format(dessert_percentage))


here
absolute amount of reviews contains dessert per period: [555, 3158, 7691, 3975, 1321, 536]
absolute amount of reviews per period: [28980, 166699, 455054, 294278, 118060, 69296]
percentage of reviews which contains dessert per period: [0.019151138716356108, 0.018944324800988607, 0.016901290835812894, 0.01350763563705068, 0.011189225817380993, 0.00773493419533595]


### sweet freq in reviews ###

In [None]:
# Split all interactions into time periods and measure how many reviews in each
# period mention "sweet".
filtered_interactions_sorted = PP_INTERACTIONS_DF.sort_values("date")
# Each period runs from its start point up to (excluding) the next start point;
# the final period is closed by end_point.
period_end_points = date_ranges_start_points[1:] + [end_point]
array_of_divides = []
for period_start, period_end in zip(date_ranges_start_points, period_end_points):
  review_dates = filtered_interactions_sorted["date"]
  in_period = (review_dates >= period_start) & (review_dates < period_end)
  # Missing values become "" — presumably so the .str filter below yields a clean boolean mask.
  cur_df = filtered_interactions_sorted.loc[in_period].fillna("")
  array_of_divides.append(cur_df)

print("here")
# Any review containing "sweets" also contains "sweet", so one substring test
# selects exactly the same rows as testing both spellings.
array_of_divides_of_sweet = [df.loc[df['review'].str.lower().str.contains("sweet")] for df in array_of_divides]

# Per-period row counts and the resulting share (a fraction between 0 and 1).
sweet_amount = [len(x) for x in array_of_divides_of_sweet]
total_amount = [len(x) for x in array_of_divides]
sweet_percentage = [matched / total for matched, total in zip(sweet_amount, total_amount)]
print("absolute amount of reviews contains sweet per period: {0}".format(sweet_amount))
print("absolute amount of reviews per period: {0}".format(total_amount))
# Label fixed: these are the "sweet" shares, not the "dessert" ones.
print("percentage of reviews which contains sweet per period: {0}".format(sweet_percentage))


here
absolute amount of reviews contains sweet per period: [1722, 11475, 33217, 22294, 8851, 4354]
absolute amount of reviews per period: [28980, 166699, 455054, 294278, 118060, 69296]
percentage of reviews which contains dessert per period: [0.059420289855072465, 0.068836645690736, 0.07299573237461927, 0.07575829657670638, 0.07497035405725902, 0.06283190948972524]


### dessert/sweet tag freqs in reviewed recipes ###

In [None]:
def convert_tags_field_to_list(df):
    """Turn the stringified ``tags`` column into lists of unique tags.

    Each cell of ``df["tags"]`` is expected to be the text form of a Python
    list of strings, e.g. ``"['a', 'b']"``. The column is overwritten in place
    with a de-duplicated list per row (order is not preserved because the
    values pass through a ``set``).

    Args:
        df: DataFrame with a ``tags`` column of stringified lists.

    Returns:
        The same DataFrame object, with ``tags`` replaced.
    """
    # Function-scope import so the cell stays runnable on its own.
    import ast

    def _parse_tags(tags_str):
        # literal_eval copes with cases the old slice-and-split got wrong:
        # "[]" (used to become ['']) and tags the list repr wraps in double
        # quotes because they contain an apostrophe.
        try:
            parsed = ast.literal_eval(tags_str)
        except (ValueError, SyntaxError):
            parsed = None
        if not isinstance(parsed, (list, tuple, set)):
            # Not a well-formed list literal: keep the legacy parsing.
            parsed = tags_str[2:-2].split("', '")
        return list(set(parsed))

    df["tags"] = [_parse_tags(tags_str) for tags_str in df["tags"].tolist()]
    return df


# Tag names that mark a recipe as dessert/sweet: first column of the CSV.
desserts_sweets_tags_path = adapt_path_to_drive(r"data/tags_categories/dessert_or_sweet.csv")
dessert_sweet_tags_csv = pd.read_csv(desserts_sweets_tags_path, encoding="utf-8")
tags_set = set(dessert_sweet_tags_csv.iloc[:, 0].tolist())

# The tags column is parsed only while it still holds strings, so re-running
# this cell does not parse already-converted lists a second time.
if isinstance(PP_RECIPES_DF["tags"][0], str):
  PP_RECIPES_DF = convert_tags_field_to_list(PP_RECIPES_DF)

# True when the recipe carries at least one tag from tags_set.
PP_RECIPES_DF["is_dessert_sweet_tags"] = PP_RECIPES_DF["tags"].apply(
    lambda recipe_tags: not tags_set.isdisjoint(recipe_tags)
)

cols_to_keep = [
    "recipe_id",
    "is_dessert_sweet_tags",
    "calories",
    "total_fat_pdv",
    "sugar_pdv",
    "sodium_pdv",
    "protein_pdv",
    "saturated_fat_pdv",
    "carbohydrates_pdv",
]

# Left join: every interaction is kept and gains its recipe's flag and nutrition columns.
enriched_interactions = pd.merge(
    PP_INTERACTIONS_DF,
    PP_RECIPES_DF[cols_to_keep],
    how="left",
    on="recipe_id",
)


In [None]:
# Slice the enriched interactions into one frame per review period. A period
# covers start <= date < next start; the final period stops at end_point.
# Note: no "sweet" filtering happens here - each frame holds all of the
# period's reviews, with missing values blanked.
filtered_interactions_sorted = enriched_interactions.sort_values("date")
period_bounds = date_ranges_start_points + [end_point]
review_dates = filtered_interactions_sorted["date"]
array_of_divides_of_sweet = [
    filtered_interactions_sorted.loc[(review_dates >= lower) & (review_dates < upper)].fillna("")
    for lower, upper in zip(period_bounds[:-1], period_bounds[1:])
]

# print stats of is_dessert_sweet_tags per period
# print describe per period


In [None]:
# array_of_divides_of_sweet holds, per period, every enriched review (it is
# built from enriched_interactions without any text filter). Numerator and
# denominator are both taken from these frames so the ratio does not depend on
# array_of_divides, which is produced by a different cell from a different frame.
dessert_sweet_tags_amount = [len(x.loc[x["is_dessert_sweet_tags"]]) for x in array_of_divides_of_sweet]
total_amount = [len(x) for x in array_of_divides_of_sweet]
# Fraction (0..1) of each period's reviews written for dessert/sweet tagged recipes.
dessert_sweet_tags_percentage = [
    tagged / total for tagged, total in zip(dessert_sweet_tags_amount, total_amount)
]
print("absolute amount of reviews for dessert/sweet tagged recipes per period: {0}".format(dessert_sweet_tags_amount))
print("absolute amount of reviews per period: {0}".format(total_amount))
print("percentage of reviews for dessert/sweet tagged recipes per period: {0}".format(dessert_sweet_tags_percentage))


absolute amount of reviews for dessert/sweet tagged recipes per period: [5800, 34385, 91723, 55845, 22201, 16935]
absolute amount of reviews per period: [28980, 166699, 455054, 294278, 118060, 69296]
percentage of reviews for dessert/sweet tagged recipes per period: [0.20013802622498275, 0.2062699836231771, 0.2015650889784509, 0.18976953764807428, 0.1880484499407081, 0.24438640036942968]


### Nutrition Stats per Period ###

In [None]:
# Slice the enriched interactions into one frame per review period
# (start <= date < next start; the last period ends at end_point).
filtered_interactions_sorted = enriched_interactions.sort_values("date")
period_bounds = list(date_ranges_start_points) + [end_point]
review_dates = filtered_interactions_sorted["date"]
array_of_divides = [
    # Only the free-text review column is blanked. Filling every column with ""
    # would turn a numeric nutrition column containing NaN into object dtype,
    # and describe() would then silently leave that column out of the summary.
    filtered_interactions_sorted.loc[(review_dates >= period_start) & (review_dates < period_end)].fillna({"review": ""})
    for period_start, period_end in zip(period_bounds[:-1], period_bounds[1:])
]


In [None]:
# Summary statistics of the nutrition columns, one describe() table per period.
nutrition_cols = ["calories", "total_fat_pdv", "sugar_pdv", "sodium_pdv", "protein_pdv", "saturated_fat_pdv", "carbohydrates_pdv"]

# Each describe() is transposed so that a row is one nutrition column, and
# reset_index() exposes that column's name in an "index" column. All periods
# are stacked with a single concat (in period order) instead of growing the
# frame inside a loop.
describes_df = pd.concat(
    [period_df[nutrition_cols].describe().transpose().reset_index() for period_df in array_of_divides]
)


In [None]:
# Map each nutrition column name (the "index" column of describes_df) to the
# rows of describes_df that summarise it.
dict_of_nutritions = dict(iter(describes_df.groupby('index')))

In [None]:
# Label the rows of the calories summary with review periods, then display it.
calories_summary = dict_of_nutritions["calories"]
calories_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
calories_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,calories,28980.0,485.998671,712.151004,0.0,181.875,326.05,530.9,25712.6
2003-2006,calories,166699.0,499.15028,887.904151,0.0,178.6,319.1,528.1,44239.8
2006-2009,calories,455054.0,449.915699,1166.509345,0.0,174.1,308.0,506.1,434360.2
2009-2012,calories,294278.0,441.172891,684.819507,0.0,174.2,307.8,504.2,45609.0
2012-2015,calories,118060.0,465.872838,1435.220558,0.0,182.1,323.4,529.625,434360.2
2015-2018,calories,69296.0,493.989284,883.007352,0.0,182.3,324.3,553.8,101614.7


In [None]:
# Label the rows of the total_fat_pdv summary with review periods, then display it.
total_fat_summary = dict_of_nutritions["total_fat_pdv"]
total_fat_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
total_fat_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,total_fat_pdv,28980.0,36.808903,65.814203,0.0,8.0,21.0,41.0,3105.0
2003-2006,total_fat_pdv,166699.0,38.017445,82.797054,0.0,9.0,21.0,42.0,4331.0
2006-2009,total_fat_pdv,455054.0,33.663042,70.041365,0.0,8.0,20.0,39.0,4331.0
2009-2012,total_fat_pdv,294278.0,33.17266,67.112845,0.0,8.0,20.0,39.0,4331.0
2012-2015,total_fat_pdv,118060.0,35.288108,67.824336,0.0,9.0,21.0,42.0,4331.0
2015-2018,total_fat_pdv,69296.0,37.197212,100.75843,0.0,9.0,21.0,43.0,17183.0


In [None]:
# Label the rows of the sugar_pdv summary with review periods, then display it.
sugar_summary = dict_of_nutritions["sugar_pdv"]
sugar_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
sugar_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,sugar_pdv,28980.0,85.111111,261.187504,0.0,9.0,24.0,70.0,11042.0
2003-2006,sugar_pdv,166699.0,88.426655,269.23008,0.0,9.0,24.0,70.0,18943.0
2006-2009,sugar_pdv,455054.0,76.80574,792.694222,0.0,9.0,24.0,67.0,362729.0
2009-2012,sugar_pdv,294278.0,73.298687,218.460724,0.0,9.0,23.0,65.0,18127.0
2012-2015,sugar_pdv,118060.0,78.195638,1081.266672,0.0,9.0,23.0,63.0,362729.0
2015-2018,sugar_pdv,69296.0,87.271098,265.886437,0.0,9.0,26.0,75.0,14495.0


In [None]:
# Label the rows of the sodium_pdv summary with review periods, then display it.
sodium_summary = dict_of_nutritions["sodium_pdv"]
sodium_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
sodium_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,sodium_pdv,28980.0,32.76646,105.283915,0.0,6.0,16.0,35.0,4774.0
2003-2006,sodium_pdv,166699.0,34.556224,139.578767,0.0,6.0,16.0,35.0,14152.0
2006-2009,sodium_pdv,455054.0,30.581716,106.026468,0.0,6.0,15.0,34.0,9443.0
2009-2012,sodium_pdv,294278.0,31.061384,109.719628,0.0,6.0,16.0,35.0,14664.0
2012-2015,sodium_pdv,118060.0,33.330213,111.829249,0.0,7.0,17.0,37.0,7094.0
2015-2018,sodium_pdv,69296.0,36.024172,199.315571,0.0,7.0,17.0,38.0,29338.0


In [None]:
# Label the rows of the protein_pdv summary with review periods, then display it.
protein_summary = dict_of_nutritions["protein_pdv"]
protein_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
protein_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,protein_pdv,28980.0,38.541511,55.203491,0.0,7.0,20.0,57.0,2638.0
2003-2006,protein_pdv,166699.0,37.790053,83.086644,0.0,7.0,19.0,55.0,6552.0
2006-2009,protein_pdv,455054.0,35.177144,76.15831,0.0,7.0,18.0,52.0,6552.0
2009-2012,protein_pdv,294278.0,34.606433,70.255138,0.0,7.0,18.0,51.0,6552.0
2012-2015,protein_pdv,118060.0,36.639666,63.268578,0.0,7.0,20.0,55.0,6552.0
2015-2018,protein_pdv,69296.0,36.951585,83.11534,0.0,7.0,18.0,53.0,6540.0


In [None]:
# Label the rows of the saturated_fat_pdv summary with review periods, then display it.
saturated_fat_summary = dict_of_nutritions["saturated_fat_pdv"]
saturated_fat_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
saturated_fat_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,saturated_fat_pdv,28980.0,47.911387,96.584454,0.0,8.0,26.0,55.0,6385.0
2003-2006,saturated_fat_pdv,166699.0,48.999058,112.841677,0.0,8.0,25.0,54.0,6385.0
2006-2009,saturated_fat_pdv,455054.0,42.907284,92.214625,0.0,7.0,23.0,51.0,6269.0
2009-2012,saturated_fat_pdv,294278.0,41.845109,87.542509,0.0,7.0,22.0,50.0,6875.0
2012-2015,saturated_fat_pdv,118060.0,44.625512,85.711955,0.0,8.0,25.0,55.0,6269.0
2015-2018,saturated_fat_pdv,69296.0,47.930847,106.917007,0.0,9.0,26.0,55.0,10395.0


In [None]:
# Label the rows of the carbohydrates_pdv summary with review periods, then display it.
carbohydrates_summary = dict_of_nutritions["carbohydrates_pdv"]
carbohydrates_summary.index = ['2000-2003','2003-2006', '2006-2009', '2009-2012', '2012-2015','2015-2018']
carbohydrates_summary

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
2000-2003,carbohydrates_pdv,28980.0,15.600621,31.010362,0.0,4.0,9.0,16.0,948.0
2003-2006,carbohydrates_pdv,166699.0,16.263211,35.395319,0.0,4.0,9.0,16.0,2565.0
2006-2009,carbohydrates_pdv,455054.0,14.598663,80.779892,0.0,4.0,9.0,16.0,36098.0
2009-2012,carbohydrates_pdv,294278.0,14.269262,26.916257,0.0,4.0,9.0,16.0,1511.0
2012-2015,carbohydrates_pdv,118060.0,14.970795,108.740871,0.0,4.0,9.0,16.0,36098.0
2015-2018,carbohydrates_pdv,69296.0,16.315487,32.258283,0.0,4.0,9.0,17.0,1554.0


In [None]:
# Period boundaries: every start point opens a period that runs up to, but
# not including, the next start point; the last period is closed by end_point.
date_ranges_start_points = ['2000-01-01','2003-03-01', '2006-05-01', '2009-07-01', '2012-09-01','2015-11-01']
end_point = '2019-01-01'


### stats per divide ####
# Split all interactions by period, then keep only the reviews whose text
# mentions "sugar" (case-insensitive). Counts and percentages per period are
# not computed in this cell.

filtered_interactions_sorted = PP_INTERACTIONS_DF.sort_values("date")
period_bounds = date_ranges_start_points + [end_point]
review_dates = filtered_interactions_sorted["date"]
array_of_divides = [
    # Missing values are blanked so the .str accessors below never meet NaN.
    filtered_interactions_sorted.loc[(review_dates >= lower) & (review_dates < upper)].fillna("")
    for lower, upper in zip(period_bounds[:-1], period_bounds[1:])
]

array_of_divides_of_sugar = [
    period_df[period_df['review'].str.lower().str.contains("sugar")]
    for period_df in array_of_divides
]


here


In [None]:
# Show the first 20 "sugar" reviews of the fifth period (list position 4).
array_of_divides_of_sugar[4]["review"].head(20).tolist()

["Usually I don't really peppers on my food.  And I was surprised with the taste of this salmon.  In fact the fresh black pepper with the brown sugar (which I used) is great with the salmon.  It was just a little bit and peppery.  Thanks Kerfuffle :)  Made for 123 hit wonders",
 "Sweet, creamy, and right up my alley! I used cream of coconut and half-and-half mixed, and a packet of Truvia for the sugar (didn't feel like grinding my extra gritty sugar to powder for this). I would definitely say that the nutmeg is crucial here; otherwise, I think the drink might be a tad too sweet without it. Rather than pouring it over shaved ice, I just tossed some ice cubes in my shaker and strained them out afterwards. I found that this two serving recipe actually makes one normal (martini glass) sized drink. I would definitely do this drink again, thanks for posting! Made for The Honeys for ZWT8",
 "Ok this is really important: do NOT make this recipe with truvia instead of sugar. There was barely en