In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Once mounted, Drive files can be opened like ordinary local files using a path such as
# /content/drive/My Drive/location_of_the_file


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk import everygrams
from collections import Counter
# Fetch the NLTK data this notebook needs.  'punkt_tab' is requested alongside
# 'punkt' because newer NLTK releases (3.9+) load the tokenizer models used by
# word_tokenize from 'punkt_tab'; older releases keep reading 'punkt'.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'averaged_perceptron_tagger'):
  nltk.download(_nltk_resource)

# English stop words, lower-cased (order is arbitrary because it comes from a set).
STOP_WORDS = [x.lower() for x in set(stopwords.words('english'))]

# Punctuation-like tokens to drop after tokenizing.
# 'br' and 'gt' are presumably residue of HTML markup in the reviews - TODO confirm.
PUNCTUATION_TO_REMOVE = [
  ",", ".", ";", "!", '"', "?", '“', '”', "‘", '’',
  "[", "]", "{", "}", ')', '(', '...', "n't", "'s", "''", "-",
  ':', '``', 'br', 'gt',
]

# Frequent tokens that carry no information for the n-gram analysis.
# 'lt', 'quot', 'br/' and '039' look like leftovers of HTML tags/entities - TODO confirm.
NO_MEANING_TO_REMOVE = [
  "lt", "quot", "next", "thanks", "thank", "also", "use", "used",
  "'ve", "add", "added", "br/", "039",
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import pandas as pd

# Root folder of the project inside the mounted Google Drive.
DRIVE_INITIAL_PATH = r"/content/drive/My Drive/Data Mining"


def adapt_path_to_drive(original_path):
  """
  Prefix a project-relative path with the Drive project folder.

  The two parts are joined with a single "/" and nothing is normalised, so
  `original_path` is expected to be relative (no leading slash).
  """
  return DRIVE_INITIAL_PATH + "/" + original_path



# Full Drive locations of the two pre-processed CSV files.
OUR_PP_RECIPES_PATH = adapt_path_to_drive(r"data/our_pp_recipes.csv")
OUR_PP_INTERACTIONS_PATH = adapt_path_to_drive(r"data/our_pp_interactions.csv")

# Read both tables into module-level DataFrames.
PP_INTERACTIONS_DF = pd.read_csv(OUR_PP_INTERACTIONS_PATH)
PP_RECIPES_DF = pd.read_csv(OUR_PP_RECIPES_PATH)



In [None]:
# Inclusive lower bounds of the consecutive periods.  Each period ends
# (exclusively) where the next one starts; the last one ends at `end_point`.
date_ranges_start_points = ['2000-01-01','2004-01-01', '2008-01-01', '2012-01-01', '2016-01-01']
end_point = '2020-01-01'

filtered_interactions_sorted = PP_INTERACTIONS_DF.sort_values("date")
# NOTE(review): the "date" column is compared against strings, which presumably
# relies on it holding ISO 'YYYY-MM-DD' text - confirm.
review_dates = filtered_interactions_sorted["date"]

# One DataFrame per period, with missing values replaced by empty strings.
array_of_divides = []
for period_start, period_end in zip(date_ranges_start_points, date_ranges_start_points[1:] + [end_point]):
  in_period = (review_dates >= period_start) & (review_dates < period_end)
  cur_df = filtered_interactions_sorted.loc[in_period].fillna("")
  array_of_divides.append(cur_df)

# Same periods, restricted to reviews whose lower-cased text contains "sugar".
array_of_divides_of_sugar = [
  period_df[period_df['review'].str.lower().str.contains("sugar")]
  for period_df in array_of_divides
]


# Concatenating the review strings

In [None]:
def get_review_divides(sugar_reviews_df):
  """
  Merge every entry of the "review" column into one long string.

  Each review is preceded by a single space, so a non-empty result starts
  with a space; when there are no reviews the result is the empty string.
  """
  return ''.join(' ' + review for review in sugar_reviews_df["review"])

# One concatenated review text per period, in the same order as array_of_divides_of_sugar.
sugar_text_lst = [get_review_divides(period_df) for period_df in array_of_divides_of_sugar]
sugar_text1, sugar_text2, sugar_text3, sugar_text4, sugar_text5 = sugar_text_lst

# Tokenizing & String cleaning

In [None]:
def tokenizer(data):
  """
  Split `data` into lower-cased word tokens and drop the unwanted ones.

  A token is dropped when it is a single character, or when its lower-cased
  form appears in STOP_WORDS, PUNCTUATION_TO_REMOVE or NO_MEANING_TO_REMOVE.
  The membership test is done on the lower-cased token because those lists
  are lower-case; testing the raw token would let capitalised words such as
  "The" or "Thanks" slip through the filter.

  Prints "done tokenizing" when finished and returns the kept tokens as a list.
  """
  # A set gives constant-time lookups; it is rebuilt on every call so that later
  # edits to the removal lists take effect without redefining this function.
  excluded = set().union(STOP_WORDS, PUNCTUATION_TO_REMOVE, NO_MEANING_TO_REMOVE)
  lowered = [token.lower() for token in word_tokenize(data) if len(token) > 1]
  tokens = [token for token in lowered if token not in excluded]
  print("done tokenizing")
  return tokens

# Tokenize each period's concatenated text; tokenizer prints one progress line per period.
sugar_tokens1, sugar_tokens2, sugar_tokens3, sugar_tokens4, sugar_tokens5 = map(tokenizer, sugar_text_lst)

done tokenizing
done tokenizing
done tokenizing
done tokenizing
done tokenizing


# Printing the n-grams that contain "sugar"

In [None]:
def sugar_val_filter(tokenized_trigrams, keyword='sugar'):
  """
  Keep only the n-grams that contain `keyword` as one of their tokens.

  Args:
    tokenized_trigrams: iterable of token tuples (n-grams of any length);
      generators are accepted and consumed once.
    keyword: token to look for; defaults to 'sugar'.

  Returns:
    A list with the matching tuples in their original order.
  """
  return [ngram for ngram in tokenized_trigrams if keyword in ngram]

In [None]:
# everygrams with min_len == max_len == 4 yields only 4-token n-grams of the first period.
trigram_tokens1 = sugar_val_filter(nltk.everygrams(sugar_tokens1, 4, 4))
print(Counter(trigram_tokens1).most_common(50))

[(('brown', 'sugar', 'instead', 'white'), 10), (('1/2', 'cup', 'brown', 'sugar'), 9), (('1/4', 'cup', 'brown', 'sugar'), 9), (('cut', 'back', 'brown', 'sugar'), 7), (('brown', 'sugar', '1/2', 'cup'), 6), (('brown', 'sugar', '1/4', 'cup'), 5), (('sweet', 'low', 'brown', 'sugar'), 4), (('sugar', 'instead', 'granulated', 'sugar'), 4), (('sugar', 'thanks', 'great', 'recipe'), 4), (('brown', 'sugar', 'instead', 'granulated'), 4), (('sugar', 'twin', 'brown', 'sugar'), 4), (('try', 'less', 'sugar', 'time'), 4), (('sugar', 'instead', 'white', 'sugar'), 4), (('next', 'time', 'cut', 'sugar'), 4), (('splenda', 'instead', 'brown', 'sugar'), 4), (('brown', 'sugar', 'it', 'little'), 3), (('sugar', 'it', 'little', 'sweet'), 3), (('time', 'cut', 'back', 'sugar'), 3), (('cut', 'back', 'sugar', 'bit'), 3), (('cut', 'brown', 'sugar', 'half'), 3), (('light', 'brown', 'sugar', 'instead'), 3), (('brown', 'sugar', 'instead', 'dark'), 3), (('powdered', 'sugar', 'orange', 'juice'), 3), (('lemon', 'juice', 'pow

## Trigrams prints

In [None]:
# For each period in turn, print the 50 most frequent trigrams that contain 'sugar'.
for period_tokens in (sugar_tokens1, sugar_tokens2, sugar_tokens3, sugar_tokens4, sugar_tokens5):
  trigram_tokens1 = sugar_val_filter(nltk.trigrams(period_tokens))
  print(Counter(trigram_tokens1).most_common(50))



[(('cut', 'back', 'sugar'), 30), (('cup', 'brown', 'sugar'), 29), (('brown', 'sugar', 'instead'), 27), (('splenda', 'instead', 'sugar'), 27), (('1/2', 'cup', 'sugar'), 26), (('1/4', 'cup', 'sugar'), 22), (('dark', 'brown', 'sugar'), 18), (('the', 'brown', 'sugar'), 17), (('sugar', '1/2', 'cup'), 15), (('light', 'brown', 'sugar'), 15), (('little', 'less', 'sugar'), 14), (('little', 'brown', 'sugar'), 14), (('dusted', 'powdered', 'sugar'), 14), (('cut', 'sugar', 'half'), 14), (('sugar', 'snap', 'peas'), 14), (('sugar', '1/4', 'cup'), 13), (('sprinkled', 'powdered', 'sugar'), 12), (('amount', 'brown', 'sugar'), 12), (('butter', 'brown', 'sugar'), 12), (('brown', 'sugar', '1/2'), 11), (('brown', 'sugar', 'white'), 11), (('substituted', 'splenda', 'sugar'), 11), (('cup', 'white', 'sugar'), 11), (('cut', 'brown', 'sugar'), 10), (('sugar', 'free', 'jello'), 10), (('sugar', 'instead', 'white'), 10), (('brown', 'sugar', 'the'), 9), (('1/2', 'cups', 'sugar'), 9), (('little', 'bit', 'sugar'), 9),

## Fourgrams Prints

In [None]:
# NLTK provides bigrams/trigrams/ngrams/everygrams but no `fourgrams` helper,
# so 4-grams are built with the generic nltk.ngrams(tokens, 4).
# For each period in turn, print the 50 most frequent 4-grams that contain 'sugar'.
for period_tokens in (sugar_tokens1, sugar_tokens2, sugar_tokens3, sugar_tokens4, sugar_tokens5):
  fourgram_tokens = sugar_val_filter(nltk.ngrams(period_tokens, 4))
  print(Counter(fourgram_tokens).most_common(50))