# Preprocessing (PP) of Raw Recipes

Preprocess the raw recipes fields.

So far we have:

1. Converted the tags column from a single string per row to a list of strings
2. Created new tags columns for specific tag categories
3. Split the nutrition column into separate columns, one per nutrition parameter
4. Formatted the submitted column as a date type
5. Renamed the id column to recipe_id

It's possible that after a reality check we'll need to adapt some things.

### Init ###

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Once mounted, Drive files can be opened like regular files, using paths such as
# /content/drive/My Drive/location_of_the_file


Mounted at /content/drive


In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt

# Root folder of the project inside the mounted Google Drive.
DRIVE_INITIAL_PATH = r"/content/drive/My Drive/Data Mining"

# Data file locations, relative to the project root.
# NOTE: "INTERCATIONS" is a misspelling of "INTERACTIONS"; the name is kept
# because it is referenced under this spelling elsewhere in the file.
RAW_RECIPES_PATH = r"data/RAW_recipes.csv"
RAW_INTERCATIONS_PATH = r"data/RAW_interactions.csv"
OUR_PP_RECIPES_PATH = r"data/our_pp_recipes.csv"
OUR_PP_INTERACTIONS_PATH = r"data/our_pp_interactions.csv"

# Format string handed to pd.to_datetime, and the columns it is applied to.
DATE_FORMAT = "%Y-%m-%d"
RAW_RECIPES_DATE_COLUMNS = ["submitted"]
RAW_INTERCATIONS_DATE_COLUMNS = ["date"]

# Column renames (old name -> new name) passed to DataFrame.rename.
RAW_RECIPES_COL_NAMES_TRANSLATOR = {"id": "recipe_id"}
RAW_INTERACTIONS_COL_NAMES_TRANSLATOR = {}


# Name of the column holding each recipe's tags.
TAGS_COLUMN_NAME = "tags"

# New category column name -> CSV file whose first column lists the tags
# that belong to that category.
TAGS_CATEGORIES_ADDITION = {"cuisine_tags": r"data/tags_categories/cuisine.csv",
                            "fish_tags": r"data/tags_categories/fish.csv",
                            "seafood_tags": r"data/tags_categories/seafood.csv",
                            "fruit_tags": r"data/tags_categories/fruit.csv",
                            "cheese_eggs_tags": r"data/tags_categories/cheese eggs.csv",
                            "grains_tags": r"data/tags_categories/grains.csv",
                            "lentils_tags": r"data/tags_categories/lentils.csv",
                            "meat_tags": r"data/tags_categories/meat.csv",
                            "vegetable_tags": r"data/tags_categories/vegetable.csv"}

def adapt_path_to_drive(original_path):
  """Return original_path prefixed with DRIVE_INITIAL_PATH and a "/" separator."""
  path_parts = (DRIVE_INITIAL_PATH, original_path)
  return "/".join(path_parts)

# For Drive use: re-root every relative data path under the Drive project folder.
(
    RAW_RECIPES_PATH,
    RAW_INTERCATIONS_PATH,
    OUR_PP_RECIPES_PATH,
    OUR_PP_INTERACTIONS_PATH,
) = [
    adapt_path_to_drive(relative_path)
    for relative_path in (
        RAW_RECIPES_PATH,
        RAW_INTERCATIONS_PATH,
        OUR_PP_RECIPES_PATH,
        OUR_PP_INTERACTIONS_PATH,
    )
]

# Same treatment for each tags-category CSV path.
TAGS_CATEGORIES_ADDITION = {
    tags_col_name: adapt_path_to_drive(relative_csv_path)
    for tags_col_name, relative_csv_path in TAGS_CATEGORIES_ADDITION.items()
}



### Tags PP Helper functions ###

In [None]:
def convert_tags_field_to_list(df):
    """Replace the stringified tags column with per-row lists of unique tags.

    For each cell, the first two and last two characters are sliced off and
    the remainder is split on "', '" (i.e. a cell shaped like
    "['tag-a', 'tag-b']" becomes its individual tags). Duplicates are removed
    through a set, so the order of tags within a row is arbitrary.

    The column is overwritten on the given DataFrame, which is also returned.
    """
    parsed_rows = []
    for raw_tags in df[TAGS_COLUMN_NAME].tolist():
        unique_tags = set(raw_tags[2:-2].split("', '"))
        parsed_rows.append(list(unique_tags))
    df[TAGS_COLUMN_NAME] = parsed_rows
    return df

def extract_unique_tags_to_csv(csv_path):
    """Write every distinct lower-cased tag of the raw recipes file to a CSV.

    Reads the tags column of RAW_RECIPES_PATH, strips the first two and last
    two characters of each cell, lower-cases it and splits it on "', '".
    The union of all tags is written to csv_path as a single "Tag" column,
    sorted alphabetically so repeated runs produce the same file.

    Args:
        csv_path: destination path of the generated CSV.
    """
    # Selecting the column yields a Series; it must be used directly.
    # Indexing it by the column name a second time raised a KeyError.
    tags_series = pd.read_csv(RAW_RECIPES_PATH, encoding="utf-8")[TAGS_COLUMN_NAME]
    raw_tags_sets = [set(tags_str[2:-2].lower().split("', '")) for tags_str in tags_series.tolist()]
    # set().union(*sets) also works when there are no rows at all,
    # unlike set.union(*sets), which needs at least one argument.
    unique_tags = sorted(set().union(*raw_tags_sets))
    df_tags = pd.DataFrame(unique_tags, columns=["Tag"])
    df_tags.to_csv(csv_path, encoding="utf_8", index=False)

def add_tags_category_column(df, tags_csv_path, new_col_name):
    """Add a column holding the row's tags that belong to one category.

    The category's tags are taken from the first column of the CSV at
    tags_csv_path. Two columns are written on df:
      * new_col_name          - list of the row's tags found in the category
      * new_col_name + "_num" - length of that list

    The same DataFrame is returned.
    """
    category_df = pd.read_csv(tags_csv_path, encoding="utf-8")
    category_tags = set(category_df.iloc[:, 0].tolist())

    def keep_category_tags(row_tags):
        return list(set(row_tags).intersection(category_tags))

    df[new_col_name] = df[TAGS_COLUMN_NAME].apply(keep_category_tags)
    count_col_name = new_col_name + "_num"
    df[count_col_name] = df[new_col_name].str.len()
    return df

def enrich_tags_columns(df):
    """Add one category column (plus its "_num" count) per TAGS_CATEGORIES_ADDITION entry."""
    for new_col_name in TAGS_CATEGORIES_ADDITION:
        category_csv_path = TAGS_CATEGORIES_ADDITION[new_col_name]
        df = add_tags_category_column(df, category_csv_path, new_col_name)
    return df

def add_is_vegan_tag(df):
    """Add an "is_vegan_tag" column: whether "vegan" is among the row's tags."""

    def has_vegan_tag(row_tags):
        return "vegan" in row_tags

    df["is_vegan_tag"] = df[TAGS_COLUMN_NAME].apply(has_vegan_tag)
    return df

def add_is_vegetarian_tag(df):
    """Add an "is_vegetarian_tags" column.

    A row is flagged when its tags contain "vegetarian" or "veggie-burgers".
    """
    vegetarian_markers = ("vegetarian", "veggie-burgers")

    def has_vegetarian_tag(row_tags):
        return any(marker in row_tags for marker in vegetarian_markers)

    df["is_vegetarian_tags"] = df[TAGS_COLUMN_NAME].apply(has_vegetarian_tag)
    return df

def tags_preprocessing(df):
    """Run all tag preprocessing steps, in order, and return the result.

    Steps: parse the tags column into lists, add the category columns,
    then add the vegan and vegetarian flag columns.
    """
    steps = (
        convert_tags_field_to_list,
        enrich_tags_columns,
        add_is_vegan_tag,
        add_is_vegetarian_tag,
    )
    for step in steps:
        df = step(df)
    return df

### Nutrition PP Helper Functions ###

In [None]:
def float_str_list_to_list(df, old_col_name, new_col_name):
    """Parse a column of stringified float lists into real lists of floats.

    Each cell of old_col_name is expected to look like "[1.0, 2.5, 3]".
    The parsed lists are written to new_col_name on the same DataFrame,
    which is also returned.

    An empty list ("[]") yields [] instead of failing on float(""), and
    items may be separated by "," with or without surrounding spaces.
    """

    def _parse(str_list):
        # Drop the surrounding brackets (and any stray outer whitespace).
        inner = str_list.strip()[1:-1].strip()
        if not inner:
            return []
        # float() ignores whitespace around each item.
        return [float(item) for item in inner.split(",")]

    df[new_col_name] = df[old_col_name].apply(_parse)
    return df

def pp_nutrition(df):
    """Split the stringified "nutrition" column into one column per value.

    The parsed list is stored in "nutrition_list"; its items at positions
    0..6 are then copied into the columns named in nutrition_columns,
    in that order. The same DataFrame is returned.
    """
    df = float_str_list_to_list(df, "nutrition", "nutrition_list")
    nutrition_columns = (
        "calories",
        "total_fat_pdv",
        "sugar_pdv",
        "sodium_pdv",
        "protein_pdv",
        "saturated_fat_pdv",
        "carbohydrates_pdv",
    )
    for position, column_name in enumerate(nutrition_columns):
        # Bind the position as a default so each lambda keeps its own index.
        df[column_name] = df["nutrition_list"].apply(lambda values, i=position: values[i])
    return df


### Date Formatter ###

In [None]:
def format_date_column(df, column_to_format, date_format=DATE_FORMAT):
  """Convert one column of df to datetimes with pd.to_datetime.

  The column is overwritten on the given DataFrame, which is also returned.
  date_format is passed through as the `format` argument.
  """
  parsed_dates = pd.to_datetime(df[column_to_format], format=date_format)
  df[column_to_format] = parsed_dates
  return df

In [None]:
# Recipe edition

def format_raw_recipes_date_cols(raw_recipes_df):
  """Convert every column listed in RAW_RECIPES_DATE_COLUMNS to datetimes."""
  formatted_df = raw_recipes_df
  for date_col in RAW_RECIPES_DATE_COLUMNS:
    formatted_df = format_date_column(formatted_df, date_col)
  return formatted_df
  
# Interactions edition

def format_raw_interactions_date_cols(raw_interactions_df):
  """Convert every column listed in RAW_INTERCATIONS_DATE_COLUMNS to datetimes."""
  formatted_df = raw_interactions_df
  for date_col in RAW_INTERCATIONS_DATE_COLUMNS:
    formatted_df = format_date_column(formatted_df, date_col)
  return formatted_df

### Columns Names Translator ###

In [None]:
def translate_df_columns_names(df, translation_dict):
  """Return df with its columns renamed according to translation_dict (old -> new)."""
  renamed_df = df.rename(columns=translation_dict)
  return renamed_df

def tranlate_raw_recipes_cols_names(raw_recipes_df):
  """Rename the raw recipes columns using RAW_RECIPES_COL_NAMES_TRANSLATOR."""
  renamed_df = translate_df_columns_names(raw_recipes_df, RAW_RECIPES_COL_NAMES_TRANSLATOR)
  return renamed_df

## Raw Recipes PP full function ##

In [None]:
# Load the raw recipes and run every preprocessing step in order:
# tags -> nutrition -> date columns -> column renames.
raw_recipes_df = (
    pd.read_csv(RAW_RECIPES_PATH, encoding="utf-8")
    .pipe(tags_preprocessing)
    .pipe(pp_nutrition)
    .pipe(format_raw_recipes_date_cols)
    .pipe(tranlate_raw_recipes_cols_names)
)


# now raw_recipes_df is the preprocessed version




In [None]:
# save our_pp_recipes - currently inactive
#raw_recipes_df.to_csv(OUR_PP_RECIPES_PATH, encoding="utf-8", index=False)


# PP of Raw Interactions
1. Format the date column as a date type

## Raw Interactions PP full function

In [None]:
   # Load the raw interactions and convert their date columns.
   raw_interactions_df = (
       pd.read_csv(RAW_INTERCATIONS_PATH, encoding="utf-8")
       .pipe(format_raw_interactions_date_cols)
   )
   # raw_interactions_df now holds the preprocessed version

In [None]:
# raw_interactions_df.to_csv(OUR_PP_INTERACTIONS_PATH, encoding="utf-8", index=False)