In [None]:
import ast

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
# Name of the pretrained sentence-transformer checkpoint used for every text embedding below.
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Load the raw recipe table from CSV and preview its first rows.
recipes_csv_path = 'data/recipes_info.csv'
recipes_frame = pd.read_csv(recipes_csv_path)
recipes_frame.head()

In [None]:
# Order the recipes by ascending id and preview the result.
# The id -> row-position mapping built further down relies on this row order.
recipes_frame = recipes_frame.sort_values(by='id', ascending=True)
recipes_frame.head()

In [None]:
# Remove the columns this notebook does not use, then preview the result.
unused_columns = ['contributor_id', 'submitted']
recipes_frame = recipes_frame.drop(columns=unused_columns)
recipes_frame.head()

In [None]:
# Embed every recipe name with the sentence-transformer and store the result as a tensor.
recipe_names = recipes_frame['name'].to_list()
name_encoding = model.encode(recipe_names)
torch.save(torch.Tensor(name_encoding), 'processed/name_encoding.pt')

In [None]:
# encode step
# The 'steps' cells appear to hold a stringified list; strip the quote and
# square-bracket characters so the encoder receives plain text.
steps_frame = recipes_frame['steps'].apply(lambda x: x.replace("'", '').replace('[', '').replace(']', ''))
steps_encoding = model.encode(steps_frame.to_list())
# Save the step embeddings themselves. This line previously wrote
# name_encoding, so steps_encoding.pt was a duplicate of name_encoding.pt.
torch.save(torch.Tensor(steps_encoding), 'processed/steps_encoding.pt')

In [None]:
# Embed the ingredient text and store the embeddings as a tensor.
# Single quotes and square brackets are deleted from each cell before encoding.
_list_punctuation = str.maketrans('', '', "'[]")
ingredients_frame = recipes_frame['ingredients'].apply(lambda cell: cell.translate(_list_punctuation))
ingredients_texts = ingredients_frame.to_list()
ingredients_encoding = model.encode(ingredients_texts)
torch.save(torch.Tensor(ingredients_encoding), 'processed/ingredients_encoding.pt')

In [None]:
# encode description
def clean_empty(data):
    """Return ``data`` as text, mapping missing values to an empty string.

    Args:
        data: A single cell value (string, number, NaN, None, ...).

    Returns:
        ``''`` when ``data`` is a missing scalar (NaN, None, pd.NA, NaT),
        otherwise ``str(data)``.
    """
    # `data is float('nan')` compares identity against a brand-new float and is
    # never true, so missing values used to come back as the text 'nan'.
    # pd.isna returns an array for list-likes, hence the scalar guard.
    if pd.api.types.is_scalar(data) and pd.isna(data):
        return ''
    return str(data)

# Convert each description to text via clean_empty, embed it, and store the embeddings as a tensor.
desc_frame = recipes_frame['description'].apply(clean_empty)
desc_texts = desc_frame.to_list()
desc_encoding = model.encode(desc_texts)
torch.save(torch.Tensor(desc_encoding), 'processed/desc_encoding.pt')

In [None]:
# encode tag
# Each cell is expected to be the text of a Python list of tag strings.
# Parse it with ast.literal_eval rather than eval: literal_eval only accepts
# literals, so a crafted CSV cell cannot execute arbitrary code.
tags_frame = recipes_frame['tags'].apply(lambda x: '|'.join(ast.literal_eval(x)))
# str.get_dummies splits on '|' (its default separator) and produces one
# indicator column per distinct tag.
tags_encoding = tags_frame.str.get_dummies().values
torch.save(torch.Tensor(tags_encoding), 'processed/tags_encoding.pt')

In [None]:
# encode nutrition
# Each cell is expected to be the text of a list of numbers. ast.literal_eval
# parses it without eval's ability to run arbitrary code from the CSV.
nutrition_frame = recipes_frame['nutrition'].apply(ast.literal_eval)
torch.save(torch.Tensor(nutrition_frame.to_list()), 'processed/nutrition_encoding.pt')

In [None]:
def clean_numeric(data):
    """Coerce a cell value to ``float``, falling back to ``0``.

    Args:
        data: A single cell value (number, numeric string, NaN, None, ...).

    Returns:
        ``float(data)`` when the conversion succeeds and the result is not
        NaN; ``0`` for NaN or for anything that cannot be converted.
    """
    try:
        value = float(data)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None, or an out-of-range integer: best-effort default.
        return 0
    # `data is float('nan')` was an identity check that never matched, so NaN
    # used to pass through float() unchanged and end up in the saved tensor.
    if np.isnan(value):
        return 0
    return value

# Clean the three numeric columns (minutes, n_steps, n_ingredients), turn each
# into a column tensor, and save them side by side as a single tensor.
minutes_frame, n_steps_frame, n_ingredients_frame = (
    recipes_frame[column].apply(clean_numeric)
    for column in ('minutes', 'n_steps', 'n_ingredients')
)

hardcode_columns = [
    torch.Tensor(frame.to_list()).unsqueeze(1)
    for frame in (minutes_frame, n_steps_frame, n_ingredients_frame)
]
torch.save(torch.cat(hardcode_columns, dim=1), 'processed/hardcode_encoding.pt')

In [None]:
# Build a lookup from each recipe id to its row position in recipes_frame.
id_list = recipes_frame['id'].to_list()
mapping_dict = {recipe_id: position for position, recipe_id in enumerate(id_list)}

In [None]:
# Load the training data, order it by user and date, and replace each raw
# recipe id with its row position from mapping_dict before saving.
train_df = pd.read_csv('data/train.csv').sort_values(by=['user_id', 'date'])
remapped_train_ids = train_df['recipe_id'].apply(lambda raw_id: mapping_dict[int(raw_id)])
train_df['recipe_id'] = remapped_train_ids
train_df.to_csv('processed/train_discretization.csv', index=False)

In [None]:
from datetime import datetime


def _to_iso_date(day_first):
    """Convert a 'DD/MM/YYYY' date string to 'YYYY-MM-DD'."""
    return datetime.strptime(day_first, '%d/%m/%Y').strftime('%Y-%m-%d')


# Load the test data, rewrite its dates as YYYY-MM-DD, and replace each raw
# recipe id with its row position from mapping_dict before saving.
test_df = pd.read_csv('data/test.csv')
test_df['date'] = test_df['date'].apply(_to_iso_date)
# Rows stay in file order here: the user/date sort applied to the training
# data is not applied to the test data.
remapped_test_ids = test_df['recipe_id'].apply(lambda raw_id: mapping_dict[int(raw_id)])
test_df['recipe_id'] = remapped_test_ids
test_df.to_csv('processed/test_discretization.csv', index=False)

In [None]:
# Number of distinct user ids in the training data.
unique_user_ids = set(train_df['user_id'].to_list())
len(unique_user_ids)

In [None]:
# The ten user ids that occur most often in the training data, as
# (user_id, count) pairs in descending order of count.
from collections import Counter
user_row_counts = Counter(train_df['user_id'].to_list())
user_row_counts.most_common(10)