In [None]:
import ast

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import pickle
import matplotlib.pyplot as plt

from utils import SEED

In [None]:
# Apply matplotlib's 'seaborn-v0_8' style sheet to all figures created below.
plt.style.use('seaborn-v0_8')

In [None]:
# Directory prefix for the CSV inputs. It is joined to file names by plain string
# concatenation below, so the trailing slash is required.
DATA_DIR = 'data/Food_com_Recipes/'

In [None]:
# Load the training split of the interactions table.
interactions_train = pd.read_csv(DATA_DIR + 'interactions_train.csv')

In [None]:
# Preview the first rows of the training interactions.
interactions_train.head()

In [None]:
# Distinct values present in the 'rating' column.
interactions_train['rating'].unique()

In [None]:
# Load the raw recipe table.
RAW_recipes = pd.read_csv(DATA_DIR + 'RAW_recipes.csv')

In [None]:
# Preview the first rows of the raw recipe table.
RAW_recipes.head()

In [None]:
# Inspect the column dtypes as parsed by read_csv.
RAW_recipes.dtypes

In [None]:
def _parse_list_literal(value):
    """Parse a stringified Python literal; return already-parsed values unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value


# 'nutrition' and 'ingredients' hold Python list literals stored as strings;
# convert them to real lists. The isinstance guard keeps this cell safe to
# re-run: a second execution would otherwise pass lists to ast.literal_eval,
# which raises ValueError.
for _list_column in ('nutrition', 'ingredients'):
    RAW_recipes[_list_column] = RAW_recipes[_list_column].apply(_parse_list_literal)

In [None]:
def get_ingredient(row):
    """Normalize ingredient names and join them into one space-separated string.

    Spaces and hyphens inside each name are replaced by underscores so that a
    multi-word ingredient becomes a single token.

    Note: ``row`` is modified in place -- after the call the list holds the
    normalized names.

    Parameters
    ----------
    row : list of str
        Ingredient names of one recipe.

    Returns
    -------
    str
        The normalized names joined by single spaces.
    """
    to_underscore = str.maketrans({' ': '_', '-': '_'})
    for position in range(len(row)):
        row[position] = row[position].translate(to_underscore)
    return ' '.join(row)

In [None]:
# Build one space-separated string of ingredient names per recipe.
# get_ingredient rewrites each list in place, so RAW_recipes['ingredients']
# holds the underscore-normalized names once this cell has run.
ingredients = RAW_recipes['ingredients'].apply(get_ingredient)

In [None]:
# Occurrence count of every distinct ingredient across all recipes
# (explode yields one row per ingredient, value_counts tallies them).
ingridients_counts = RAW_recipes['ingredients'].explode().value_counts().values

In [None]:
# First ten entries of the ingredient count table.
RAW_recipes['ingredients'].explode().value_counts().head(10).to_frame()

In [None]:
# Last ten entries of the ingredient count table.
RAW_recipes['ingredients'].explode().value_counts().tail(10).to_frame()

In [None]:
# Total number of recipes (rows); used to turn ingredient counts into frequencies.
n_recipes = RAW_recipes.shape[0]

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# Histogram of each ingredient's occurrence count divided by the number of recipes.
ax.hist(ingridients_counts / n_recipes, bins=20)

ax.set_xlabel('Frequency of Ingredient Occurrences', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')
ax.set_title('Distribution of Frequency of Ingredient Occurrences', fontweight='bold')

# Log-scaled y axis; the trailing semicolon suppresses the cell's repr output.
ax.set_yscale('log');

In [None]:
# Vocabulary size for the bag-of-words model: the number of ingredients that
# occur strictly more often than the threshold.
ingridients_counts_threshold = 10
max_features = int(np.count_nonzero(ingridients_counts > ingridients_counts_threshold))

In [None]:
# Bag-of-words over the per-recipe ingredient strings, with the vocabulary
# capped at max_features terms.
vectorizer = CountVectorizer(max_features=max_features)
bow_matrix = vectorizer.fit_transform(ingredients)

In [None]:
# Low-dimensional recipe embeddings from the bag-of-words matrix.
# random_state is fixed so that the embeddings -- and the clusters derived from
# them further down -- are reproducible across runs.
n_components = 10
svd = TruncatedSVD(n_components=n_components, random_state=SEED)
embeddings = svd.fit_transform(bow_matrix)

# Separate 2-component projection, used only for plotting.
svd = TruncatedSVD(n_components=2, random_state=SEED)
embeddings_2d_svd = svd.fit_transform(bow_matrix)

In [None]:
# Scatter plot of the 2-component SVD projection; low alpha and small markers
# keep dense regions readable.
plt.figure(figsize=(8, 8))
plt.scatter(embeddings_2d_svd[:, 0], embeddings_2d_svd[:, 1], alpha=0.1, s=2)

In [None]:
# Cluster the n_components-dimensional embeddings; `cluster` holds one label per recipe.
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(embeddings)
cluster = kmeans.labels_

In [None]:
# Side-by-side view of the 2D projection: plain on the left, colored by
# k-means cluster label on the right.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
axs[0].scatter(embeddings_2d_svd[:, 0], embeddings_2d_svd[:, 1], alpha=0.05, s=1)
axs[1].scatter(embeddings_2d_svd[:, 0], embeddings_2d_svd[:, 1], alpha=0.05, s=1, c=cluster, cmap='viridis')
axs[0].set_title('Ingredients embeddings projection', fontweight='bold', fontsize=12)
axs[1].set_title('Ingredients embeddings projection\n+ clustering', fontweight='bold', fontsize=12)

In [None]:
# Larger version of the cluster-colored 2D projection.
plt.figure(figsize=(8, 8))
plt.scatter(embeddings_2d_svd[:, 0], embeddings_2d_svd[:, 1], alpha=0.1, s=2, c=cluster, cmap='viridis')

In [None]:
# One-hot encode the cluster labels; reshape(-1, 1) gives the encoder a single
# feature column and toarray() densifies the result.
enc = OneHotEncoder(handle_unknown='ignore')
clusters_encoded = enc.fit_transform(cluster.reshape(-1, 1)).toarray()

In [None]:
# Start the item feature table from the one-hot cluster indicators.
# Column names are derived from the labels the encoder actually saw, so they
# always match the width of clusters_encoded instead of relying on a
# hard-coded cluster count.
processed_recipes = pd.DataFrame(
    data=clusters_encoded,
    columns=[f'cluster_{label}' for label in enc.categories_[0]],
)

In [None]:
# Copy the recipe id and preparation time, then unpack entries of the
# per-recipe 'nutrition' list into individual columns. The column insertion
# order is kept because later cells select columns positionally.
nutrition = RAW_recipes['nutrition']


def _nutrition_item(position):
    """Return a Series holding element `position` of every nutrition list."""
    return nutrition.apply(lambda values: values[position])


processed_recipes['recipe_ID'] = RAW_recipes['id']
processed_recipes['minutes'] = RAW_recipes['minutes']
processed_recipes['calories'] = _nutrition_item(0)
# NOTE(review): sums entries 1 and 5; presumably total fat and saturated fat --
# confirm against the dataset schema that this is not double counting.
processed_recipes['total_fat'] = _nutrition_item(1) + _nutrition_item(5)
processed_recipes['carbs'] = _nutrition_item(-1)
processed_recipes['protein'] = _nutrition_item(4)
processed_recipes['sugar'] = _nutrition_item(2)

In [None]:
# Preview the assembled (not yet transformed) recipe features.
processed_recipes.head()

In [None]:
# Cap preparation time at 10 hours (600 minutes); larger values are replaced by the cap.
max_minutes = 60 * 10
processed_recipes['minutes'] = processed_recipes['minutes'].clip(upper=max_minutes)

In [None]:
def process_feature_with_outliers(_feature, _q=0.99):
    """Cap a feature at an upper quantile and log-transform it.

    Values above the ``_q`` quantile are replaced by that quantile. If the
    capped data contains any zero, every value is shifted by +1 before taking
    the natural logarithm; otherwise the logarithm is applied directly.

    Parameters
    ----------
    _feature : array-like
        Numeric feature values.
    _q : float, default 0.99
        Quantile used as the upper cap.

    Returns
    -------
    numpy.ndarray
        The capped, log-transformed values.
    """
    upper_cap = np.quantile(_feature, _q)
    capped = np.where(_feature > upper_cap, upper_cap, _feature)
    # The +1 shift is applied only when needed to avoid log(0).
    if np.any(capped == 0):
        capped = capped + 1
    return np.log(capped)

In [None]:
# Numeric columns that get the outlier-capping + log transform and min-max scaling.
features_to_transform = ['minutes', 'calories', 'total_fat', 'carbs', 'protein', 'sugar']

In [None]:
# Histogram of every feature before (top row) and after (bottom row) the
# outlier/log transform. The grid width follows features_to_transform, so the
# cell keeps working if that list changes; squeeze=False guarantees a 2D axes
# array even for a single feature.
n_features = len(features_to_transform)
fig, axs = plt.subplots(2, n_features, figsize=(10, 5), squeeze=False)

for col_idx, feature_name in enumerate(features_to_transform):
    raw_values = processed_recipes[feature_name]
    top_ax, bottom_ax = axs[0, col_idx], axs[1, col_idx]

    top_ax.hist(raw_values)
    top_ax.set_title(feature_name, fontweight='bold', fontsize=12)
    bottom_ax.hist(process_feature_with_outliers(raw_values))

    # Row labels only on the leftmost column.
    if col_idx == 0:
        top_ax.set_ylabel('Before transformation', fontweight='bold', fontsize=12)
        bottom_ax.set_ylabel('After transformation', fontweight='bold', fontsize=12)

    for panel in (top_ax, bottom_ax):
        panel.tick_params(axis='both', labelsize=7)

plt.tight_layout();

In [None]:
# Min-max scaler; it is re-fitted for each column in the loop below, so after
# that loop it only retains the parameters of the last column.
scaler = MinMaxScaler()

In [None]:
# Replace every numeric feature by its capped, log-transformed and min-max
# scaled version. The scaler is fitted independently per column.
# Note: this overwrites the columns, so re-running the cell transforms the
# already-transformed values again.
for col in features_to_transform:
    transformed = process_feature_with_outliers(processed_recipes[col])
    processed_recipes[col] = transformed
    processed_recipes[col] = scaler.fit_transform(processed_recipes[col].to_numpy().reshape(-1, 1))

In [None]:
# Show the columns from position 5 onwards (positional selection).
processed_recipes.iloc[:, 5:]

In [None]:
# Load the users table.
PP_users = pd.read_csv(DATA_DIR + 'PP_users.csv')

In [None]:
# Load all three interaction splits. interactions_train was already read near
# the top of the notebook; it is re-read here, which keeps this cell
# self-contained.
interactions_train = pd.read_csv(DATA_DIR + 'interactions_train.csv')
interactions_validation = pd.read_csv(DATA_DIR + 'interactions_validation.csv')
interaction_test = pd.read_csv(DATA_DIR + 'interactions_test.csv')

In [None]:
# Stack the three splits into one table; the original row indices are kept,
# so index labels may repeat.
interactions = pd.concat((interactions_train, interactions_validation, interaction_test))

In [None]:
# Number of interactions per user, sorted from most to least active.
interactions_per_user = interactions.groupby('user_id')['recipe_id'].count()
number_of_interactions = interactions_per_user.sort_values(ascending=False).values

In [None]:
# Histogram of per-user interaction counts on a log-scaled y axis.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(number_of_interactions)
ax.set_yscale('log')

ax.set_xlabel('Number of Interactions', fontweight='bold')
ax.set_ylabel('Number of Occurrences', fontweight='bold')
ax.set_title('Distribution of the Number of Interactions', fontweight='bold');

In [None]:
# Keep only positive interactions (rating >= 4) and draw a reproducible random
# subsample. The sample size is capped at the number of available rows so the
# cell does not raise when fewer positive interactions exist.
N_SAMPLED_INTERACTIONS = 200000
positive_interactions = interactions[interactions['rating'] >= 4]
interactions = positive_interactions.sample(
    n=min(N_SAMPLED_INTERACTIONS, len(positive_interactions)),
    random_state=SEED,
)

In [None]:
# Preview the sampled positive interactions.
interactions.head()

In [None]:
# Ids present on both sides: users that appear in the interactions and in
# PP_users, and recipes that appear in the interactions and in the feature
# table. np.intersect1d de-duplicates its inputs and returns sorted unique values.
common_user_ids = np.intersect1d(interactions['u'].values, PP_users['u'].values)

common_items_ids = np.intersect1d(
    interactions['recipe_id'].values,
    processed_recipes['recipe_ID'].values,
)

In [None]:
# Keep only interactions whose user AND recipe are both known.
interactions = interactions[
    interactions['u'].isin(common_user_ids)
    & interactions['recipe_id'].isin(common_items_ids)
]

# Restrict recipes and users to ids that still occur in the filtered
# interactions. Filtering by the pre-computed common_* arrays is not enough:
# a recipe (or user) can be "common" yet lose all of its interactions to the
# joint filter above, which would leave it without an entry in the id mappings
# built from `interactions`. `.copy()` makes the results independent frames so
# that adding columns later does not trigger SettingWithCopyWarning.
processed_recipes = processed_recipes[
    processed_recipes['recipe_ID'].isin(interactions['recipe_id'])
].copy()

PP_users = PP_users[
    PP_users['u'].isin(interactions['u'])
].copy()

In [None]:
# Map original user / recipe ids to contiguous indices 0..N-1, numbered in
# order of first appearance in `interactions`.
unique_user_ids = interactions['u'].unique()
unique_recipe_ids = interactions['recipe_id'].unique()

user_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
item_mapping = dict(zip(unique_recipe_ids, range(len(unique_recipe_ids))))

In [None]:
# Translate every interaction into (user index, item index) and stack the two
# rows into a 2 x n_interactions edge index tensor.
user_ids = torch.LongTensor(interactions['u'].map(user_mapping).tolist())
item_ids = torch.LongTensor(interactions['recipe_id'].map(item_mapping).tolist())
edge_index = torch.stack((user_ids, item_ids), dim=0)

In [None]:
# Split the edge positions (column indices of edge_index) 75% / 25% into
# training and validation sets, seeded for reproducibility.
train_index, val_index = train_test_split(range(len(interactions)), test_size=0.25, random_state=SEED)

# Select the corresponding edge columns for each split.
train_edge_index = edge_index[:, train_index]
val_edge_index = edge_index[:, val_index]

In [None]:
# Attach the contiguous item index to every recipe and order the rows by it,
# so that row i of the feature table describes item i of the edge index.
# Recipes without an entry in item_mapping (no remaining interaction) are
# dropped first instead of raising KeyError. Building a new frame with
# assign/sort_values avoids writing into a filtered slice and in-place mutation.
processed_recipes = (
    processed_recipes[processed_recipes['recipe_ID'].isin(list(item_mapping))]
    .assign(ID=lambda frame: frame['recipe_ID'].map(item_mapping))
    .sort_values(by='ID')
)

In [None]:
# Item feature matrix: every column except the two id columns, as a float tensor.
item_feature_frame = processed_recipes.drop(columns=['recipe_ID', 'ID'])
items_features = torch.Tensor(item_feature_frame.to_numpy())

In [None]:
# Users carry no real features: use a single all-zero placeholder column.
# The row count comes from user_mapping -- the index space actually used in
# edge_index -- rather than from PP_users, whose row count is not guaranteed
# to equal the number of mapped users.
users_features = torch.zeros((len(user_mapping), 1))

In [None]:
# Bundle the feature tensors and the train/validation edge indices under the
# keys that get serialized below.
food_recipes_dataset = {
    'users_features': users_features,
    'items_features': items_features,
    'train_edge_index': train_edge_index,
    'val_edge_index': val_edge_index
}

In [None]:
# Serialize the dataset dictionary with pickle. The 'datasets/' directory must
# already exist; open() does not create it.
dataset_path = 'datasets/food_recipes_dataset.bin'
with open(dataset_path, 'wb') as dataset_file:
    pickle.dump(food_recipes_dataset, dataset_file)