# Prerequisites

In [None]:
# Mount Google Drive at /content/gdrive so the data folder used below is
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [72]:
import os
from os.path import join
import json
import numpy as np
import re
import random
from typing import Dict, List, Tuple, Union

In [3]:
# main path to your data folder
# NOTE(review): this path is relative, while Drive is mounted at the absolute
# '/content/gdrive' — presumably the notebook runs with /content as its working
# directory; confirm before running it elsewhere.
main_path = 'gdrive/MyDrive/bot/data'

# path to recipes saved as json file (input of clear_recipes)
recipes_path = join(main_path, 'recipes/recipes.json')

# path to vectors representing recipes ingredients; written by
# make_ingredients_vector_file as one JSON object ({url: vector}) per line
recipes_vectors_path = join(main_path, 'recipes/recipes_vectors.json')

# path to recipes cleared out of unnecessary ingredients (output of clear_recipes)
recipes_cleared_path = join(main_path, 'recipes/recipes_cleared.json')

## Fasttext

In [None]:
!pip install fasttext

In [None]:
# Load the pre-trained fastText model; `ft.get_word_vector` is what
# get_sentence_vector uses to turn words into vectors.
# NOTE(review): the file name suggests Polish, 100-dimensional vectors — confirm.
import fasttext
ft_path = os.path.join(main_path, 'fasttext/cc.pl.100.bin')
ft = fasttext.load_model(ft_path)

## Morfeusz

In [None]:
!pip install morfeusz2

In [7]:
# Morfeusz analyser instance shared by the notebook; `morf.analyse` is called
# by lemmatize.
import morfeusz2
morf = morfeusz2.Morfeusz()

## PrettyPrinter

In [63]:
import pprint

In [62]:
# Pretty-printer (4-space indent) kept around for inspecting nested results;
# it is not referenced by the functions visible in this notebook.
pp = pprint.PrettyPrinter(indent=4)

# Source

## Clear, lemmatize and vectorize ingredients

In [28]:
def find_all_subjects(
    syntactic_analysis: List[
        Tuple[int, int, Tuple[str, str, str, List[str], List[str]]]
    ]
) -> List[Tuple[int, int, Tuple[str, str, str, List[str], List[str]]]]:
    """Keep only the noun interpretations of a Morfeusz analysis.

    Every entry of ``syntactic_analysis`` carries its interpretation at
    index 2, and the tag of that interpretation at index 2 again. The tag
    is a colon-separated string whose first field is the grammatical class;
    entries whose class is ``subst`` (noun) are kept.

    Returns the matching entries unchanged and in their original order,
    i.e. a list of analysis tuples, not of strings.
    """
    return [
        interpretation
        for interpretation in syntactic_analysis
        # Only the first field of the tag matters, so split once.
        if interpretation[2][2].split(":", 1)[0] == "subst"
    ]


In [35]:
def lemmatize(word: str) -> Union[str, int]:
    """Lemmatize the word and clear the lemma of unnecessary signs.

    The word is analysed with Morfeusz and only the noun interpretations
    (see ``find_all_subjects``) are considered. Lemmas containing a colon
    are discarded, and of several lemmas sharing the same first three
    characters only the first one is kept.

    Returns the surviving lemmas joined with single spaces (possibly an
    empty string), or the sentinel ``-1`` when the analysis contains no
    noun interpretation at all. ``clear_recipes`` tests for that sentinel,
    so it is kept as is.
    """
    noun_interpretations = find_all_subjects(morf.analyse(word))
    if not noun_interpretations:
        return -1

    seen_prefixes = set()
    kept_lemmas = []
    for interpretation in noun_interpretations:
        lemma = interpretation[2][1]
        # Lemmas with a colon carry an extra qualifier after it; skip them.
        if ":" in lemma:
            continue
        # The 3-character prefix is a cheap de-duplication key for
        # near-identical lemmas returned for the same word.
        prefix = lemma[:3]
        if prefix in seen_prefixes:
            continue
        seen_prefixes.add(prefix)
        kept_lemmas.append(lemma)
    return " ".join(kept_lemmas)


In [47]:
def remove_quantity(ingredient: str) -> str:
    """Cut the ingredient text off at its first purely numeric word.

    The text is split on whitespace; everything from the first word made up
    only of digits onwards is dropped, and the remaining words are joined
    with single spaces. Without such a word the whole (whitespace-normalized)
    text is returned.
    F.e.: 'cebula 1 sztuka' -> 'cebula'
    """
    tokens = ingredient.split()
    cut_at = next(
        (position for position, token in enumerate(tokens) if token.isdigit()),
        len(tokens),
    )
    return " ".join(tokens[:cut_at])


In [None]:
# Keywords used by remove_some_ingredients: an ingredient is dropped when it
# contains any of these as a (case-sensitive) substring.
keywords_to_remove = ["Knorr", 'fix', "Fix", 'olej', 'oliwa', 'sól', 'woda', 'pieprz',
'mąka', 'masło', 'cukier', 'ryż', 'cebula', 'null']

In [48]:
def remove_some_ingredients(ingredients: List[str]) -> List[str]:
    """Drop insignificant ingredients so that only the key ones remain.

    An ingredient is dropped as soon as any entry of ``keywords_to_remove``
    occurs in it as a substring; the others are returned in their original
    order.
    """
    kept = []
    for candidate in ingredients:
        for keyword in keywords_to_remove:
            if keyword in candidate:
                break
        else:
            # No keyword matched: this is a key ingredient.
            kept.append(candidate)
    return kept


In [49]:
def remove_stopwords(ingredient: str) -> str:
    """Strip stopwords from a single ingredient description.

    The text is split on single spaces, every part equal to a stopword is
    discarded and the rest is joined back with single spaces.
    F.e.: for ingredient: 'sól null' -> sól
    """
    stop_words = ['null']
    kept_parts = []
    for part in ingredient.split(' '):
        if part in stop_words:
            continue
        kept_parts.append(part)
    return ' '.join(kept_parts)

In [52]:
def clear_recipes() -> None:
    """Build the searchable version of the recipes file.

    Reads the recipes from ``recipes_path`` and, for every recipe that has
    "ingredients", "instructions" and "title", stores under its url:
    the title, the instructions, the ingredients with stopwords removed and
    the lemmatized key ingredients (quantities stripped, insignificant
    ingredients and empty strings dropped). A recipe is left out entirely
    when any of its key ingredients cannot be lemmatized. The result is
    written as JSON to ``recipes_cleared_path``.
    """
    with open(recipes_path, "r", encoding="utf-8") as source_file:
        raw_recipes = json.load(source_file)

    searchable_recipes = {}
    for url, recipe in raw_recipes.items():
        try:
            ingredients = recipe["ingredients"]
            instructions = recipe["instructions"]
            title = recipe["title"]
        except KeyError:
            # Incomplete recipes are not searchable.
            continue

        display_ingredients = [remove_stopwords(item) for item in ingredients]
        key_ingredients = remove_some_ingredients(
            [remove_quantity(item) for item in ingredients]
        )
        lemmas = [lemmatize(item) for item in key_ingredients if item]
        # lemmatize reports a failure with -1; one failure rejects the recipe.
        if -1 in lemmas:
            continue

        searchable_recipes[url] = {
            "title": title,
            "instructions": instructions,
            "ingredients": display_ingredients,
            "ingredients_cleared": lemmas,
        }

    with open(recipes_cleared_path, "w", encoding="utf-8") as cleared_file:
        json.dump(searchable_recipes, cleared_file, ensure_ascii=False)


In [None]:
# Cache of word -> fastText vector, filled lazily by get_sentence_vector so
# that each distinct word is looked up in the model only once.
word_vectors_dict = {}

In [93]:
def get_sentence_vector(sentence: str) -> np.ndarray:
    """Return the mean fastText vector of the words in the sentence.

    The sentence is split on whitespace and every word is stripped of
    special signs (non-word characters and underscores). Words that are
    empty after stripping are ignored, so they neither reach the model nor
    inflate the divisor. Vectors are cached in ``word_vectors_dict`` to
    speed up later calls.

    The vector of the sentence is the sum of the vectors of its words
    divided by the number of words.

    If the sentence contains no usable word, a scalar NaN is returned
    instead of a vector; ``make_ingredients_vector_file`` skips results
    that cannot be iterated, so such sentences produce no entry there.
    """
    stripped = (re.sub(r"[\W_]+", "", word) for word in sentence.split())
    words = [word for word in stripped if word]

    word_vectors = []
    for word in words:
        if word not in word_vectors_dict:
            word_vectors_dict[word] = ft.get_word_vector(word)
        word_vectors.append(np.asarray(word_vectors_dict[word]))

    if not word_vectors:
        # Explicit form of the 0 / 0 the plain formula would compute.
        return np.float64("nan")
    return np.add.reduce(word_vectors) / len(word_vectors)


In [17]:
def find_cosine_similarity(
    vector_A: Union[np.ndarray, List[float]],
    vector_B: Union[np.ndarray, List[float]],
) -> float:
    """Calculate cosine similarity for two vectors.

    Both arguments may be NumPy arrays or plain sequences of numbers of the
    same length. The result is the dot product divided by the product of
    the Euclidean norms.

    When either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of NaN, so that results stay comparable
    and sortable.
    """
    a = np.asarray(vector_A, dtype=float)
    b = np.asarray(vector_B, dtype=float)
    norms_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norms_product == 0:
        return 0.0
    return float(np.dot(a, b) / norms_product)

In [18]:
def make_ingredients_vector_file() -> None:
    """Create the ingredients vectors file from the cleared recipes.

    Reads ``recipes_cleared_path`` and, for every recipe that has an
    "ingredients_cleared" list, writes one line to ``recipes_vectors_path``:
    a JSON object mapping the recipe url to its ingredients vector, with
    every component rounded to 4 decimal places.

    Recipes without "ingredients_cleared", and recipes for which
    ``get_sentence_vector`` does not return a one-dimensional vector, are
    skipped. Errors while serializing or writing are no longer swallowed.
    """
    # Parse the input completely before opening the output, so a broken
    # input file cannot leave a truncated vectors file behind.
    with open(recipes_cleared_path, "r", encoding="utf-8") as recipe_file:
        recipes = json.load(recipe_file)

    with open(recipes_vectors_path, "w", encoding="utf-8") as vectors_file:
        for url, recipe in recipes.items():
            try:
                ingredients = recipe["ingredients_cleared"]
            except KeyError:
                continue
            ingredients_vector = get_sentence_vector(" ".join(ingredients))
            try:
                rounded_vector = [
                    round(num, 4) for num in ingredients_vector.tolist()
                ]
            except (AttributeError, TypeError):
                # Not a flat vector (e.g. a scalar result for a recipe
                # without usable ingredients): nothing to store.
                continue
            to_write = json.dumps({url: rounded_vector}, ensure_ascii=False)
            vectors_file.write(f"{to_write}\n")


## Find recipes

In [56]:
def get_urls_of_recipes(ingredients: str) -> List[str]:
    """Get url addresses of recipes, best match first.

    ``ingredients`` is a comma-separated list of ingredient names. Each
    name is lemmatized; names that are blank or cannot be lemmatized
    (``lemmatize`` returns ``-1`` for those) are ignored instead of
    aborting the whole query. The remaining lemmas are turned into one
    vector, which is compared by cosine similarity with every vector
    stored in ``recipes_vectors_path`` (one JSON object per line).

    Returns all recipe urls sorted by descending similarity, or an empty
    list when no ingredient could be used.
    """
    lemmas = []
    for name in ingredients.split(","):
        name = name.strip()
        if not name:
            # e.g. a trailing comma
            continue
        lemma = lemmatize(name)
        if isinstance(lemma, str) and lemma:
            lemmas.append(lemma)
    if not lemmas:
        return []

    vector_in = get_sentence_vector(" ".join(lemmas))
    similarity_by_url = {}
    with open(recipes_vectors_path, "r", encoding="utf8") as vectors:
        for line in vectors:
            if not line.strip():
                continue
            # Every line holds exactly one {url: vector} pair.
            ((url_out, vector_out),) = json.loads(line).items()
            similarity_by_url[url_out] = find_cosine_similarity(
                vector_in, vector_out
            )
    return sorted(similarity_by_url, key=similarity_by_url.get, reverse=True)


In [88]:
def find_recipes(
    ingredients: str, how_many: int = 1
) -> List[Dict[str, Union[str, List[str]]]]:
    """Find recipes in a file that match given ingredients.

    The urls returned by ``get_urls_of_recipes`` (best match first) are
    looked up in the cleared recipes file; urls missing from that file are
    skipped. The first ``how_many`` recipes found are returned.

    The file is read as UTF-8, the encoding ``clear_recipes`` writes it in.
    """
    urls = get_urls_of_recipes(ingredients)
    with open(recipes_cleared_path, "r", encoding="utf-8") as recipes_file:
        recipes = json.load(recipes_file)
    matching_recipes = [recipes[url] for url in urls if url in recipes]
    return matching_recipes[:how_many]


In [92]:
def find_best_recipe(ingredients: str) -> str:
    """Find the 3 best recipes and then choose one of them at random.

    ``ingredients`` is a comma-separated list of ingredient names, passed
    on unchanged to ``find_recipes``.

    Returns the chosen recipe formatted as text: the title, the ingredients
    joined with ", " and the instructions.

    Raises IndexError when no recipe matches.
    """
    candidates = find_recipes(ingredients, 3)
    if not candidates:
        # Same exception type random.choice would raise, with a clearer message.
        raise IndexError("no recipe matches the given ingredients")
    chosen = random.choice(candidates)
    return (
        f"{chosen['title']}.\n"
        f"Składniki:\n{', '.join(chosen['ingredients'])}.\n"
        f"Instrukcje:\n{chosen['instructions']}"
    )


In [None]:
# Example query: the ingredients are passed as one comma-separated string.
find_best_recipe("szynka, ser, ogórek, majonez, makaron")

In [None]:
# Second example query, with only two ingredients.
find_best_recipe("małże, pomidory")