Me gustaría que el usuario me dé:
- Pairings: una o más combinaciones de alimentos con los que quiere maridar el vino. (OK)
- Body, tannis, sweetness, acidity: de 0 a 1 (OK)
- Uvas: una selección de uvas que yo determine + otras (todas las que no estén incluidas dentro de las que le doy como opción individual). (OK)
- Price: el rango de precios en el cual buscar vinos. (OK)

- Gustó la recomendación: debería haber una función que te diga si al usuario le gustó o no la recomendación en base a diversos parámetros (pensar en el modelo por puntaje y filtro que hice).

1. Pasar a Clase
2. Crear datos sintéticos

In [262]:
import random
import sys
import os

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

sys.path.append(os.path.abspath(os.path.join('..', '..', 'src', 'utils')))
import utils as ut

In [1053]:
# randint: https://www.w3schools.com/python/ref_random_randint.asp
# Pandas .quantile: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.quantile.html
# np.random.normal: https://numpy.org/doc/2.1/reference/random/generated/numpy.random.normal.html
# np.clip: https://numpy.org/doc/2.1/reference/generated/numpy.clip.html 


# Pairing selection
def select_random_meal(meal_df):
    """Return one row of ``meal_df`` chosen uniformly at random.

    Args:
        meal_df: DataFrame with one meal per row. Any index is accepted
            (filtered, shuffled or non-numeric indexes included).

    Returns:
        The selected row as a pandas Series.

    Raises:
        ValueError: If ``meal_df`` has no rows (``np.random.randint`` rejects
            an empty ``[0, 0)`` interval).
    """
    i = np.random.randint(0, len(meal_df))
    # ``i`` is a position, not a label: ``.loc`` only worked by accident when
    # the index happened to be the default 0..n-1 range.
    return meal_df.iloc[i]

# Pick the category label that corresponds to a given value
def get_category(value, quantiles=[.25, .5, .75], categories=["leve", "moderado", "marcado", "intenso"]):
    """Map ``value`` to a label using ``quantiles`` as upper cut-offs.

    The i-th label is returned for the first cut-off (in the given order)
    that is greater than or equal to ``value``; when ``value`` exceeds every
    cut-off the last label is returned.

    Args:
        value: Number to classify.
        quantiles: Cut-off values, checked in order.
        categories: Labels; must contain exactly one more item than
            ``quantiles``.

    Returns:
        The matching element of ``categories``.

    Raises:
        IndexError: If ``categories`` is not one longer than ``quantiles``.
    """
    if len(categories) != len(quantiles) + 1:
        raise IndexError("Length of quantiles and categories does not match!")

    # zip stops at the shorter sequence, so the last label is only reachable
    # through the fallback below.
    return next(
        (label for label, cutoff in zip(categories, quantiles) if value <= cutoff),
        categories[-1],
    )

# Build taste profiles per pairing (so simulated choices are consistent with
# the pairing the user picked)
def build_pairing_profile(df, pairing_cols, quantiles=[.25, .5, .75]):
    """Summarise the taste columns of the wines flagged for each pairing.

    For every column name in ``pairing_cols`` the rows where that column
    equals 1 are selected, and for each of the taste columns ``body``,
    ``tannis``, ``sweetness`` and ``acidity`` the requested quantiles and the
    standard deviation are computed.

    Args:
        df: DataFrame holding the pairing flag columns and the four taste
            columns.
        pairing_cols: Iterable of pairing column names present in ``df``.
        quantiles: Quantile levels passed to ``Series.quantile``.

    Returns:
        A ``(profiles, deviations)`` tuple of nested dicts keyed by pairing
        and then by taste: ``profiles[pairing][taste]`` is the Series of
        quantiles, ``deviations[pairing][taste]`` is the standard deviation
        (used as the scale of the normal draw in ``select_taste_profile``).
    """
    taste_names = ("body", "tannis", "sweetness", "acidity")
    profiles, deviations = {}, {}

    for pairing in pairing_cols:
        matching_wines = df[df[pairing] == 1]
        profiles[pairing] = {
            name: matching_wines[name].quantile(quantiles) for name in taste_names
        }
        deviations[pairing] = {
            name: matching_wines[name].std() for name in taste_names
        }

    return profiles, deviations

# Choose a wine taste profile that (most of the time) makes sense for the pairing
def select_taste_profile(pairing, profiles, deviations, prob=.2, categories=["leve", "moderado", "marcado", "intenso"]):
    """Simulate the taste categories a user would pick for ``pairing``.

    For each of ``body``, ``tannis``, ``sweetness`` and ``acidity``:
    with probability ``prob`` a category is drawn uniformly from
    ``categories``; otherwise a value is drawn from a normal distribution
    centred on the pairing's median (``profiles[pairing][taste][.5]``) with
    scale ``deviations[pairing][taste]`` and mapped to a label by
    ``get_category``.

    Args:
        pairing: Key into ``profiles`` and ``deviations``.
        profiles: Nested mapping pairing -> taste -> quantiles, where the
            ``.5`` entry is read as the median.
        deviations: Nested mapping pairing -> taste -> standard deviation.
        prob: Probability of ignoring the pairing and picking at random.
        categories: Labels to choose from.

    Returns:
        Dict mapping each taste name to the selected category label.
    """
    chosen = {}
    for taste in ("body", "tannis", "sweetness", "acidity"):
        pick_at_random = random.random() < prob
        if not pick_at_random:
            # "Sensible" user: sample around the typical value for this pairing.
            centre = profiles[pairing][taste][.5]
            spread = deviations[pairing][taste]
            sampled_value = np.random.normal(loc=centre, scale=spread)
            chosen[taste] = get_category(sampled_value, categories=categories)
            continue
        # "Whimsical" user: any label, regardless of the pairing.
        chosen[taste] = str(np.random.choice(categories))
    return chosen

# Select grapes according to how frequent each grape is in the dataset
def select_grapes(grapes_df, top_n_grapes, min_grapes, max_grapes):
    """Simulate the grapes a user would tick, weighted by dataset frequency.

    The ``top_n_grapes`` most frequent columns of ``grapes_df`` each get a
    probability equal to their share of the column-sum total; the remaining
    share is assigned to an extra ``"Other"`` option. Every option is then
    included independently with its probability, and the draw is repeated
    until at least ``min_grapes`` options were picked.

    Args:
        grapes_df: DataFrame with one numeric column per grape (column sums
            are used as frequencies).
        top_n_grapes: Number of most frequent grapes offered individually.
        min_grapes: Minimum number of options the result must contain before
            truncation.
        max_grapes: Maximum number of options returned. When more were drawn,
            the first ``max_grapes`` in descending-frequency order are kept,
            so ``"Other"`` (always last) is the first one dropped.

    Returns:
        List of selected option names (grape column names and/or ``"Other"``).

    Raises:
        ValueError: If ``min_grapes`` is larger than the number of options
            with a positive probability, which would otherwise make the
            resampling loop run forever.
    """
    top_grapes = grapes_df.sum(axis=0).sort_values(ascending=False).head(top_n_grapes)
    total_grapes_in_df = sum(grapes_df.sum(axis=0))
    top_grapes_prob = top_grapes / total_grapes_in_df
    top_grapes_prob["Other"] = 1 - sum(top_grapes_prob)
    grapes_prob = dict(top_grapes_prob)

    # Options with probability <= 0 (or NaN) can never be drawn; if fewer
    # than ``min_grapes`` remain, the loop below could never terminate.
    selectable = sum(1 for prob in grapes_prob.values() if prob > 0)
    if min_grapes > selectable:
        raise ValueError(
            f"min_grapes={min_grapes} cannot be satisfied: only {selectable} "
            "grape option(s) have a positive selection probability"
        )

    while True:
        grape_selection = [
            grape for grape, prob in grapes_prob.items() if random.random() < prob
        ]
        if len(grape_selection) >= min_grapes:
            break

    if len(grape_selection) > max_grapes:
        grape_selection = grape_selection[:max_grapes]

    return grape_selection


# Select a price range following typical user logic
def select_price_range(df,
                       quantiles=[.075, .125, .25, .375, .5, .625, .75, .875],
                       low_prices_weights=[.2, .3, .25, .15, .05, .025, .015, 0.01, 0, 0],
                       min_dispersion=2):
    """Simulate the ``[low, high]`` price range a user would search in.

    Candidate bounds are the requested quantiles of ``df["price"]`` after it
    has been passed through ``ut.manage_outlier_IQR(..., func="remove")``
    (presumably IQR-based outlier removal; confirm in the utils module),
    plus two extra bounds stored under the labels 0 and 1: the value ``0``
    and the value ``-1``. With quantile levels strictly between 0 and 1 these
    end up first and last after sorting by label.
    NOTE(review): ``-1`` looks like a "no upper limit" sentinel; confirm with
    the consumer of this range.

    Args:
        df: DataFrame with a ``price`` column.
        quantiles: Quantile levels used as candidate bounds.
        low_prices_weights: Selection weights for the lower bound, one per
            candidate bound (``len(quantiles) + 2`` entries when the levels
            lie strictly between 0 and 1).
        min_dispersion: Minimum index distance between the lower and upper
            bound, capped so the upper bound stays inside the candidate list.

    Returns:
        Two-item list ``[low, high]`` with both bounds truncated to ``int``.
    """
    trimmed_prices = ut.manage_outlier_IQR(df=df["price"], func="remove")
    bounds = trimmed_prices.quantile(quantiles)
    # Label-based assignment: adds the entries labelled 0 and 1 to the
    # quantile-indexed Series.
    bounds[0], bounds[1] = 0, -1
    # Sort by label, then drop the labels so bounds are addressed by position.
    bounds = pd.Series(bounds.sort_index().values)
    n_bounds = len(bounds)

    low_idx = random.choices(population=range(n_bounds), weights=low_prices_weights, k=1)[0]

    # The upper bound starts ``min_dispersion`` positions above the lower one,
    # or at the last position when that would fall outside the list.
    if (low_idx + min_dispersion) < n_bounds:
        first_high_idx = low_idx + min_dispersion
    else:
        first_high_idx = n_bounds - 1
    high_candidates = range(first_high_idx, n_bounds)

    # Weights decay with the index, favouring upper bounds closer to the start.
    high_weights = [1 / idx**1.5 for idx in high_candidates]
    high_idx = random.choices(population=high_candidates, weights=high_weights, k=1)[0]

    return [int(bounds[low_idx]), int(bounds[high_idx])]

In [1056]:
# Load the cleaned wine catalogue plus the auxiliary lists of pairing and
# grape column names, and the meals spreadsheet used to draw a random meal.
wines_df = pd.read_csv("../../src/data/transformed/wines_clean.csv")
pairings = pd.read_csv("../../src/data/processed/aux/pairings.csv")
pairings = list(pairings["pairings"])
meals_df = pd.read_excel("../../src/data/raw/meals/Meals.xlsx")
grapes = pd.read_csv("../../src/data/processed/aux/grapes.csv")
grapes = grapes["grapes"]
# Sub-frame of wines_df restricted to the grape columns
grape_selection = wines_df[grapes]
scaler = MinMaxScaler()

taste_columns = ["body", "tannis", "sweetness", "acidity"]

# Work on a copy so wines_df keeps the unscaled taste values
wines_copy = wines_df.copy()
wines_copy[taste_columns] = scaler.fit_transform(wines_df[taste_columns])


# Build the taste profiles per pairing and their standard deviations, used to sample from a normal distribution
profiles, deviations = build_pairing_profile(df=wines_copy, pairing_cols=pairings, quantiles=[.25, .5, .75])

# Pick a random meal and take its main pairing
# NOTE(review): the lower-cased main ingredient must be one of the keys of `profiles` — confirm against the data
selected_meal = select_random_meal(meals_df)
main_pairing = selected_meal["Ingrediente Principal"].lower()

# Choose the wine profile, allowing for some common sense on the user's side
selected_profile = select_taste_profile(pairing=main_pairing, profiles=profiles, deviations=deviations, prob=.2)

# It would be nice to show a histogram-like distribution of the tastes for the chosen main pairing here
# Top 8 grapes offered individually; between 1 and 4 options selected
selected_grapes = select_grapes(grape_selection, 8, 1, 4)

# Choose the wine price range, following typical user logic
selected_price = select_price_range(wines_df)

selected_price

[13, 27]

In [951]:
# random.choices: https://docs.python.org/3/library/random.html#random.choices

# Scratch version of the price-range selection, written at module level.
# Differences from select_price_range visible here: the fallback for the
# upper-bound start is low_price_index + 1, the weights decay as 1 / x**2,
# and the result is a tuple of floats.

no_outlier_price = ut.manage_outlier_IQR(df=wines_df["price"], func="remove")
price_quantiles = no_outlier_price.quantile([.075, .125, .25, .375, .5, .625, .75, .875])
# Label-based assignment: adds the bounds 0 (label 0) and -1 (label 1)
price_quantiles[0], price_quantiles[1] = 0, -1
price_quantiles = price_quantiles.sort_index()
# Drop the quantile labels so the bounds can be addressed by position
price_quantiles = pd.Series(price_quantiles.values)

# One weight per bound; the last two positions have weight 0, so they are never drawn as lower bound
low_prices_weights = [.2, .3, .25, .15, .05, .025, .015, 0.01, 0, 0]

low_price_index = random.choices(population=range(len(price_quantiles)), weights=low_prices_weights, k=1)[0]

high_top_limit = len(price_quantiles)
# Upper bound starts two positions above the lower bound when that fits
high_bottom_limit = low_price_index + 2 if (low_price_index + 2) < len(price_quantiles) else (low_price_index + 1)

high_price_possible = range(high_bottom_limit, high_top_limit)

# Weights decrease with the index
high_prices_weights = [1 / x**2 for x in high_price_possible]

high_price_index = random.choices(population=high_price_possible, weights=high_prices_weights, k=1)[0]

price_range = price_quantiles[low_price_index], price_quantiles[high_price_index]

price_range

# Earlier attempts, kept for reference:
#high_price_index = random.choice(high_price_possible)

# high_price = random.randint(low_price+3,9)

# print(low_price, high_price)

(np.float64(13.64), np.float64(-1.0))