Me gustaría que el usuario me dé:
- Pairings: una o más combinaciones de alimentos con los que quiere maridar el vino. (OK)
- Body, tannis, sweetness, acidity: de 0 a 1 (OK)
- Uvas: una selección de uvas que yo determine + otras (todas las que no estén incluidas dentro de las que le doy como opción individual). (OK)
- Price: el rango de precios en el cual buscar vinos. (OK)

- Gustó la recomendación: debería haber una función que te diga si al usuario le gustó o no la recomendación en base a diversos parámetros (pensar en el modelo por puntaje y filtro que hice).

1. Pasar a Clase
2. Crear datos sintéticos

In [1]:
import random
import sys
import os

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import MinMaxScaler

sys.path.append(os.path.abspath(os.path.join('..', '..', 'src', 'utils')))
import utils as ut

In [None]:
# randint: https://www.w3schools.com/python/ref_random_randint.asp
# Pandas .quantile: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.quantile.html
# np.random.normal: https://numpy.org/doc/2.1/reference/random/generated/numpy.random.normal.html
# np.clip: https://numpy.org/doc/2.1/reference/generated/numpy.clip.html 
# random.choices: https://docs.python.org/3/library/random.html#random.choices


# Selección de pairing
def select_random_meal(meal_df):
    """Return one uniformly chosen row of ``meal_df`` as a Series.

    The row is picked by position, so the frame may carry any index
    (filtered, shuffled, string-labelled), not only the default 0..n-1 one.

    Args:
        meal_df: DataFrame with one meal per row.

    Returns:
        The selected row as a pandas Series.

    Raises:
        ValueError: if ``meal_df`` has no rows.
    """
    if len(meal_df) == 0:
        raise ValueError("meal_df is empty; there is no meal to select")

    # randint's upper bound is exclusive, so this is a valid row position.
    position = np.random.randint(0, len(meal_df))
    # A position must be resolved with .iloc; .loc would treat it as a label.
    return meal_df.iloc[position]

# Elección de categoría según un valor dado
def get_category(value, quantile_values, categories=("leve", "moderado", "marcado", "intenso")):
    """Map ``value`` to the category of the quantile bin that contains it.

    ``quantile_values`` holds the bin edges in ascending order; bin ``i`` is
    the half-open interval ``[edge[i], edge[i+1])`` and is labelled
    ``categories[i]``.

    Args:
        value: number to classify.
        quantile_values: bin edges (pandas Series, NumPy array or plain
            sequence). Must contain exactly ``len(categories) + 1`` edges.
        categories: one label per bin, ordered from lowest to highest bin.

    Returns:
        Tuple ``(category, mid_val)`` where ``mid_val`` is the midpoint of the
        selected bin.

    Raises:
        IndexError: if the number of bins and categories differ, or there is
            no bin at all.
        ValueError: if ``value`` is negative and below the first edge, if
            ``value`` or an edge is NaN, or if ``value`` cannot be placed
            (e.g. unsorted edges).
    """
    if len(quantile_values) - 1 != len(categories) or len(categories) == 0:
        raise IndexError("Length of quantiles ranges and categories does not match!")

    edges = np.asarray(quantile_values, dtype=float)

    # NaN compares False against everything and would otherwise fall through
    # every branch below and return None silently.
    if np.isnan(value) or np.isnan(edges).any():
        raise ValueError(f"Cannot categorise NaN input (value={value}, edges={edges.tolist()})")

    # Regular case: the value lies inside one of the half-open bins.
    for category, start_val, end_val in zip(categories, edges[:-1], edges[1:]):
        if start_val <= value < end_val:
            return category, (start_val + end_val) / 2

    # Values at or above the last edge belong to the top bin.
    if value >= edges[-1]:
        return categories[-1], (edges[-2] + edges[-1]) / 2

    # Non-negative values below the first edge belong to the bottom bin.
    if 0 <= value < edges[0]:
        return categories[0], (edges[0] + edges[1]) / 2

    if value < 0:
        raise ValueError(f"Passed value is less than zero! {value}")

    raise ValueError(f"Value {value} could not be placed; are the quantile edges sorted?")

# Creación de perfiles de sabor por pairing (para elecciones que hagan sentido con la elección del user)
def build_pairing_profile(df, pairing_cols, taste_cols, quantiles=[0, .25, .5, .75, 1]):
    """Summarise the taste distribution of the wines flagged for each pairing.

    For every pairing column, only the rows where that column equals 1 are
    considered. For each taste column of those rows the requested quantiles
    and the standard deviation are stored.

    Args:
        df: wines DataFrame containing the pairing and taste columns.
        pairing_cols: names of the pairing indicator columns.
        taste_cols: names of the taste columns to summarise.
        quantiles: quantile levels passed to ``Series.quantile``.

    Returns:
        Tuple ``(profiles, deviations)``; ``profiles[pairing][taste]`` is the
        Series of quantiles and ``deviations[pairing][taste]`` the standard
        deviation of that taste among the pairing's wines.
    """
    profiles, deviations = {}, {}

    for pairing in pairing_cols:
        matching_wines = df[df[pairing] == 1]
        quantiles_by_taste = profiles[pairing] = {}
        std_by_taste = deviations[pairing] = {}

        for taste in taste_cols:
            taste_values = matching_wines[taste]
            quantiles_by_taste[taste] = taste_values.quantile(quantiles)
            # Kept so select_taste_profile can sample from a normal distribution.
            std_by_taste[taste] = taste_values.std()

    return profiles, deviations

# Selección del perfil de vino con algo de sentido común (en general - sumamos probabilidad de random choice)
def select_taste_profile(taste_cols, pairing, profiles, deviations, prob=.2, categories=("leve", "moderado", "marcado", "intenso")):
    """Pick a taste category per taste column for a simulated user.

    With probability ``prob`` the choice is uniform among the quantile bins of
    the pairing (an "unexpected" preference). Otherwise a value is sampled
    from a normal distribution centred on the pairing's median and clipped to
    the outermost quantile edges, so the choice usually agrees with the wines
    that match the pairing.

    Args:
        taste_cols: taste column names to pick a category for.
        pairing: pairing key used to look up ``profiles`` and ``deviations``.
        profiles: ``profiles[pairing][taste]`` is a Series of quantile edges
            that must contain the ``0.5`` label (as built by
            ``build_pairing_profile``).
        deviations: ``deviations[pairing][taste]`` is the standard deviation
            used as the scale of the normal distribution.
        prob: probability of taking the uniform-random branch.
        categories: labels forwarded to ``get_category``.

    Returns:
        Dict mapping each taste column to whatever ``get_category`` returns
        for the chosen value (a ``(category, bin midpoint)`` tuple).
    """
    selected_profile = {}

    for taste in taste_cols:
        quantiles = profiles[pairing][taste]
        edges = quantiles.values

        if random.random() < prob:
            # Uniform choice among the midpoints of consecutive quantile edges.
            bin_midpoints = (edges[:-1] + edges[1:]) / 2
            chosen_value = np.random.choice(bin_midpoints)
        else:
            spread = deviations[pairing][taste]
            # The standard deviation is NaN when the pairing subset has a
            # single row; sampling with it would give NaN, which cannot be
            # categorised. Fall back to the median itself (zero spread).
            if not np.isfinite(spread):
                spread = 0.0
            sampled = np.random.normal(loc=quantiles[.5], scale=spread)
            # Keep the sample inside the observed range of the pairing.
            chosen_value = np.clip(sampled, edges[0], edges[-1])

        selected_profile[taste] = get_category(chosen_value, quantile_values=quantiles, categories=categories)

    return selected_profile

# Obtención de top N grapes y su cantidad
def get_top_grapes(grapes_df, top_n_grapes):
    """Return the ``top_n_grapes`` columns of ``grapes_df`` with the largest sums.

    Args:
        grapes_df: DataFrame with one column per grape.
        top_n_grapes: number of grapes to keep.

    Returns:
        Series indexed by grape name holding each column sum, ordered from the
        largest to the smallest.
    """
    totals_per_grape = grapes_df.sum(axis=0)
    ranked = totals_per_grape.sort_values(ascending=False)
    return ranked.head(top_n_grapes)

# Selección de uvas según probabilidad de uva en dataset
def select_grapes(grapes_df, top_n_grapes, min_grapes, max_grapes):
    """Randomly select grapes, each with probability equal to its share in the data.

    The ``top_n_grapes`` most frequent grapes are candidates, plus an
    "Otras Uvas" entry that receives the remaining share. Each candidate is
    included independently; the draw is repeated until at least
    ``min_grapes`` candidates were included.

    Args:
        grapes_df: DataFrame with one indicator column per grape.
        top_n_grapes: number of individual grapes offered as candidates.
        min_grapes: minimum number of grapes the selection must contain.
        max_grapes: maximum number of grapes returned.

    Returns:
        List of selected grape names. When more than ``max_grapes`` were
        drawn, only the first ``max_grapes`` in candidate order are kept
        (most frequent grapes first, "Otras Uvas" last).

    Raises:
        ValueError: if ``min_grapes`` can never be reached because fewer
            candidates have a positive probability.
    """
    top_grapes = get_top_grapes(grapes_df, top_n_grapes)
    total_grapes_in_df = grapes_df.sum(axis=0).sum()

    grapes_prob = dict(top_grapes / total_grapes_in_df)
    grapes_prob["Otras Uvas"] = 1 - sum(grapes_prob.values())

    # Without this guard the rejection loop below would spin forever, e.g.
    # when min_grapes exceeds the candidate count or the frame has no grapes
    # at all (every probability is then NaN, and NaN > 0 is False).
    reachable = sum(1 for prob in grapes_prob.values() if prob > 0)
    if min_grapes > reachable:
        raise ValueError(
            f"min_grapes={min_grapes} cannot be reached: only {reachable} "
            "grape options have a positive probability"
        )

    # Rejection sampling: redraw until enough grapes were included.
    while True:
        grape_selection = [
            grape for grape, prob in grapes_prob.items() if random.random() < prob
        ]
        if len(grape_selection) >= min_grapes:
            break

    if len(grape_selection) > max_grapes:
        grape_selection = grape_selection[:max_grapes]

    return grape_selection


# Selección de rango de precios considerando lógica de usuarios
def select_price_range(df,
                       quantiles=[.075, .125, .25, .375, .5, .625, .75, .875],
                       low_prices_weights=[.2, .3, .25, .15, .05, .025, .015, 0.01, 0, 0],
                       min_dispersion=2):
    """Draw a ``[low, high]`` price range from a ladder of price quantiles.

    The ladder is made of the requested quantiles of ``df["price"]`` after
    ``ut.manage_outlier_IQR(..., func="remove")`` (presumably IQR-based
    outlier removal; confirm in ``utils``), extended with a value of 0 under
    label 0 and a value of -1 under label 1.
    NOTE(review): the -1 at the top looks like a sentinel for "no upper
    limit"; confirm how consumers of the returned range interpret it.

    Args:
        df: DataFrame with a ``"price"`` column.
        quantiles: quantile levels used to build the ladder.
        low_prices_weights: selection weight of each ladder step as the lower
            bound; needs one weight per step (the quantiles plus the two
            added steps).
        min_dispersion: minimum number of ladder steps between the lower and
            the upper bound, capped at the last step.

    Returns:
        ``[low_price, high_price]`` as ints.
    """
    prices = ut.manage_outlier_IQR(df=df["price"], func="remove")
    ladder = prices.quantile(quantiles)

    # Assigning to labels 0 and 1 extends the quantile-indexed Series; sorting
    # by label then puts 0 first and -1 last. A positional index is restored
    # afterwards so steps can be addressed by number.
    ladder[0] = 0
    ladder[1] = -1
    ladder = pd.Series(ladder.sort_index().values)
    n_steps = len(ladder)

    low_idx = random.choices(population=range(n_steps), weights=low_prices_weights, k=1)[0]

    # The upper bound starts min_dispersion steps above the lower one, without
    # running past the last step.
    first_high_idx = min(low_idx + min_dispersion, n_steps - 1)
    high_candidates = range(first_high_idx, n_steps)

    # Higher steps are progressively less likely.
    high_weights = [1 / step**1.5 for step in high_candidates]
    high_idx = random.choices(population=high_candidates, weights=high_weights, k=1)[0]

    return [int(ladder[low_idx]), int(ladder[high_idx])]

### elección de vino

In [321]:
# Transforma df escalando los gustos y agregando columna "Otras Uvas"

def transform_df(df, grapes, top_grapes):
    """Return a copy of ``df`` with scaled taste columns and an "Otras Uvas" column.

    The input frame is left untouched; all new columns are added to a copy.

    Args:
        df: wines DataFrame with the columns ``body``, ``tannis``,
            ``sweetness``, ``acidity`` and one column per grape in ``grapes``.
        grapes: iterable with every grape column name.
        top_grapes: container of the grapes offered individually; membership
            is tested with ``in``, so a Series indexed by grape name (as
            returned by ``get_top_grapes``) works as well as a list.

    Returns:
        New DataFrame with ``<taste>_scld`` columns (min-max scaled) and
        ``"Otras Uvas"``, the row-wise maximum over all grapes that are not
        in ``top_grapes`` (0 when there are no such grapes).
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    tra_df = df.copy()

    # Taste scaling
    scaler = MinMaxScaler()

    taste_features = ["body", "tannis", "sweetness", "acidity"]
    scaled_tastes = [taste + "_scld" for taste in taste_features]
    tra_df[scaled_tastes] = scaler.fit_transform(tra_df[taste_features])

    # "Otras Uvas" column: flags wines made with any grape outside top_grapes.
    other_grapes = [grape for grape in grapes if grape not in top_grapes]
    if other_grapes:
        tra_df["Otras Uvas"] = tra_df[other_grapes].max(axis=1)
    else:
        # A maximum over zero columns is NaN; no other grapes means the flag is 0.
        tra_df["Otras Uvas"] = 0

    return tra_df

In [None]:
def pick_wine(wines_df, user_input, top_n=10, d=.075):
    """Filter the wines by the user's constraints, score them and pick one of the best.

    Args:
        wines_df: wines DataFrame with ``price``, ``rating``, ``rating_qty``,
            the ``*_scld`` taste columns and the pairing / grape columns that
            ``user_input`` refers to. It is not modified.
        user_input: dict with
            - ``"pairing_list"``, ``"grape_list"`` (optional): column names; a
              wine is kept when at least one of the columns is positive.
            - ``"precio_min"``, ``"precio_max"`` (optional): price bounds.
            - ``"main_pairing"``: pairing column that earns a score bonus.
            - ``"tastes"``: target values for body, tannis, sweetness and
              acidity on the scaled range.
            - ``"weights"``: dict with the keys ``rating``, ``price_quality``,
              ``rating_qty``, ``user_similarity`` and ``main_pairing``.
        top_n: number of best-scored wines among which one is drawn.
        d: relative decrease of the selection weight per rank.

    Returns:
        The selected wine as a Series, including the score columns computed
        here.

    Raises:
        ValueError: if no wine passes the filters.
    """

    def _scaled_by_max(values):
        # Divide by the maximum; a zero or undefined maximum would turn every
        # score into NaN/inf, so that component contributes 0 instead.
        peak = values.max()
        if peak > 0:
            return values / peak
        return pd.Series(0.0, index=values.index)

    # User inputs
    user_pairings = user_input.get("pairing_list")
    precio_min = user_input.get("precio_min")
    precio_max = user_input.get("precio_max")
    grape_list = user_input.get("grape_list")

    # User weights
    user_weights = user_input.get("weights")
    rating_w = user_weights.get("rating")
    price_quality_w = user_weights.get("price_quality")
    rating_qty_w = user_weights.get("rating_qty")
    user_similarity_w = user_weights.get("user_similarity")
    main_pairing_w = user_weights.get("main_pairing")

    # Every filter is optional: start from all wines and narrow down, so a
    # missing pairing filter no longer leaves the candidate set undefined.
    keep = np.ones(len(wines_df), dtype=bool)
    if user_pairings is not None:
        keep &= (wines_df[user_pairings].sum(axis=1) > 0).to_numpy()
    if precio_min is not None:
        keep &= (wines_df["price"] >= precio_min).to_numpy()
    # NOTE(review): select_price_range can return -1 as the upper bound, which
    # looks like a "no upper limit" sentinel; a negative maximum is therefore
    # treated as no limit instead of filtering out every wine. Confirm.
    if precio_max is not None and precio_max >= 0:
        keep &= (wines_df["price"] <= precio_max).to_numpy()
    if grape_list is not None:
        keep &= (wines_df[grape_list].sum(axis=1) > 0).to_numpy()

    # Copy so the score columns are added to an independent frame rather than
    # to a slice of wines_df.
    wine_base = wines_df[keep].copy()
    if wine_base.empty:
        raise ValueError("No wines match the given pairing, price and grape filters")

    # Quality/price z-score: above-average rating and below-average price both help.
    wine_base["rating_zscore"] = stats.zscore(wine_base["rating"])
    wine_base["price_zscore"] = stats.zscore(wine_base["price"])
    wine_base["zscore_quality_price"] = wine_base["rating_zscore"] - wine_base["price_zscore"]

    # User similarity (Euclidean distance in scaled taste space).
    scaled_tastes = ["body_scld", "tannis_scld", "sweetness_scld", "acidity_scld"]
    taste_keys = ["body", "tannis", "sweetness", "acidity"]
    tastes = user_input["tastes"]
    if all(key in tastes for key in taste_keys):
        # Read by key so the vector lines up with scaled_tastes regardless of
        # the dict's insertion order.
        taste_values = [tastes[key] for key in taste_keys]
    else:
        taste_values = list(tastes.values())
    user_tastes = np.array(taste_values).reshape(1, -1)
    distances = euclidean_distances(user_tastes, wine_base[scaled_tastes])[0]
    farthest = distances.max()
    if farthest > 0:
        wine_base["user_similarity"] = 1 - (distances / farthest)
    else:
        # Every candidate matches the requested tastes exactly.
        wine_base["user_similarity"] = 1.0

    # Synthetic wine score: weighted sum of the normalised components.
    wine_base["wine_score"] = (
        rating_w * (wine_base["rating"] / 5) +
        price_quality_w * _scaled_by_max(wine_base["zscore_quality_price"]) +
        rating_qty_w * _scaled_by_max(wine_base["rating_qty"]) +
        user_similarity_w * wine_base["user_similarity"] +
        main_pairing_w * wine_base[user_input["main_pairing"]]
    )

    # Draw one of the top_n wines: weights decrease by d per rank and are
    # multiplied by a random factor in [1, 2).
    top_wine_base = wine_base.nlargest(top_n, "wine_score")
    top_selection_weights = [
        (1 - d)**rank * (random.random() + 1) for rank in range(len(top_wine_base))
    ]
    total_weight = sum(top_selection_weights)
    top_selection_weights = [w / total_weight for w in top_selection_weights]  # normalised for interpretability

    # Choose by position so any index (non-integer, duplicated) is supported.
    chosen_position = random.choices(range(len(top_wine_base)), top_selection_weights, k=1)[0]
    return top_wine_base.iloc[chosen_position]

### usuario

In [None]:
# Driver cell: loads the data, simulates one user's choices and assembles the
# `user_input` dict in the shape that pick_wine reads.

# Data loading (paths are relative to the notebook's location)
wines_df = pd.read_csv("../../src/data/transformed/wines_clean.csv")
pairings = pd.read_csv("../../src/data/processed/aux/pairings.csv")
pairings = list(pairings["pairings"])
meals_df = pd.read_excel("../../src/data/raw/meals/Meals.xlsx")
grapes = pd.read_csv("../../src/data/processed/aux/grapes.csv")
grapes = grapes["grapes"]
# Sub-frame of wines_df holding only the grape columns
grape_selection = wines_df[grapes]
scaler = MinMaxScaler()

taste_columns = ["body", "tannis", "sweetness", "acidity"]
scld_taste_columns = [taste + "_scld" for taste in taste_columns]

# Add min-max scaled taste columns to a copy, leaving wines_df unchanged
wines_copy = wines_df.copy()
wines_copy[scld_taste_columns] = scaler.fit_transform(wines_df[taste_columns])


# Build the taste profile (quantiles) of each pairing and its standard deviations,
# used later to sample tastes from a normal distribution
profiles, deviations = build_pairing_profile(df=wines_copy, pairing_cols=pairings, taste_cols=scld_taste_columns)

# Pick a random meal and derive its pairings
selected_meal = select_random_meal(meals_df)
main_pairing = selected_meal["Ingrediente Principal"].lower()
# Everything after the first two fields is treated as pairing flags
# (presumably the first two are "Comida" and "Ingrediente Principal"; confirm against Meals.xlsx)
pairing_list = selected_meal[2:] 
pairing_list = list(pairing_list.index[pairing_list>0])

# Pick the wine taste profile, keeping some of the users' common sense
selected_profile = select_taste_profile(taste_cols=scld_taste_columns, pairing=main_pairing, profiles=profiles, deviations=deviations, prob=.2)

# TODO: it would be nice to show a histogram-like distribution of the tastes for the chosen main pairing here
# Candidates: the 8 most frequent grapes; between 1 and 4 grapes are selected
selected_grapes = select_grapes(grape_selection, 8, 1, 4)

# Pick the wine price range, following user-like logic
selected_price_range = select_price_range(wines_df)

# Build the user input
user_input = {
    "meal": selected_meal["Comida"],
    "main_pairing": main_pairing,
    "pairing_list": pairing_list,
    "grape_list": selected_grapes,
    "precio_min": selected_price_range[0],
    "precio_max": selected_price_range[1],
    # Each selected_profile entry is a (category, bin midpoint) tuple; [1] keeps the numeric midpoint
    "tastes": {
        "body": selected_profile["body_scld"][1],
        "tannis": selected_profile["tannis_scld"][1],
        "sweetness": selected_profile["sweetness_scld"][1],
        "acidity": selected_profile["acidity_scld"][1]
    },
    # Relative weight of each component of the wine score
    "weights": {
        "rating": .3,
        "price_quality": .25,
        "rating_qty": .1,
        "user_similarity": .25,
        "main_pairing": .1
    }
}

# Display the generated input as the cell output
user_input

{'meal': 'Guiso de Lentejas',
 'main_pairing': 'vegetarian',
 'pairing_list': ['vegetarian'],
 'grape_list': ['Malbec'],
 'precio_min': 15,
 'precio_max': 34,
 'tastes': {'body': np.float64(0.46343065693430663),
  'tannis': np.float64(0.37758945386064025),
  'sweetness': np.float64(0.3744114877589454),
  'acidity': np.float64(0.39025331104539024)},
 'weights': {'rating': 0.3,
  'price_quality': 0.25,
  'rating_qty': 0.1,
  'user_similarity': 0.25,
  'main_pairing': 0.1}}

In [322]:
# Simular si le gustó o no
# Generar clase
# Crear datos sintéticos