In [16]:
import os
import sys
import ast
import logging

import pandas as pd
import numpy as np

sys.path.append(os.path.abspath(os.path.join('..', 'src', 'utils')))

import utils as ut

# Entendimiento general de datos

In [2]:
wines_df = pd.read_csv("../src/data/processed/all_wines_complete.csv")
wines_df.head()

Unnamed: 0,wine_link,name,year,winery,rating,rating_qty,price,body,tannis,sweetness,acidity,notes,pairings,grapes,region,style,alcohol,image
0,https://www.vivino.com/US/en/luigi-bosca-parai...,Paraiso,2020,Luigi Bosca,4.8,582,188.33,73.43435%,50.905894999999994%,13.619732499999998%,44.74824%,"{'black fruit': 14, 'oaky': 13, 'earthy': 5, '...","['beef', 'lamb', 'poultry', 'game (deer, venis...","['Cabernet Sauvignon', 'Malbec']","['Argentina', 'Mendoza']",Argentinian Cabernet Sauvignon - Malbec,,https://images.vivino.com/thumbs/_Bf6JTwYRpSX6...
1,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2015,Catena Zapata,4.7,297,675.0,74.17103499999999%,55.8328075%,14.345029999999998%,54.45488%,"{'oaky': 109, 'black fruit': 50, 'spices': 25,...","['beef', 'lamb', 'poultry', 'pasta']","['Cabernet Sauvignon', 'Malbec', 'Merlot']","['Argentina', 'Mendoza', 'Agrelo']",Argentinian Bordeaux Blend,14%,https://images.vivino.com/thumbs/Yt464jw0QS-ug...
2,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2017,Catena Zapata,4.7,219,580.0,74.17103499999999%,55.8328075%,14.345029999999998%,54.45488%,"{'oaky': 109, 'black fruit': 50, 'spices': 25,...","['beef', 'lamb', 'poultry', 'pasta']","['Cabernet Sauvignon', 'Malbec', 'Merlot']","['Argentina', 'Mendoza', 'Agrelo']",Argentinian Bordeaux Blend,,https://images.vivino.com/thumbs/Yt464jw0QS-ug...
3,https://www.vivino.com/US/en/monteviejo-la-vio...,La Violeta,2013,Monteviejo,4.7,202,150.0,73.06289000000001%,40.9165975%,21.4814425%,36.61685%,"{'oaky': 51, 'black fruit': 49, 'earthy': 26, ...","['beef', 'lamb']",['100%Malbec'],"['Argentina', 'Mendoza', 'Uco Valley']",Argentinian Uco Valley Malbec Red,14.5%,https://images.vivino.com/thumbs/FO-x9h3mQHSx9...
4,https://www.vivino.com/US/en/vina-cobos-cobos-...,Cobos Volturno,2018,Viña Cobos,4.7,198,399.99,71.052235%,45.63599%,24.262824999999996%,38.4480575%,"{'oaky': 86, 'black fruit': 49, 'spices': 17, ...","['beef', 'lamb', 'poultry', 'game (deer, venis...",['100%Cabernet Sauvignon'],"['Argentina', 'Mendoza', 'Perdriel']",Argentinian Cabernet Sauvignon - Malbec,,https://images.vivino.com/thumbs/D1Mf1fYnRnage...


In [3]:
wines_df.shape

(2026, 18)

In [4]:
wines_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wine_link   2026 non-null   object 
 1   name        2026 non-null   object 
 2   year        2023 non-null   object 
 3   winery      2026 non-null   object 
 4   rating      2026 non-null   float64
 5   rating_qty  2026 non-null   object 
 6   price       2009 non-null   object 
 7   body        1974 non-null   object 
 8   tannis      1663 non-null   object 
 9   sweetness   1945 non-null   object 
 10  acidity     1974 non-null   object 
 11  notes       2026 non-null   object 
 12  pairings    1974 non-null   object 
 13  grapes      2012 non-null   object 
 14  region      2026 non-null   object 
 15  style       1974 non-null   object 
 16  alcohol     979 non-null    object 
 17  image       2026 non-null   object 
dtypes: float64(1), object(17)
memory usage: 285.0+ KB


# Transformación de datos

## Pasos:

1. **`Tipo de Dato Float`**: convierte las columnas **`[year, rating_qty, price]`** a numéricas.
- **`year`**: existen vinos sin año que traen el dato "N.V.". Los pasamos a NaN.
- **`rating_qty`**: cuando los vinos no tienen suficientes ratings, los basan en los vinos de otros años. En este caso, los pasamos a NaN.

2. **`Tastes + Alcohol`**: convierte las columnas **`[body, tannis, sweetness, acidity, alcohol]`** en decimales.

3. **`Diccionarios`**: transforma la columna **`{notes}`** a formato tipo *one-hot* normalizado, donde la suma de todas las notas de un vino da 1.

4. **`Listas`**: splitea y transforma las columnas **`[pairings, grapes, region]`** a formato *one-hot*.

In [5]:
# Entendimiento de valores nulos year, rating_qyt, price
col = "price" 
mask_invalid = pd.to_numeric(wines_df[col], errors="coerce").isna() & wines_df[col].notna()
wines_df[mask_invalid]

Unnamed: 0,wine_link,name,year,winery,rating,rating_qty,price,body,tannis,sweetness,acidity,notes,pairings,grapes,region,style,alcohol,image
1635,https://www.vivino.com/catena-appellation-en-f...,Appellation En Fuego Malbec,2013,Catena,4.1,based,1438.99,59.347825%,21.85055%,15.054347499999999%,23.4782625%,{},"['beef', 'lamb', 'pork']",['Malbec'],"['Argentina', 'Mendoza']",Argentinian Mendoza Malbec Red,,https://images.vivino.com/thumbs/TwuvDFXuToi7l...


In [6]:
# Year, Rating Qty, Price
winesdf_dtypes = wines_df.copy()
columns_to_convert = ["year", "rating_qty", "price"]
for col in columns_to_convert:
    winesdf_dtypes[col] = pd.to_numeric(winesdf_dtypes[col].str.replace(",", ""), errors="coerce")
winesdf_dtypes["year"] = winesdf_dtypes["year"].astype("Int64")
winesdf_dtypes["rating_qty"] = winesdf_dtypes["rating_qty"].astype("Int64")
winesdf_dtypes.info()
winesdf_dtypes.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wine_link   2026 non-null   object 
 1   name        2026 non-null   object 
 2   year        1998 non-null   Int64  
 3   winery      2026 non-null   object 
 4   rating      2026 non-null   float64
 5   rating_qty  1606 non-null   Int64  
 6   price       2009 non-null   float64
 7   body        1974 non-null   object 
 8   tannis      1663 non-null   object 
 9   sweetness   1945 non-null   object 
 10  acidity     1974 non-null   object 
 11  notes       2026 non-null   object 
 12  pairings    1974 non-null   object 
 13  grapes      2012 non-null   object 
 14  region      2026 non-null   object 
 15  style       1974 non-null   object 
 16  alcohol     979 non-null    object 
 17  image       2026 non-null   object 
dtypes: Int64(2), float64(2), object(14)
memory usage: 289.0+ KB


Unnamed: 0,wine_link,name,year,winery,rating,rating_qty,price,body,tannis,sweetness,acidity,notes,pairings,grapes,region,style,alcohol,image
0,https://www.vivino.com/US/en/luigi-bosca-parai...,Paraiso,2020,Luigi Bosca,4.8,582,188.33,73.43435%,50.905894999999994%,13.619732499999998%,44.74824%,"{'black fruit': 14, 'oaky': 13, 'earthy': 5, '...","['beef', 'lamb', 'poultry', 'game (deer, venis...","['Cabernet Sauvignon', 'Malbec']","['Argentina', 'Mendoza']",Argentinian Cabernet Sauvignon - Malbec,,https://images.vivino.com/thumbs/_Bf6JTwYRpSX6...
1,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2015,Catena Zapata,4.7,297,675.0,74.17103499999999%,55.8328075%,14.345029999999998%,54.45488%,"{'oaky': 109, 'black fruit': 50, 'spices': 25,...","['beef', 'lamb', 'poultry', 'pasta']","['Cabernet Sauvignon', 'Malbec', 'Merlot']","['Argentina', 'Mendoza', 'Agrelo']",Argentinian Bordeaux Blend,14%,https://images.vivino.com/thumbs/Yt464jw0QS-ug...
2,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2017,Catena Zapata,4.7,219,580.0,74.17103499999999%,55.8328075%,14.345029999999998%,54.45488%,"{'oaky': 109, 'black fruit': 50, 'spices': 25,...","['beef', 'lamb', 'poultry', 'pasta']","['Cabernet Sauvignon', 'Malbec', 'Merlot']","['Argentina', 'Mendoza', 'Agrelo']",Argentinian Bordeaux Blend,,https://images.vivino.com/thumbs/Yt464jw0QS-ug...


In [7]:
# Tastes
def clean_and_convert_tastes(value):
    if isinstance(value, float):  
        return round(value/100,4)
    return round(pd.to_numeric(str(value)[:5].replace('%', ''), errors='coerce')/100,4)

winesdf_taste = winesdf_dtypes.copy()

columns_to_convert = ['body', 'tannis', 'sweetness', 'acidity', 'alcohol']
for col in columns_to_convert:
    winesdf_taste[col] = winesdf_taste[col].apply(clean_and_convert_tastes)

winesdf_taste.info()
winesdf_taste.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wine_link   2026 non-null   object 
 1   name        2026 non-null   object 
 2   year        1998 non-null   Int64  
 3   winery      2026 non-null   object 
 4   rating      2026 non-null   float64
 5   rating_qty  1606 non-null   Int64  
 6   price       2009 non-null   float64
 7   body        1974 non-null   float64
 8   tannis      1663 non-null   float64
 9   sweetness   1945 non-null   float64
 10  acidity     1974 non-null   float64
 11  notes       2026 non-null   object 
 12  pairings    1974 non-null   object 
 13  grapes      2012 non-null   object 
 14  region      2026 non-null   object 
 15  style       1974 non-null   object 
 16  alcohol     979 non-null    float64
 17  image       2026 non-null   object 
dtypes: Int64(2), float64(7), object(9)
memory usage: 289.0+ KB


Unnamed: 0,wine_link,name,year,winery,rating,rating_qty,price,body,tannis,sweetness,acidity,notes,pairings,grapes,region,style,alcohol,image
0,https://www.vivino.com/US/en/luigi-bosca-parai...,Paraiso,2020,Luigi Bosca,4.8,582,188.33,0.7343,0.509,0.1361,0.4474,"{'black fruit': 14, 'oaky': 13, 'earthy': 5, '...","['beef', 'lamb', 'poultry', 'game (deer, venis...","['Cabernet Sauvignon', 'Malbec']","['Argentina', 'Mendoza']",Argentinian Cabernet Sauvignon - Malbec,,https://images.vivino.com/thumbs/_Bf6JTwYRpSX6...
1,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2015,Catena Zapata,4.7,297,675.0,0.7417,0.5583,0.1434,0.5445,"{'oaky': 109, 'black fruit': 50, 'spices': 25,...","['beef', 'lamb', 'poultry', 'pasta']","['Cabernet Sauvignon', 'Malbec', 'Merlot']","['Argentina', 'Mendoza', 'Agrelo']",Argentinian Bordeaux Blend,0.14,https://images.vivino.com/thumbs/Yt464jw0QS-ug...


In [None]:
# Funciones para gestionar tipos de datos complejos en columnas (listas y diccionarios)
# ast.literal_eval : https://medium.com/@aniruddhapal/the-power-of-the-ast-literal-eval-method-in-python-8fb4014a2574

def extract_unique_values(df_col, type="list"):
    """Procesa el diccionario almacenado como string y crea un set de las palabras que contiene."""
    unique_values = set()
    for entry in df_col.dropna():
        try:
            values = ast.literal_eval(entry) # Convierte string en list o dictionary
            if type == "list":
                unique_values.update(values) # Agrega calores de la list al set()
            elif type == "dict":
                unique_values.update(values.keys()) # Agrega keys del dict al set()
            else:
                raise TypeError("Only types supported are 'list' and 'dict'!")
        except (SyntaxError, ValueError):
            logging.error(f"Error processing entry: {entry}")
            continue
    return sorted(unique_values)

def split_normalize_notes(df):
    unique_notes = extract_unique_values(df["notes"], type="dict")
    
    splitted_df = df.copy()

    for note in unique_notes:
        splitted_df[note] = splitted_df["notes"].apply(
            lambda x: ast.literal_eval(x).get(note, 0) if pd.notna(x) else None
        )

    note_sums = splitted_df[unique_notes].sum(axis=1)
    splitted_df[unique_notes] = splitted_df[unique_notes].div(note_sums, axis=0).fillna(0).round(4) # Normaliza
    return splitted_df.drop(columns=["notes"]), unique_notes

def split_list_cols(df, col):
    unique_values = extract_unique_values(df[col], type="list")
    splitted_df = df.copy()
    for val in unique_values:
        splitted_df[val] = splitted_df[col].apply(
            lambda x: None if pd.isna(x) else 1 if val in ast.literal_eval(x) else 0
        ).astype("Int64")
    return splitted_df.drop(columns=[col]), unique_values

In [10]:
# Notes
winesdf_note, notes = split_normalize_notes(winesdf_taste)
winesdf_note.head(3)
print(notes)
ut.save_csv(pd.DataFrame(notes, columns=["notes"]), path="../src/data/processed/aux/", filename="notes.csv")

['ageing', 'black fruit', 'citrus', 'dried fruit', 'earthy', 'floral', 'oaky', 'red fruit', 'spices', 'tree fruit', 'tropical', 'vegetal', 'yeasty']
Archivo guardado en: ../src/data/processed/aux/notes.csv


In [11]:
# Pairings, Grapes, Regions
list_columns = ["pairings", "grapes", "region"]

winesdf_note["grapes"] = winesdf_note["grapes"].replace(r"\d+%", "", regex=True)

winedf_tra = winesdf_note.copy()

for col in list_columns:
    winedf_tra, unique_vals = split_list_cols(winedf_tra, col)
    ut.save_csv(pd.DataFrame(unique_vals, columns=[f"{col}"]), path="../src/data/processed/aux/", filename=f"{col}.csv")

# winedf_tra[extract_unique_values(winesdf_note["region"], "list")].head() # Revisar valores de una list col

winedf_tra.head(3)

Archivo guardado en: ../src/data/processed/aux/pairings.csv
Archivo guardado en: ../src/data/processed/aux/grapes.csv
Archivo guardado en: ../src/data/processed/aux/region.csv


Unnamed: 0,wine_link,name,year,winery,rating,rating_qty,price,body,tannis,sweetness,...,San Carlos,San Juan,San Rafael,Serra Gaúcha,Tulum Valley,Tunuyán,Tupungato,Uco Valley,Vale dos Vinhedos,Vista Flores
0,https://www.vivino.com/US/en/luigi-bosca-parai...,Paraiso,2020,Luigi Bosca,4.8,582,188.33,0.7343,0.509,0.1361,...,0,0,0,0,0,0,0,0,0,0
1,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2015,Catena Zapata,4.7,297,675.0,0.7417,0.5583,0.1434,...,0,0,0,0,0,0,0,0,0,0
2,https://www.vivino.com/US/en/catena-zapata-est...,Estiba Reservada,2017,Catena Zapata,4.7,219,580.0,0.7417,0.5583,0.1434,...,0,0,0,0,0,0,0,0,0,0


In [12]:
winedf_tra["Pedro Giménez"] = winedf_tra["Pedro Giménez"].fillna(0)

winedf_tra["Pedro Ximenez"] = np.where(
    winedf_tra["Pedro Giménez"] == 1,
    1,
    winedf_tra["Pedro Ximenez"]
)

winedf_tra["Pedro Ximenez"] = winedf_tra["Pedro Ximenez"].astype("Int64")

winedf_tra = winedf_tra.drop(columns=["Pedro Giménez"])

In [13]:
winedf_tra.select_dtypes(include="float")

Unnamed: 0,rating,price,body,tannis,sweetness,acidity,alcohol,ageing,black fruit,citrus,dried fruit,earthy,floral,oaky,red fruit,spices,tree fruit,tropical,vegetal,yeasty
0,4.8,188.33,0.7343,0.5090,0.1361,0.4474,,0.0000,0.3500,0.0000,0.0000,0.1250,0.0500,0.3250,0.0500,0.1000,0.0000,0.0000,0.0000,0.0000
1,4.7,675.00,0.7417,0.5583,0.1434,0.5445,0.140,0.0241,0.2008,0.0080,0.0120,0.0964,0.0241,0.4378,0.0843,0.1004,0.0000,0.0000,0.0040,0.0080
2,4.7,580.00,0.7417,0.5583,0.1434,0.5445,,0.0241,0.2008,0.0080,0.0120,0.0964,0.0241,0.4378,0.0843,0.1004,0.0000,0.0000,0.0040,0.0080
3,4.7,150.00,0.7306,0.4091,0.2148,0.3661,0.145,0.0061,0.2988,0.0000,0.0061,0.1585,0.0671,0.3110,0.0671,0.0671,0.0000,0.0000,0.0061,0.0122
4,4.7,399.99,0.7105,0.4563,0.2426,0.3844,,0.0258,0.2526,0.0103,0.0103,0.0773,0.0103,0.4433,0.0773,0.0876,0.0000,0.0000,0.0000,0.0052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021,3.9,16.99,0.4789,,0.1951,0.5647,,0.0000,0.0000,0.0000,0.0000,0.0000,0.2000,0.0000,0.2000,0.0000,0.6000,0.0000,0.0000,0.0000
2022,3.9,23.63,0.6515,0.5417,0.1375,0.4157,,0.0135,0.1892,0.0000,0.0405,0.1216,0.0135,0.2973,0.1892,0.1081,0.0000,0.0000,0.0135,0.0135
2023,3.9,22.66,0.6311,0.4507,0.2118,0.5878,,0.0061,0.1656,0.0184,0.0061,0.1534,0.0368,0.1166,0.3681,0.0859,0.0000,0.0000,0.0307,0.0123
2024,3.9,10.99,0.6266,0.3178,0.1849,0.3760,0.140,0.0072,0.2984,0.0052,0.0143,0.1610,0.0120,0.3142,0.1013,0.0633,0.0032,0.0016,0.0072,0.0111


In [14]:
winedf_tra.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Columns: 118 entries, wine_link to Vista Flores
dtypes: Int64(93), float64(20), object(5)
memory usage: 2.0+ MB


In [15]:
# Guardado final del dataset transformado
ut.save_csv(winedf_tra, path="../src/data/transformed/", filename="wines_transformed.csv")

Archivo guardado en: ../src/data/transformed/wines_transformed.csv
