<a href="https://colab.research.google.com/github/Perciii/WineBlindTasting/blob/main/ml_blind_tasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Déterminer un vin à partir de son analyse

In [22]:

# =========================
# 1) Setup / Imports
# =========================

import json
import re
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier

sns.set_theme(style="white")

RANDOM_STATE = 2

Upload json file from extracted database

In [23]:
# =========================
# 2) Loading helpers
# =========================

def load_flashwine_export(json_path: str) -> dict:
    """
    Load a FlashWine RTDB export JSON file.

    Parameters
    ----------
    json_path : str
        Path to the JSON export file (e.g. 'flashwine-default-rtdb-export.json').

    Returns
    -------
    dict
        Parsed JSON as Python dictionary.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)


def extract_all_tasting_notes(export: dict) -> List[dict]:
    """
    Extract tasting notes from the FlashWine export structure.

    Expected structure (based on current notebook):
      export['users'][<uid>]['tastingNotes'][<note_id>] -> note_dict

    Returns
    -------
    list[dict]
        All tasting notes across all users.
    """
    notes: List[dict] = []
    for uid, user_data in export.get("users", {}).items():
        tn = user_data.get("tastingNotes", {})
        for note_id, note in tn.items():
            notes.append(note)
    return notes


def build_tasting_notes_df(all_tasting_notes: List[dict]) -> pd.DataFrame:
    """
    Create the main DataFrame from extracted tasting notes.
    """
    return pd.DataFrame.from_dict(all_tasting_notes)

DICT TO DF & cleaning

In [24]:
# =========================
# 3) Cleaning
# =========================

DEFAULT_DROP_COLS = [
    # IDs / metadata / free-text that is not used as model features here
    "id", "isPrivate", "tastingName", "groupId",
    "bottleAgeingComment", "otherObservations", "qualityComment",
]

def clean_tasting_notes_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean tasting notes dataset:
    - drop non-feature columns (configurable)
    - remove internal 'test'/'blind' notes
    - remove placeholder rows where aromas/flavours contain headers
    - keep only clean == True
    - basic NA handling for string filters

    Notes
    -----
    We keep string columns like 'aromas' and 'flavours' for feature engineering later.
    """
    df = df.copy()

    # Drop columns if present (avoids KeyError if schema changes)
    df = df.drop(columns=[c for c in DEFAULT_DROP_COLS if c in df.columns], errors="ignore")

    # Ensure string columns are strings to avoid .str errors
    for col in ["wineName", "aromas", "flavours"]:
        if col in df.columns:
            df[col] = df[col].fillna("").astype(str)

    # Remove "test/blind" wines (internal notes / placeholders)
    if "wineName" in df.columns:
        df = df[~df["wineName"].str.contains(r"(?i)test", regex=True)]
        df = df[~df["wineName"].str.contains(r"(?i)blind", regex=True)]

    # Remove rows that look like template/placeholder text
    if "aromas" in df.columns:
        df = df[~df["aromas"].str.contains(r"(?i)^aromas?$", regex=True)]
    if "flavours" in df.columns:
        df = df[~df["flavours"].str.contains(r"(?i)^flavours?$", regex=True)]

    # Keep only clean notes (clean == True)
    if "clean" in df.columns:
        df = df[df["clean"] == True]

    return df

# Data preprocessing

In [25]:
# =========================
# 4) Feature engineering
# =========================

WINE_COLOURS = {"red": 0, "white": 1, "rosé": 2}
WINE_QUALITY = {"poor": 0, "acceptable": 0.25, "good": 0.5, "very good": 0.75, "outstanding": 1}

def encode_basic_fields(df: pd.DataFrame) -> pd.DataFrame:
    """
    Encode categorical fields into numeric IDs used by models.
    - wineColour -> wineColourId
    - qualityLevel -> qualityLevelId
    """
    df = df.copy()

    if "wineColour" in df.columns:
        df["wineColourId"] = df["wineColour"].map(WINE_COLOURS)
        df = df.drop(columns=["wineColour"], errors="ignore")

    if "qualityLevel" in df.columns:
        df["qualityLevelId"] = df["qualityLevel"].map(WINE_QUALITY)
        df = df.drop(columns=["qualityLevel"], errors="ignore")

    return df


_AROMA_SPLIT_RE = re.compile(r"\s*[,;]\s*")

def split_aromas(text: str) -> List[str]:
    """
    Split an aroma string into a cleaned list of aroma tokens.

    Assumptions:
    - aromas are separated by commas and/or semicolons
    - casing and extra whitespace are not meaningful
    """
    if text is None:
        return []
    text = str(text).strip().lower()
    if not text:
        return []
    parts = _AROMA_SPLIT_RE.split(text)
    parts = [p.strip() for p in parts if p and p.strip()]
    return parts


def build_aroma_vocabulary(aromas_series: pd.Series) -> Dict[str, int]:
    """
    Build a vocabulary mapping aroma -> index over the whole dataset.
    """
    all_lists = aromas_series.fillna("").astype(str).map(split_aromas).tolist()
    vocab = sorted(set(a for lst in all_lists for a in lst))
    return {a: i for i, a in enumerate(vocab)}


# Grouped aromas
GROUPED_AROMAS: Dict[str, List[str]] = {
    "flowers":["blossom","elderflower","honeysuckle","hawthorn","acacia","lily of the valley","basswood","jasmine","rose","violet","geranium","poppy","lavender","orange blossom","dried flowers","red flowers","white flowers"],
    "green fruits":["apple","pear","quince","gooseberry","grape"],
    "citrus fruits":["grapefruit","lemon","lime","orange","bergamot","blood orange","clementine","mandarin","pomelo"],
    "stone fruits":["peach","yellow peach","white peach","apricot","nectarine","mirabelle plum"],
    "tropical fruits":["banana","lychee","mango","melon","yellow melon","passion fruit","pineapple","guava","fig","papaya"],
    "red fruits":["redcurrant","cranberry","raspberry","strawberry","red cherry","cherry","red plum","plum","pomegranate"],
    "black fruits":["cassis","blackcurrant","blackberry","blueberry","black cherry","black plum","cherry","damson plum","plum"],
    "herbaceous":["bell pepper","green bell pepper","grass","fern","tomato leaf","boxtree","asparagus"],
    "herbs":["eucalyptus","mint","peppermint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","coriander","tarragon","anise","laurel","medicinal herbs"],
    "spices":["pepper","black pepper","white pepper","green pepper","liquorice","cinnamon","saffron","cumin"],
    "fruit ripeness":["unripe","ripe","over-ripe","dried","cooked","jam","compote"],
    "other":["simple","wet stones","mineral","stone","salt","candy","turkish delight","kirsch","chambord","delicatessen","flint","silex","cardboard","tar","steel"],
    "yeast (lees, autolysis, flor)":["biscuit","pastry","brioche","bread","toast","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
    "malolactic conversion":["butter","cream","cheese","goat cheese"],
    "oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","cocoa","coffee","mocha"],
    "red wine":["dried fruit","prune","raisin","dried fig","date","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","dirt","mushroom","meat","blood","game","tobacco","wet leaves","forest floor","caramel"],
    "white wine":["candied lemon","candied melon","candied orange","candied peach","dried fruit","dried apricot","dried mango","prune","raisin","dried fig","date","orange marmalade","petrol","gasoline","petroleum","cinnamon","ginger","nutmeg","almond","hazelnut","cashew","chestnut","honey","caramel"],
    "deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","cocoa","coffee","mocha","caramel"]
}

def aroma_group_frequency(aromas_list: List[str], group_name: str, grouped_aromas: Dict[str, List[str]]) -> float:
    """
    Compute the fraction of aromas in `aromas_list` that belong to a given aroma group.

    We normalize by len(aromas_list) so notes listing many aromas do not automatically get higher values.
    """
    if not aromas_list:
        return 0.0
    group_terms = set(grouped_aromas.get(group_name, []))
    count = 0
    for a in aromas_list:
        if a == group_name or a in group_terms:
            count += 1
    return count / len(aromas_list)


def add_grouped_aroma_features(df: pd.DataFrame, grouped_aromas: Dict[str, List[str]]) -> pd.DataFrame:
    """
    Add one numeric column per aroma group: frequency of group aromas in the note.

    Requires: df has an 'aromas' column.
    """
    df = df.copy()
    aromas_lists = df["aromas"].fillna("").astype(str).map(split_aromas)

    for group in grouped_aromas.keys():
        df[group] = aromas_lists.map(lambda lst, g=group: aroma_group_frequency(lst, g, grouped_aromas))

    return df


def report_dataset_health(df: pd.DataFrame, target_col: Optional[str] = None) -> None:
    """
    Quick sanity checks to help readers understand what data is being modeled.
    """
    print("Shape:", df.shape)
    display(df.head(3))

    missing = df.isna().mean().sort_values(ascending=False).head(15)
    print("\nTop missingness:")
    print(missing)

    if target_col and target_col in df.columns:
        print(f"\nTarget distribution ({target_col}):")
        print(df[target_col].value_counts(dropna=False))
        print(df[target_col].value_counts(normalize=True, dropna=False))

###Aromas

In [26]:
"""primary_aromas = {"floral":["blossom","elderflower","honeysuckle","jasmine","rose","violet"],
"green fruit":["apple","pear","gooseberry","grape"],
"citrus fruit":["grapefruit","lemon","lime","orange"],
"stone fruit":["peach","apricot","nectarine"],
"tropical fruit":["banana","lychee","mango","melon","passion fruit","pineapple"],
"red fruit":["redcurrant","cranberry","raspberry","strawberry","red cherry","red plum"],
"black fruit":["blackcurrant","blackberry","blueberry","black cherry","black plum"],
"herbaceous":["green bell pepper","grass","tomato leaf","asparagus"],
"herbal":["eucalyptus","mint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","tarragon","verbena"],
"spice":["black pepper","white pepper","liquorice","cinnamon"],
"fruit ripeness":["unripe","ripe","dried","cooked"],
"other":["simple","wet stones","candy","kirsch"]}

secondary_aromas = {"yeast (lees, autolysis, flor)":["biscuit","pastry","bread","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","coffee"]}

tertiary_aromas = {"red wine":["dried fruit (e.g. prune, raisin, fig)","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","mushroom","meat","tobacco","wet leaves","forest floor","caramel"],
"white wine":["dried fruit (e.g. dried apricot, raisin)","orange marmalade","petrol (gasoline)","cinnamon","ginger","nutmeg","almond","hazelnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","coffee","caramel"]}

grouped_aromas = {"flowers":["blossom","elderflower","honeysuckle","hawthorn","acacia","lily of the valley","basswood","jasmine","rose","violet","geranium","poppy","lavender","orange blossom","dried flowers","red flowers","white flowers"],
"green fruits":["apple","pear","quince","gooseberry","grape"],
"citrus fruits":["grapefruit","lemon","lime","orange","bergamot","blood orange","clementine","mandarin","pomelo"],
"stone fruits":["peach","yellow peach","white peach","apricot","nectarine","mirabelle plum"],
"tropical fruits":["banana","lychee","mango","melon","yellow melon","passion fruit","pineapple","guava","fig","papaya"],
"red fruits":["redcurrant","cranberry","raspberry","strawberry","red cherry","cherry","red plum","plum","pomegranate"],
"black fruits":["cassis","blackcurrant","blackberry","blueberry","black cherry","black plum","cherry","damson plum","plum"],
"herbaceous":["bell pepper","green bell pepper","grass","fern","tomato leaf","boxtree","asparagus"],
"herbs":["eucalyptus","mint","peppermint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","coriander","tarragon","anise","laurel","medicinal herbs"],
"spices":["pepper","black pepper","white pepper","green pepper","liquorice","cinnamon","saffron","cumin"],
"fruit ripeness":["unripe","ripe","over-ripe","dried","cooked","jam","compote"],
"other":["simple","wet stones","mineral","stone","salt","candy","turkish delight","kirsch","chambord","delicatessen","flint","silex","cardboard","tar","steel"],
"yeast (lees, autolysis, flor)":["biscuit","pastry","brioche","bread","toast","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese","goat cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","cocoa","coffee","mocha"],
"red wine":["dried fruit","prune","raisin","dried fig","date","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","dirt","mushroom","meat","blood","game","tobacco","wet leaves","forest floor","caramel"],
"white wine":["candied lemon","candied melon","candied orange","candied peach","dried fruit","dried apricot","dried mango","prune","raisin","dried fig","date","orange marmalade","petrol","gasoline","petroleum","cinnamon","ginger","nutmeg","almond","hazelnut","cashew","chestnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","cocoa","coffee","mocha","caramel"]}"""""

'primary_aromas = {"floral":["blossom","elderflower","honeysuckle","jasmine","rose","violet"],\n"green fruit":["apple","pear","gooseberry","grape"],\n"citrus fruit":["grapefruit","lemon","lime","orange"],\n"stone fruit":["peach","apricot","nectarine"],\n"tropical fruit":["banana","lychee","mango","melon","passion fruit","pineapple"],\n"red fruit":["redcurrant","cranberry","raspberry","strawberry","red cherry","red plum"],\n"black fruit":["blackcurrant","blackberry","blueberry","black cherry","black plum"],\n"herbaceous":["green bell pepper","grass","tomato leaf","asparagus"],\n"herbal":["eucalyptus","mint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","tarragon","verbena"],\n"spice":["black pepper","white pepper","liquorice","cinnamon"],\n"fruit ripeness":["unripe","ripe","dried","cooked"],\n"other":["simple","wet stones","candy","kirsch"]}\n\nsecondary_aromas = {"yeast (lees, autolysis, flor)":["biscuit","pastry","bread","toasted bread","bread dou

####Diff between our aromas and all aromas

In [27]:
"""
diff_aromas = []
for aroma in all_aromas_dict:
  if not any(aroma in grouped_aromas[key] for key in grouped_aromas) and (aroma not in grouped_aromas):
    diff_aromas.append(aroma)
diff_aromas"""

'\ndiff_aromas = []\nfor aroma in all_aromas_dict:\n  if not any(aroma in grouped_aromas[key] for key in grouped_aromas) and (aroma not in grouped_aromas):\n    diff_aromas.append(aroma)\ndiff_aromas'

In [28]:
# =========================
# 5) Modeling helpers
# =========================

def train_evaluate_classifier(
    X: pd.DataFrame,
    y: pd.Series,
    model=None,
    test_size: float = 0.25,
    random_state: int = RANDOM_STATE,
) -> Tuple[object, pd.DataFrame]:
    """
    Train/test split + fit + evaluation for quick iteration in notebooks.

    Returns
    -------
    model : fitted estimator
    metrics : pd.DataFrame with accuracy + f1_micro
    """
    if model is None:
        model = DecisionTreeClassifier(random_state=random_state)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="micro")

    print("Accuracy:", acc)
    print("F1 micro:", f1)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

    metrics = pd.DataFrame([{"accuracy": acc, "f1_micro": f1}])
    return model, metrics

In [29]:
# =========================
# 6) Example notebook flow (replace your old linear cells with this)
# =========================

# Load
export = load_flashwine_export("flashwine-default-rtdb-export.json")
notes = extract_all_tasting_notes(export)
df = build_tasting_notes_df(notes)

# Clean + encode + features
df = clean_tasting_notes_df(df)
df = encode_basic_fields(df)
df = add_grouped_aroma_features(df, GROUPED_AROMAS)

# =========================
# 7) Prepared dataset for experiments
# =========================

df_model = df.copy()

# Common columns you usually don't want as features
NON_FEATURE_COLS = {
    "wineColourId", "wineName", "wineSubColour", "varieties",
    "ownerUid", "photoAssetIds", "photoPaths", "tastingDate",
    "aromas", "flavours", "clean", "isVintage", "tanninNature", "year"
}

# Basic targets available
AVAILABLE_TARGETS = [c for c in ["wineColourId", "qualityLevelId"] if c in df_model.columns]
print("Available targets:", AVAILABLE_TARGETS)

# Sanity: ensure aroma-group columns exist
aroma_group_cols = list(GROUPED_AROMAS.keys())
missing_groups = [c for c in aroma_group_cols if c not in df_model.columns]
print("Missing aroma groups:", missing_groups)

"""# Inspect
report_dataset_health(df, target_col="wineColourId")

# Build colour dataset
target = df["wineColourId"]
feature_cols_to_drop = [
    "wineColourId", "wineName", "wineSubColour", "varieties",
    "ownerUid", "photoAssetIds", "photoPaths", "tastingDate",
    "aromas", "flavours", "clean", "isVintage", "tanninNature", "year"
]
X = df.drop(columns=[c for c in feature_cols_to_drop if c in df.columns], errors="ignore")
y = target

# Train baseline colour classifier
model, metrics = train_evaluate_classifier(X, y)

# Train aroma-only classifier
X_aroma = df[list(GROUPED_AROMAS.keys())]
model_aroma, metrics_aroma = train_evaluate_classifier(X_aroma, y)"""

Available targets: ['wineColourId', 'qualityLevelId']
Missing aroma groups: []


'# Inspect\nreport_dataset_health(df, target_col="wineColourId")\n\n# Build colour dataset\ntarget = df["wineColourId"]\nfeature_cols_to_drop = [\n    "wineColourId", "wineName", "wineSubColour", "varieties",\n    "ownerUid", "photoAssetIds", "photoPaths", "tastingDate",\n    "aromas", "flavours", "clean", "isVintage", "tanninNature", "year"\n]\nX = df.drop(columns=[c for c in feature_cols_to_drop if c in df.columns], errors="ignore")\ny = target\n\n# Train baseline colour classifier\nmodel, metrics = train_evaluate_classifier(X, y)\n\n# Train aroma-only classifier\nX_aroma = df[list(GROUPED_AROMAS.keys())]\nmodel_aroma, metrics_aroma = train_evaluate_classifier(X_aroma, y)'

# Guess wine colour

###ML - Guess colour from whole tasting notes

In [30]:
# =========================
# Experiment A: Wine colour from structured fields (+ grouped aromas if you want)
# =========================

target_col = "wineColourId"
df_colour = df_model.dropna(subset=[target_col]).copy()

# Option 1: Whole note (structured) but exclude free text fields
X_whole = df_colour.drop(columns=[c for c in (NON_FEATURE_COLS | {target_col}) if c in df_colour.columns], errors="ignore")
y_colour = df_colour[target_col]

# Ensure booleans are numeric (sklearn trees can handle bool, but being explicit helps)
for c in X_whole.select_dtypes(include=["bool"]).columns:
    X_whole[c] = X_whole[c].astype(int)

model_whole, metrics_whole = train_evaluate_classifier(X_whole, y_colour)
metrics_whole

Accuracy: 0.9464285714285714
F1 micro: 0.9464285714285714

Classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99        95
           1       0.95      0.95      0.95       117
           2       0.55      0.50      0.52        12

    accuracy                           0.95       224
   macro avg       0.83      0.82      0.82       224
weighted avg       0.94      0.95      0.95       224



Unnamed: 0,accuracy,f1_micro
0,0.946429,0.946429


###ML - Guess colour from aromas (grouped)

In [31]:
# Option 2: Aroma-group-only
X_aroma_only = df_colour[aroma_group_cols]
model_aroma_only, metrics_aroma_only = train_evaluate_classifier(X_aroma_only, y_colour)
metrics_aroma_only

Accuracy: 0.9330357142857143
F1 micro: 0.9330357142857143

Classification report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        95
           1       0.95      0.93      0.94       117
           2       0.70      0.58      0.64        12

    accuracy                           0.93       224
   macro avg       0.86      0.83      0.84       224
weighted avg       0.93      0.93      0.93       224



Unnamed: 0,accuracy,f1_micro
0,0.933036,0.933036


#Guess wine varieties

In [32]:
# =========================
# Experiment B1: Major variety classification (top-K) - improved
# =========================

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

_SPLIT_RE = re.compile(r"\s*[,;]\s*")
_PCT_RE = re.compile(r"\((\d+)\s*%\)")

def major_variety(s: str) -> str | None:
    """Pick the major variety from a blend string.
    If percentages exist, pick max percentage; else pick first token.
    """
    if s is None:
        return None
    raw = str(s).strip().lower()
    if not raw:
        return None

    parts = _SPLIT_RE.split(raw)
    scored: list[tuple[int | None, str]] = []

    for p in parts:
        p = p.strip()
        if not p:
            continue

        m = _PCT_RE.search(p)
        pct = int(m.group(1)) if m else None
        name = re.sub(r"\s*\(.*?\)\s*", "", p).strip()

        if name:
            scored.append((pct, name))

    if not scored:
        return None

    # choose max pct if present, else first
    if any(pct is not None for pct, _ in scored):
        scored2 = [(pct if pct is not None else -1, name) for pct, name in scored]
        return max(scored2, key=lambda t: t[0])[1]

    return scored[0][1]


# --- Build dataset
df_var = df_model.copy()

# Optional: remove sparkling (often blends / different style cues)
if "sparkling" in df_var.columns:
    df_var = df_var[df_var["sparkling"] == False].copy()

df_var["major_variety"] = df_var["varieties"].map(major_variety)
df_var = df_var.dropna(subset=["major_variety"]).copy()

TOP_K = 5
top_varieties = df_var["major_variety"].value_counts().head(TOP_K).index
df_var = df_var[df_var["major_variety"].isin(top_varieties)].copy()

print("Top varieties used:", list(top_varieties))
print(df_var["major_variety"].value_counts())

TARGET = "major_variety"

# Drop truly-non-feature columns + raw target sources
DROP_COLS = set(NON_FEATURE_COLS) | {"varieties", TARGET}
X = df_var.drop(columns=[c for c in DROP_COLS if c in df_var.columns], errors="ignore").copy()
y = df_var[TARGET].copy()

# Ensure bool -> int for models that don't like bool
for c in X.select_dtypes(include=["bool"]).columns:
    X[c] = X[c].astype(int)

# --- Stratified split (important for imbalanced classes)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y,
)

# --- Model
rf = RandomForestClassifier(
    n_estimators=600,
    random_state=RANDOM_STATE,
    class_weight="balanced_subsample",
)

rf.fit(X_train, y_train)
pred = rf.predict(X_test)

# --- Metrics (more informative than micro)
acc = accuracy_score(y_test, pred)
f1_w = f1_score(y_test, pred, average="weighted")
f1_m = f1_score(y_test, pred, average="macro")

print("Accuracy:", acc)
print("F1 weighted:", f1_w)
print("F1 macro:", f1_m)
print("\nClassification report:\n")
print(classification_report(y_test, pred))

Top varieties used: ['chardonnay', 'pinot noir', 'riesling', 'merlot', 'cabernet sauvignon']
major_variety
chardonnay            84
pinot noir            77
riesling              40
merlot                38
cabernet sauvignon    28
Name: count, dtype: int64
Accuracy: 0.7910447761194029
F1 weighted: 0.7758029672536021
F1 macro: 0.7218545278742476

Classification report:

                    precision    recall  f1-score   support

cabernet sauvignon       0.67      0.57      0.62         7
        chardonnay       0.81      1.00      0.89        21
            merlot       0.62      0.50      0.56        10
        pinot noir       0.82      0.95      0.88        19
          riesling       1.00      0.50      0.67        10

          accuracy                           0.79        67
         macro avg       0.78      0.70      0.72        67
      weighted avg       0.80      0.79      0.78        67



Idées de modélisation pour trouver cépage(s):


*   Réduction à détection du cépage majoritaire : changer target en "cépage majoritaire" = nouvelle colonne
*   Transformer en pbm de régression : output = représentation numérique bijective de l'assemblage
*   **Problème :** trop peu de données et trop de classes ==> réduire à x cépages pour tester, supprimer lignes qui n'ont pas ces cépages et all_varieties_dict contient ces cépages uniquement





#Guess country

In [33]:
!pip -q install pycountry

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [34]:
import pycountry

In [35]:
# =========================
# 8) Robust wineName parsing + country validation
# =========================

import unicodedata

_SPLIT_WINE_NAME_RE = re.compile(r"\s*,\s*")

def _norm_token(s: str) -> str:
    """Normalize for matching: lowercase, strip, remove accents, collapse spaces."""
    s = str(s or "").strip().lower()
    s = "".join(
        ch for ch in unicodedata.normalize("NFKD", s)
        if not unicodedata.combining(ch)
    )
    s = re.sub(r"\s+", " ", s)
    return s

# =========================
# ISO 3166-1 canonical country list via pycountry
# =========================

def build_iso_country_lookup() -> dict[str, str]:
    """
    Build normalized->canonical mapping from ISO 3166-1 country names.

    Canonical output uses pycountry's official .name.
    We also accept official_name/common_name and alpha-2/alpha-3 codes as inputs.
    """
    lookup: dict[str, str] = {}

    def add(name: str, canonical: str) -> None:
        k = _norm_token(name)
        if k and k not in lookup:
            lookup[k] = canonical

    for c in pycountry.countries:
        canonical = c.name
        add(c.name, canonical)

        if hasattr(c, "official_name"):
            add(getattr(c, "official_name"), canonical)
        if hasattr(c, "common_name"):
            add(getattr(c, "common_name"), canonical)

        # Optional: accept codes
        if hasattr(c, "alpha_2"):
            add(getattr(c, "alpha_2"), canonical)
        if hasattr(c, "alpha_3"):
            add(getattr(c, "alpha_3"), canonical)

    return lookup

ISO_COUNTRY_LOOKUP = build_iso_country_lookup()

# Keep an alias layer for dataset quirks / abbreviations / language variants
COUNTRY_ALIASES_ISO = {
    # formatting variants
    "new-zealand": "New Zealand",
    "turkey": "Türkiye",

    # abbreviations / common shortcuts
    "usa": "United States",
    "u.s.a.": "United States",
    "us": "United States",
    "u.s.": "United States",
    "uk": "United Kingdom",
    "u.k.": "United Kingdom",

    # some French variants (keep if relevant)
    "etats-unis": "United States",
    "états-unis": "United States",
    "allemagne": "Germany",
    "autriche": "Austria",
    "suisse": "Switzerland",
    "grece": "Greece",
    "grèce": "Greece",
    "afrique du sud": "South Africa",
    "nouvelle-zelande": "New Zealand",
    "nouvelle-zélande": "New Zealand",
}

def canonicalize_country(raw_country: str) -> str | None:
    """
    Canonicalize + validate using ISO 3166-1 (pycountry).
    Returns ISO canonical country name (pycountry .name) or None if unmatched.
    """
    if raw_country is None:
        return None
    raw_norm = _norm_token(raw_country)
    if not raw_norm:
        return None

    # Apply alias if any
    alias_target = COUNTRY_ALIASES_ISO.get(raw_norm, raw_country)
    alias_norm = _norm_token(alias_target)

    # Direct ISO match
    if alias_norm in ISO_COUNTRY_LOOKUP:
        return ISO_COUNTRY_LOOKUP[alias_norm]

    # Optional fuzzy fallback (comment out if you want strict matching)
    try:
        hit = pycountry.countries.search_fuzzy(alias_target)[0]
        return hit.name
    except Exception:
        return None

def parse_wine_name_loose(wine_name: str) -> dict | None:
    """
    Parse wineName where:
      - the first 4 commas define the first 5 fields
      - any remaining comma-separated fragments are appended to cuvée

    Expected minimal structure:
      "Country, Region, Appellation, Producer, Cuvée[, extra, extra2...]"

    Returns dict with fields or None if not parseable.
    """
    if wine_name is None:
        return None
    s = str(wine_name).strip()
    if not s:
        return None

    parts = [p.strip() for p in _SPLIT_WINE_NAME_RE.split(s) if p and p.strip()]
    if len(parts) < 5:
        return None

    country, region, appellation, producer = parts[0], parts[1], parts[2], parts[3]
    cuvee = ", ".join(parts[4:]).strip()

    if not (country and region and appellation and producer and cuvee):
        return None

    return {
        "country_raw": country,
        "region": region,
        "appellation": appellation,
        "producer": producer,
        "cuvee": cuvee,
    }

def add_wine_name_targets_validated(
    df: pd.DataFrame,
    validate_country: bool = True,
) -> pd.DataFrame:
    """
    Adds:
      - country_raw (from wineName)
      - country (canonical, validated) if validate_country else raw
      - region/appellation/producer/cuvee

    Rows that are not parseable get NaNs.
    """
    df = df.copy()

    parsed = df["wineName"].map(parse_wine_name_loose) if "wineName" in df.columns else None
    if parsed is None:
        for c in ["country_raw", "country", "region", "appellation", "producer", "cuvee"]:
            df[c] = np.nan
        return df

    df["country_raw"] = parsed.map(lambda p: p["country_raw"] if p else np.nan)
    df["region"] = parsed.map(lambda p: p["region"] if p else np.nan)
    df["appellation"] = parsed.map(lambda p: p["appellation"] if p else np.nan)
    df["producer"] = parsed.map(lambda p: p["producer"] if p else np.nan)
    df["cuvee"] = parsed.map(lambda p: p["cuvee"] if p else np.nan)

    if validate_country:
        df["country"] = df["country_raw"].map(canonicalize_country)
    else:
        df["country"] = df["country_raw"]

    return df

def show_unmatched_countries(df_with_targets: pd.DataFrame, top_n: int = 50) -> pd.Series:
    """
    Helpful audit: see which raw countries failed validation.
    """
    bad = df_with_targets[
        df_with_targets["country_raw"].notna()
        & df_with_targets["country"].isna()
    ]
    return bad["country_raw"].value_counts().head(top_n)

In [36]:
# =========================
# Experiment C: Guess canonical country (validated) from note features
# =========================

df_country = add_wine_name_targets_validated(df_model, validate_country=True)

# Optional audit: see what failed validation and update CANON_COUNTRIES / COUNTRY_ALIASES
print("Top unmatched extracted countries (raw):")
print(show_unmatched_countries(df_country, top_n=30))

# Keep only rows with validated country
df_country = df_country.dropna(subset=["country"]).copy()

TOP_K = 10
top_countries = df_country["country"].value_counts().head(TOP_K).index
df_country = df_country[df_country["country"].isin(top_countries)].copy()

print("\nCountries used:", list(top_countries))
print(df_country["country"].value_counts())

TARGET = "country"

# Prevent leakage: remove wineName + parsed targets (and any other too-informative fields you decide)
DROP_COLS = set(NON_FEATURE_COLS) | {
    "wineName",
    "country_raw", "country", "region", "appellation", "producer", "cuvee",
}

X = df_country.drop(columns=[c for c in DROP_COLS if c in df_country.columns], errors="ignore").copy()
y = df_country[TARGET].copy()

for c in X.select_dtypes(include=["bool"]).columns:
    X[c] = X[c].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y,
)

model = RandomForestClassifier(
    n_estimators=800,
    random_state=RANDOM_STATE,
    class_weight="balanced_subsample",
)

model.fit(X_train, y_train)
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print("F1 weighted:", f1_score(y_test, pred, average="weighted"))
print("F1 macro:", f1_score(y_test, pred, average="macro"))
print("\nClassification report:\n")
print(classification_report(y_test, pred))

Top unmatched extracted countries (raw):
Series([], Name: count, dtype: int64)

Countries used: ['France', 'Spain', 'Portugal', 'Italy', 'United States', 'Germany', 'Argentina', 'Greece', 'South Africa', 'Chile']
country
France           592
Spain             52
Portugal          40
Italy             38
United States     33
Germany           12
Greece            11
Argentina         11
South Africa      10
Chile              7
Name: count, dtype: int64
Accuracy: 0.7425742574257426
F1 weighted: 0.6512987669029294
F1 macro: 0.14081301351709516

Classification report:

               precision    recall  f1-score   support

    Argentina       0.00      0.00      0.00         3
        Chile       0.00      0.00      0.00         2
       France       0.75      0.99      0.85       148
      Germany       0.00      0.00      0.00         3
       Greece       0.00      0.00      0.00         3
        Italy       1.00      0.10      0.18        10
     Portugal       0.50      0.30      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#Guess region

In [37]:
# get data

#Guess wine vintage



In [38]:
# to guess the vintage, we have to guess the age of the wine when it has been tasted so we need to compute the difference between the tasting date and the vintage of the wine