<a href="https://colab.research.google.com/github/Perciii/WineBlindTasting/blob/main/ml_blind_tasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Déterminer un vin à partir de son analyse

In [74]:
from google.colab import drive

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="white")

Upload json file from extracted database

In [75]:
import json
from google.colab import files
#uploaded = files.upload()

Load json in dict

In [76]:
with open('flashwine-default-rtdb-export.json') as json_file:
    data = json.load(json_file)

all_tasting_notes = []
for key in data['users']:
  for tasting_note_id in data['users'][key]['tastingNotes']:
    all_tasting_notes.append(data['users'][key]['tastingNotes'][tasting_note_id])

In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DICT TO DF & cleaning

In [78]:
tasting_notes_df = pd.DataFrame.from_dict(all_tasting_notes)
tasting_notes_df = tasting_notes_df.drop(columns=['id', 'isPrivate', 'tastingName', 'groupId', 'bottleAgeingComment', 'otherObservations', 'qualityComment'])
# Remove rows where "wine name" contains either "test" or "blind" or if aromas or flavours are empty
tasting_notes_df = tasting_notes_df[~tasting_notes_df['wineName'].str.contains("(?i)test")]
tasting_notes_df = tasting_notes_df[~tasting_notes_df['wineName'].str.contains("(?i)blind")]
tasting_notes_df = tasting_notes_df[~tasting_notes_df['aromas'].str.contains("(?i)aromas")]
tasting_notes_df = tasting_notes_df[~tasting_notes_df['flavours'].str.contains("(?i)flavours")]
tasting_notes_df.columns

Index(['acidity', 'alcohol', 'appearanceIntensity', 'aromas', 'bodyLevel',
       'bottleAgeingSuitable', 'clean', 'drinkNow', 'finish',
       'flavourIntensity', 'flavours', 'isVintage', 'noseIntensity',
       'qualityLevel', 'sparkling', 'sweetness', 'tannin', 'tanninNature',
       'tastingDate', 'varieties', 'wineColour', 'wineName', 'wineSubColour',
       'year', 'ownerUid', 'photoAssetIds', 'photoPaths'],
      dtype='object')

# Data preprocessing

In [79]:
# Remove rows where clean = False
tasting_notes_df = tasting_notes_df[tasting_notes_df['clean'] == True]

Preprocess data

In [80]:
## Function to translate from list of aromas to binary number to decimal number
## From a note's aromas, add a colum with the corresponding decimal number
# --> note done yet

# Process flavours --> not done yet

# Process wineColour --> translate to scale
wine_colours = {"red": 0, "white": 1, "rosé": 2}
tasting_notes_df['wineColourId'] = tasting_notes_df.apply(lambda row: wine_colours[row['wineColour']], axis=1)
tasting_notes_df.drop(['wineColour'], axis=1, inplace=True)

# Process qualityLevel --> translate to scale
wine_quality = {"poor": 0, "acceptable": 0.25, "good": 0.5, "very good": 0.75, "outstanding": 1}
tasting_notes_df['qualityLevelId'] = tasting_notes_df.apply(lambda row: wine_quality[row['qualityLevel']], axis=1)
tasting_notes_df.drop(['qualityLevel'], axis=1, inplace=True)

###Aromas

In [81]:
# Process aromas
## Retrieve all aromas in a collection (set + list?)
all_aromas = tasting_notes_df['aromas'].str.lower()
#all_aromas = all_aromas.str.split(r'(?:,|;)\s*').dropna()
all_aromas = all_aromas.str.split(r'[(?:,|;)]\s*')
#all_aromas = all_aromas.replace('', np.nan, inplace=True)
all_aromas = all_aromas.dropna()
all_aromas = all_aromas.to_numpy()

all_aromas_unique = np.unique(sum(all_aromas, []))
all_aromas_unique = np.unique(np.char.strip(all_aromas_unique))
all_aromas_unique = np.delete(all_aromas_unique, np.where(all_aromas_unique == ''))

all_aromas_dict = {k: v for v, k in enumerate(all_aromas_unique)}
print(all_aromas_dict)

{np.str_('acacia'): 0, np.str_('acetaldehyde'): 1, np.str_('almond'): 2, np.str_('anise'): 3, np.str_('apple'): 4, np.str_('apple jam'): 5, np.str_('apple pie'): 6, np.str_('apple skin'): 7, np.str_('applesauce'): 8, np.str_('apricot'): 9, np.str_('apricot jam'): 10, np.str_('asparagus'): 11, np.str_('back cherry'): 12, np.str_('baked apple'): 13, np.str_('balsamic vinegar'): 14, np.str_('banana'): 15, np.str_('basswood'): 16, np.str_('bell pepper'): 17, np.str_('bergamot'): 18, np.str_('biscuit'): 19, np.str_('biscuit rose de reims'): 20, np.str_('black cherry'): 21, np.str_('black currant'): 22, np.str_('black fruits'): 23, np.str_('black olive'): 24, np.str_('black pepper'): 25, np.str_('black plum'): 26, np.str_('black tea'): 27, np.str_('blackberry'): 28, np.str_('blackberry jam'): 29, np.str_('blackcurrant'): 30, np.str_('blood'): 31, np.str_('blood orange'): 32, np.str_('blossom'): 33, np.str_('blue plum'): 34, np.str_('blueberry'): 35, np.str_('boxtree'): 36, np.str_('bread'): 

####Create a dict of grouped aromas

In [82]:
primary_aromas = {"floral":["blossom","elderflower","honeysuckle","jasmine","rose","violet"],
"green fruit":["apple","pear","gooseberry","grape"],
"citrus fruit":["grapefruit","lemon","lime","orange"],
"stone fruit":["peach","apricot","nectarine"],
"tropical fruit":["banana","lychee","mango","melon","passion fruit","pineapple"],
"red fruit":["redcurrant","cranberry","raspberry","strawberry","red cherry","red plum"],
"black fruit":["blackcurrant","blackberry","blueberry","black cherry","black plum"],
"herbaceous":["green bell pepper","grass","tomato leaf","asparagus"],
"herbal":["eucalyptus","mint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","tarragon","verbena"],
"spice":["black pepper","white pepper","liquorice","cinnamon"],
"fruit ripeness":["unripe","ripe","dried","cooked"],
"other":["simple","wet stones","candy","kirsch"]}

secondary_aromas = {"yeast (lees, autolysis, flor)":["biscuit","pastry","bread","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","coffee"]}

tertiary_aromas = {"red wine":["dried fruit (e.g. prune, raisin, fig)","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","mushroom","meat","tobacco","wet leaves","forest floor","caramel"],
"white wine":["dried fruit (e.g. dried apricot, raisin)","orange marmalade","petrol (gasoline)","cinnamon","ginger","nutmeg","almond","hazelnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","coffee","caramel"]}

grouped_aromas = {"flowers":["blossom","elderflower","honeysuckle","hawthorn","acacia","lily of the valley","basswood","jasmine","rose","violet","geranium","poppy","lavender","orange blossom","dried flowers","red flowers","white flowers"],
"green fruits":["apple","pear","quince","gooseberry","grape"],
"citrus fruits":["grapefruit","lemon","lime","orange","bergamot","blood orange","clementine","mandarin","pomelo"],
"stone fruits":["peach","yellow peach","white peach","apricot","nectarine","mirabelle plum"],
"tropical fruits":["banana","lychee","mango","melon","yellow melon","passion fruit","pineapple","guava","fig","papaya"],
"red fruits":["redcurrant","cranberry","raspberry","strawberry","red cherry","cherry","red plum","plum","pomegranate"],
"black fruits":["cassis","blackcurrant","blackberry","blueberry","black cherry","black plum","cherry","damson plum","plum"],
"herbaceous":["bell pepper","green bell pepper","grass","fern","tomato leaf","boxtree","asparagus"],
"herbs":["eucalyptus","mint","peppermint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","coriander","tarragon","anise","laurel","medicinal herbs"],
"spices":["pepper","black pepper","white pepper","green pepper","liquorice","cinnamon","saffron","cumin"],
"fruit ripeness":["unripe","ripe","over-ripe","dried","cooked","jam","compote"],
"other":["simple","wet stones","mineral","stone","salt","candy","turkish delight","kirsch","chambord","delicatessen","flint","silex","cardboard","tar","steel"],
"yeast (lees, autolysis, flor)":["biscuit","pastry","brioche","bread","toast","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese","goat cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","cocoa","coffee","mocha"],
"red wine":["dried fruit","prune","raisin","dried fig","date","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","dirt","mushroom","meat","blood","game","tobacco","wet leaves","forest floor","caramel"],
"white wine":["candied lemon","candied melon","candied orange","candied peach","dried fruit","dried apricot","dried mango","prune","raisin","dried fig","date","orange marmalade","petrol","gasoline","petroleum","cinnamon","ginger","nutmeg","almond","hazelnut","cashew","chestnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","cocoa","coffee","mocha","caramel"]}

####Diff between our aromas and all aromas

In [83]:
diff_aromas = []
for aroma in all_aromas_dict:
  if not any(aroma in grouped_aromas[key] for key in grouped_aromas) and (aroma not in grouped_aromas):
    diff_aromas.append(aroma)
diff_aromas

[np.str_('apple jam'),
 np.str_('apple pie'),
 np.str_('apple skin'),
 np.str_('applesauce'),
 np.str_('apricot jam'),
 np.str_('back cherry'),
 np.str_('baked apple'),
 np.str_('balsamic vinegar'),
 np.str_('biscuit rose de reims'),
 np.str_('black currant'),
 np.str_('black olive'),
 np.str_('black tea'),
 np.str_('blackberry jam'),
 np.str_('blue plum'),
 np.str_('bruised apple'),
 np.str_('cabbage'),
 np.str_('cacao'),
 np.str_('candied apricot'),
 np.str_('cardamom'),
 np.str_('cashew nut'),
 np.str_('celery'),
 np.str_('chalk'),
 np.str_('chambord liqueur'),
 np.str_('chamomile'),
 np.str_('cheese rind'),
 np.str_('cherry jam'),
 np.str_('cherry pie'),
 np.str_('chicory'),
 np.str_('cider'),
 np.str_('clove'),
 np.str_('coal'),
 np.str_('cooked apple'),
 np.str_('cooked blackberry'),
 np.str_('cooked cherry'),
 np.str_('cooked pear'),
 np.str_('cooked strawberry'),
 np.str_('cranberries'),
 np.str_('crème brûlée'),
 np.str_('curry'),
 np.str_('dates'),
 np.str_('dough'),
 np.str_

### Aromas to number

In [84]:
# Function to create decimal number from aromas in tasting note
def aromas_to_number(tasting_note_aromas, all_aromas_dict):
  aromas_binary = [0 for i in range(len(all_aromas_dict))]
  for aroma in tasting_note_aromas:
    aromas_binary[all_aromas_dict[aroma]] = 1
  return int("".join(str(x) for x in aromas_binary), 2)

print(aromas_to_number(['acacia', 'anise'], all_aromas_dict))

139872160240252493106201257291293361278253292411754045530130493501201108534775011868672


In [85]:
# Function to create a dict with frequency of aromas in each group
def computeFrequency(row, aroma_group):
  row_aromas = row['aromas'].lower()
  row_aromas = row_aromas.split(",")
  row_aromas = [aroma.strip() for aroma in row_aromas]

  nb = 0
  for aroma in row_aromas:
    if aroma in grouped_aromas[aroma_group] or aroma == aroma_group:
      nb += 1
  #return nb/(len(grouped_aromas[aroma_group])+1)
  return nb/len(row_aromas)

#frequency
for aroma_group in grouped_aromas:
  tasting_notes_df[aroma_group] = tasting_notes_df.apply(lambda row : computeFrequency(row, aroma_group), axis=1)

# Guess wine colour

In [86]:
# New DF
tasting_to_colour_df = tasting_notes_df.drop(columns=['clean', 'isVintage', 'tanninNature', 'varieties', 'wineName', 'wineSubColour', 'year', 'ownerUid', 'photoAssetIds', 'photoPaths', 'tastingDate'])

##ML - classification for colour based on whole notes

###ML - Guess colour from what whole tasting notes

In [87]:
# Metrics for Evaluation of model Accuracy and F1-score
from sklearn.metrics import f1_score,accuracy_score

#Importing the Decision Tree from scikit-learn library
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# For splitting of data into train and test set
from sklearn.model_selection import train_test_split

In [88]:
#first we split our data into input and output
# y is the output and is stored in "wineColourId" column of dataframe
# X contains the other columns and are features or input
y = tasting_to_colour_df.wineColourId
tasting_to_colour_df.drop(['wineColourId'], axis=1, inplace=True)
tasting_to_colour_df.drop(['aromas'], axis=1, inplace=True)
tasting_to_colour_df.drop(['flavours'], axis=1, inplace=True)
X = tasting_to_colour_df

In [89]:
print(X)

     acidity  alcohol  appearanceIntensity  bodyLevel  bottleAgeingSuitable  \
1          3        1                    1          3                  True   
2          3        1                    1          3                  True   
3          4        1                    1          3                  True   
4          1        1                    1          2                 False   
5          2        1                    2          4                  True   
..       ...      ...                  ...        ...                   ...   
982        2        2                    0          4                  True   
983        4        2                    1          3                  True   
984        4        2                    0          4                 False   
986        2        2                    0          4                 False   
987        2        2                    0          4                  True   

     drinkNow  finish  flavourIntensity  noseIntens

In [90]:
# Now we split the dataset in train and test part
# here the train set is 75% and test set is 25%

# Drop non-numerical columns that are not features for the model
#X = X.drop(columns=['ownerUid', 'photoAssetIds', 'photoPaths', 'tastingDate'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
print("X train :" + str(len(X_train)))
print("X test :" + str(len(X_test)))
print("y train :" + str(len(y_train)))
print("y test :" + str(len(y_test)))

# Training the model is as simple as this
# Use the function imported above and apply fit() on it
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)

# We use the predict() on the model to predict the output
pred=DT.predict(X_test)

# Print pred and test
print("Test:")
print(y_test)
print("Pred:")
print(pred)

# for classification we use accuracy and F1 score
print(accuracy_score(y_test,pred))
print(f1_score(y_test,pred,average='micro'))

X train :662
X test :221
y train :662
y test :221
Test:
266    1
916    1
357    0
69     0
554    0
      ..
118    0
814    0
711    1
362    1
370    1
Name: wineColourId, Length: 221, dtype: int64
Pred:
[2 1 0 0 0 1 1 1 1 0 1 1 0 0 1 1 1 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 1 0
 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 0 0 2 0 1 0 0 1 0 0 0 1 1 0 1 1 2 1 0
 1 0 1 0 1 0 1 2 1 1 1 0 2 1 0 1 2 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 0
 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 2 1 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 1 1 2
 1 1 2 2 0 0 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 2 1 1 0 0 0 1 1 0 0 1 1 0 1 1 0
 1 1 1 1 0 2 0 0 0 2 1 2 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 1 0 0 1 1 1]
0.9592760180995475
0.9592760180995475


###ML - Guess colour from aromas (grouped)

In [91]:
# Select only aromas columns in training set
X = tasting_to_colour_df[grouped_aromas.keys()]
print(X.columns)
# Now we split the dataset in train and test part
# here the train set is 75% and test set is 25%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
print("X train :" + str(len(X_train)))
print("X test :" + str(len(X_test)))
print("y train :" + str(len(y_train)))
print("y test :" + str(len(y_test)))

# Training the model is as simple as this
# Use the function imported above and apply fit() on it
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)

# We use the predict() on the model to predict the output
pred=DT.predict(X_test)

# Print pred and test
print("Test:")
print(y_test)
print(X_test)
print("Pred:")
print(pred)

# for classification we use accuracy and F1 score
print(accuracy_score(y_test,pred))
print(f1_score(y_test,pred,average='micro'))

Index(['flowers', 'green fruits', 'citrus fruits', 'stone fruits',
       'tropical fruits', 'red fruits', 'black fruits', 'herbaceous', 'herbs',
       'spices', 'fruit ripeness', 'other', 'yeast (lees, autolysis, flor)',
       'malolactic conversion', 'oak', 'red wine', 'white wine',
       'deliberately oxidised wines'],
      dtype='object')
X train :662
X test :221
y train :662
y test :221
Test:
266    1
916    1
357    0
69     0
554    0
      ..
118    0
814    0
711    1
362    1
370    1
Name: wineColourId, Length: 221, dtype: int64
      flowers  green fruits  citrus fruits  stone fruits  tropical fruits  \
266  0.000000      0.000000       0.500000           0.0         0.000000   
916  0.166667      0.166667       0.333333           0.0         0.000000   
357  0.000000      0.000000       0.000000           0.0         0.000000   
69   0.000000      0.000000       0.000000           0.0         0.000000   
554  0.000000      0.000000       0.000000           0.0         

#Guess wine varieties

In [92]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# New DF
tasting_to_varieties_df = tasting_notes_df.drop(columns=['clean', 'isVintage', 'tanninNature', 'wineName', 'wineSubColour', 'year', 'ownerUid', 'photoAssetIds', 'photoPaths', 'tastingDate'])
tasting_to_varieties_df.columns

Index(['acidity', 'alcohol', 'appearanceIntensity', 'aromas', 'bodyLevel',
       'bottleAgeingSuitable', 'drinkNow', 'finish', 'flavourIntensity',
       'flavours', 'noseIntensity', 'sparkling', 'sweetness', 'tannin',
       'varieties', 'wineColourId', 'qualityLevelId', 'flowers',
       'green fruits', 'citrus fruits', 'stone fruits', 'tropical fruits',
       'red fruits', 'black fruits', 'herbaceous', 'herbs', 'spices',
       'fruit ripeness', 'other', 'yeast (lees, autolysis, flor)',
       'malolactic conversion', 'oak', 'red wine', 'white wine',
       'deliberately oxidised wines'],
      dtype='object')

##Data formatting & preprocessing

###Varieties

In [93]:
# Process varieties
## Retrieve all varieties in a collection (set + list?)
all_varieties = tasting_to_varieties_df['varieties'].str.lower()
all_varieties = all_varieties.str.split(r'[(?:,|;)]\s*')
all_varieties = all_varieties.dropna()
all_varieties = all_varieties.to_numpy()

all_varieties_unique = np.unique(sum(all_varieties, []))
all_varieties_unique = np.unique(np.char.strip(all_varieties_unique))
all_varieties_unique = np.delete(all_varieties_unique, np.where(all_varieties_unique == ''))

all_varieties_dict = {k: v for v, k in enumerate(all_varieties_unique)}
print(all_varieties_dict)

{np.str_('25%'): 0, np.str_('50%'): 1, np.str_('75%'): 2, np.str_('abouriou'): 3, np.str_('albariño'): 4, np.str_('albillo'): 5, np.str_('alicante'): 6, np.str_('alicante bouschet'): 7, np.str_('aligoté'): 8, np.str_('altesse'): 9, np.str_('alvarinho'): 10, np.str_('antão vaz'): 11, np.str_('arbane'): 12, np.str_('arinto'): 13, np.str_('assyrtiko'): 14, np.str_('avesso'): 15, np.str_('baga'): 16, np.str_('biancu ghjentile'): 17, np.str_('bical'): 18, np.str_('bourboulenc'): 19, np.str_('brancellao'): 20, np.str_('brustianu'): 21, np.str_('cabernet franc'): 22, np.str_('cabernet sauvignon'): 23, np.str_('caiño tinto'): 24, np.str_('carcaghjolu biancu'): 25, np.str_('carignan'): 26, np.str_('carignan rouge'): 27, np.str_('cariñena'): 28, np.str_('carmenère'): 29, np.str_('carménère'): 30, np.str_('carricante'): 31, np.str_('chardonnay'): 32, np.str_('chenin'): 33, np.str_('chenin blanc'): 34, np.str_('cinsault'): 35, np.str_('cinsaut'): 36, np.str_('clairette'): 37, np.str_('clairette bl

In [94]:
import re

# Translate varieties to list of numbers
# Function to create a dict with frequency of aromas in each group
def computeVarieties(row):
  # The 'varieties' column in tasting_to_varieties_df now contains single variety names
  # e.g., 'pinot noir (75%)', 'chardonnay', or None, due to previous steps.
  variety_string = row['varieties']

  if variety_string is None:
    return [] # Return an empty list for consistency, or handle as NaN later if needed

  # Clean the variety string by removing percentages in parentheses and stripping whitespace
  # This makes it compatible with keys in all_varieties_dict
  cleaned_variety = re.sub(r'\s*\(.*\)', '', variety_string.lower()).strip()

  if cleaned_variety in all_varieties_dict:
    return [all_varieties_dict[cleaned_variety]]
  else:
    # If the cleaned variety is still not in the dictionary, it's an unmapped variety.
    # We might want to filter these out or assign a special ID.
    # For now, return an empty list or a default value.
    print(f"Warning: Cleaned variety '{cleaned_variety}' not found in all_varieties_dict. Original: '{variety_string}'")
    return []

#replace varieties with corresponding number in all_varieties_dict
tasting_to_varieties_df['varieties'] = tasting_to_varieties_df.apply(lambda row : computeVarieties(row), axis=1)
tasting_to_varieties_df['varieties'].head()



Unnamed: 0,varieties
1,[163]
2,[32]
3,[]
4,[145]
5,[]


Idées de modélisation pour trouver cépage(s):


*   Réduction à détection du cépage majoritaire : changer target en "cépage majoritaire" = nouvelle colonne
*   Transformer en pbm de régression : output = représentation numérique bijective de l'assemblage
*   **Problème :** trop peu de données et trop de classes ==> réduire à x cépages pour tester, supprimer lignes qui n'ont pas ces cépages et all_varieties_dict contient ces cépages uniquement





In [95]:
# Keep only first variety
def keepOnlyFirstVariety(row):
  return row['varieties'][0] if len(row['varieties'])>0 else None

tasting_to_varieties_df['varieties'] = tasting_to_varieties_df.apply(lambda row : keepOnlyFirstVariety(row), axis=1)
tasting_to_varieties_df['varieties']

Unnamed: 0,varieties
1,163.0
2,32.0
3,
4,145.0
5,
...,...
982,
983,78.0
984,
986,


In [96]:
# Compute number of tasting notes by variety & keep top 5 varieties
most_used_varieties = tasting_to_varieties_df.groupby(['varieties'])['varieties'].count().sort_values(ascending=False).head(5).index
most_used_varieties

Index([32.0, 135.0, 141.0, 117.0, 150.0], dtype='float64', name='varieties')

In [97]:
tasting_to_varieties_df.varieties

Unnamed: 0,varieties
1,163.0
2,32.0
3,
4,145.0
5,
...,...
982,
983,78.0
984,
986,


In [98]:
# Filter dataframe to keep only notes using top 5 varieties
tasting_to_varieties_df = tasting_to_varieties_df[tasting_to_varieties_df['varieties'].isin(most_used_varieties)]
tasting_to_varieties_df.varieties

Unnamed: 0,varieties
2,32.0
14,32.0
21,32.0
31,135.0
32,135.0
...,...
972,141.0
973,141.0
974,141.0
977,32.0


In [99]:
# Remove sparkling
tasting_to_varieties_df = tasting_to_varieties_df[tasting_to_varieties_df['sparkling'] == False]

In [100]:
#first we split our data into input and output
# y is the output and is stored in "Class" column of dataframe
# X contains the other columns and are features or input
y = tasting_to_varieties_df.varieties
#mlb = MultiLabelBinarizer(classes=list(all_varieties_dict.values()))
#y = pd.DataFrame(mlb.fit_transform(tasting_to_varieties_df.varieties), columns=mlb.classes_, index=tasting_to_varieties_df.index)
tasting_to_varieties_df.drop(['varieties'], axis=1, inplace=True)
tasting_to_varieties_df.drop(['aromas'], axis=1, inplace=True)
tasting_to_varieties_df.drop(['flavours'], axis=1, inplace=True)
X = tasting_to_varieties_df

In [101]:
def predict_ml(X, y, model):
  match model:
    case "random_forest":
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, shuffle=True)
      clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
    case "neural_network":
      X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=1, shuffle=True)
      clf = MLPClassifier(solver='lbfgs', random_state=1, max_iter=3000).fit(X_train, y_train)

  print("X train length :", len(X_train))
  print("X test length :", len(X_test))
  print("y train length :", len(y_train))
  print("y test length :", len(y_test))

  print("X train: \n", X_train)
  print("Y train:\n", y_train)

  pred = clf.predict(X_test)

  # Print pred and test
  print("Y Test:\n", y_test.values)
  print("Pred:\n", pred)

  # Scores
  print("Score train:", clf.score(X_train, y_train))
  print("Accuracy test:", accuracy_score(y_test,pred))
  print("F1 score test:", f1_score(y_test,pred,average='weighted'))

In [102]:
predict_ml(X,y,"neural_network")

X train length : 169
X test length : 57
y train length : 169
y test length : 57
X train: 
      acidity  alcohol  appearanceIntensity  bodyLevel  bottleAgeingSuitable  \
474        3        1                    1          3                 False   
719        4        1                    0          3                  True   
313        3        1                    0          3                  True   
830        0        0                    1          1                 False   
865        2        1                    2          3                 False   
..       ...      ...                  ...        ...                   ...   
693        3        1                    0          3                 False   
261        3        1                    0          2                  True   
402        4        1                    0          3                  True   
867        1        1                    1          1                 False   
450        2        1                   

#Guess country

In [103]:
# get data

#Guess region

In [104]:
# get data

#Guess wine vintage



In [105]:
# to guess the vintage, we have to guess the age of the wine when it has been tasted so we need to compute the difference between the tasting date and the vintage of the wine