<a href="https://colab.research.google.com/github/Perciii/WineBlindTasting/blob/main/ml_blind_tasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Déterminer un vin à partir de son analyse

In [None]:
from google.colab import drive

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="white")

Upload json file from extracted database

In [None]:
import json
from google.colab import files
#uploaded = files.upload()

Load json in dict

In [None]:
# JSON is UTF-8 by specification; being explicit keeps accented values
# (e.g. the "rosé" colour label) intact whatever the platform default is.
with open('flashwine-default-rtdb-export.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Flatten every user's tasting notes into one list of note dicts.
# A user node without a 'tastingNotes' child is skipped instead of raising KeyError.
all_tasting_notes = []
for user in data['users'].values():
  all_tasting_notes.extend((user.get('tastingNotes') or {}).values())

DICT TO DF & cleaning

In [None]:
tasting_notes_df = pd.DataFrame.from_dict(all_tasting_notes)
tasting_notes_df = tasting_notes_df.drop(columns=['id', 'isPrivate', 'tastingName', 'groupId', 'bottleAgeingComment', 'otherObservations', 'qualityComment'])

# Rows to discard:
#  - the wine name mentions "test" or "blind" (case-insensitive);
#  - the aromas / flavours text matches its own field name.
# NOTE(review): the original comment describes the second rule as "aromas or
# flavours are empty" - presumably the field label is stored as a placeholder
# for an empty field; confirm against the app.
# `na=` makes every mask strictly boolean, so `~` / `|` cannot fail on a missing
# value: a missing wine name is not a test note, whereas missing aromas or
# flavours are treated as empty and discarded.
is_test_note = tasting_notes_df['wineName'].str.contains("(?i)test", na=False)
is_blind_note = tasting_notes_df['wineName'].str.contains("(?i)blind", na=False)
has_no_aromas = tasting_notes_df['aromas'].str.contains("(?i)aromas", na=True)
has_no_flavours = tasting_notes_df['flavours'].str.contains("(?i)flavours", na=True)

tasting_notes_df = tasting_notes_df[~(is_test_note | is_blind_note | has_no_aromas | has_no_flavours)]
tasting_notes_df.columns

Index(['acidity', 'alcohol', 'appearanceIntensity', 'aromas', 'bodyLevel',
       'bottleAgeingSuitable', 'clean', 'drinkNow', 'finish',
       'flavourIntensity', 'flavours', 'isVintage', 'noseIntensity',
       'qualityLevel', 'sparkling', 'sweetness', 'tannin', 'tanninNature',
       'varieties', 'wineColour', 'wineName', 'wineSubColour', 'year'],
      dtype='object')

# Data preprocessing

In [None]:
# Keep only the notes whose 'clean' flag is True
is_clean = tasting_notes_df['clean'].eq(True)
tasting_notes_df = tasting_notes_df.loc[is_clean]

Preprocess data

In [None]:
## Function to translate from list of aromas to binary number to decimal number
## From a note's aromas, add a column with the corresponding decimal number
# --> not done yet

# Process flavours --> not done yet

# Label -> numeric scale for the two categorical columns
wine_colours = {"red": 0, "white": 1, "rosé": 2}
wine_quality = {"poor": 0, "acceptable": 0.25, "good": 0.5, "very good": 0.75, "outstanding": 1}

# Series.map with the dict's __getitem__ is a per-value lookup that still raises
# KeyError on an unexpected label (mapping with the dict itself would silently
# produce NaN). Unlike a row-wise DataFrame.apply it works on a single column,
# and the result is a new frame instead of an in-place mutation.
tasting_notes_df = (
    tasting_notes_df
    .assign(
        wineColourId=tasting_notes_df['wineColour'].map(wine_colours.__getitem__),
        qualityLevelId=tasting_notes_df['qualityLevel'].map(wine_quality.__getitem__),
    )
    .drop(columns=['wineColour', 'qualityLevel'])
)

### Aromas

In [None]:
# Process aromas
## Build the vocabulary of every distinct aroma found in the tasting notes
all_aromas = tasting_notes_df['aromas'].str.lower()
# NOTE(review): the pattern is a character class, so besides ',' and ';' it also
# splits on '(', '?', ':', '|' and ')'. Left unchanged because it defines the
# current vocabulary - confirm whether splitting on parentheses is intended.
all_aromas = all_aromas.str.split(r'[(?:,|;)]\s*')
all_aromas = all_aromas.dropna()
all_aromas = all_aromas.to_numpy()

# Strip every token, drop the empty ones and de-duplicate in a single pass.
# (sum(lists, []) re-copied the accumulated list for every note and the numpy
# string helpers failed when there was no note at all.)
# all_aromas_unique is now a sorted list of plain str.
all_aromas_unique = sorted(
    {aroma.strip() for note_aromas in all_aromas for aroma in note_aromas} - {''}
)

# aroma name -> position in the sorted vocabulary
all_aromas_dict = {aroma: position for position, aroma in enumerate(all_aromas_unique)}
print(all_aromas_dict)

{'acacia': 0, 'almond': 1, 'anise': 2, 'apple': 3, 'apple jam': 4, 'apple pie': 5, 'applesauce': 6, 'apricot': 7, 'banana': 8, 'basswood': 9, 'bell pepper': 10, 'bergamot': 11, 'biscuit': 12, 'biscuit rose de reims': 13, 'black cherry': 14, 'black fruits': 15, 'black olive': 16, 'black pepper': 17, 'blackberry': 18, 'blackcurrant': 19, 'blood': 20, 'blood orange': 21, 'blossom': 22, 'blueberry': 23, 'boxtree': 24, 'bread': 25, 'brioche': 26, 'bruised apple': 27, 'butter': 28, 'cacao': 29, 'candied lemon': 30, 'candied melon': 31, 'candied orange': 32, 'candied peach': 33, 'candy': 34, 'caramel': 35, 'cardboard': 36, 'cassis': 37, 'cedar': 38, 'chambord': 39, 'cherry': 40, 'chestnut': 41, 'chicory': 42, 'chocolate': 43, 'cider': 44, 'cinnamon': 45, 'citrus fruits': 46, 'clementine': 47, 'cloves': 48, 'cocoa': 49, 'coconut': 50, 'coffee': 51, 'cranberry': 52, 'cream': 53, 'crème brûlée': 54, 'damson plum': 55, 'delicatessen': 56, 'dried apricot': 57, 'dried fig': 58, 'dried flowers': 59,

#### Create a dict of grouped aromas

In [None]:
# Reference lists mapping an aroma cluster to its aromas, split into primary,
# secondary and tertiary aromas. These three dicts are not read by the other
# cells visible in this notebook.
primary_aromas = {"floral":["blossom","elderflower","honeysuckle","jasmine","rose","violet"],
"green fruit":["apple","pear","gooseberry","grape"],
"citrus fruit":["grapefruit","lemon","lime","orange"],
"stone fruit":["peach","apricot","nectarine"],
"tropical fruit":["banana","lychee","mango","melon","passion fruit","pineapple"],
"red fruit":["redcurrant","cranberry","raspberry","strawberry","red cherry","red plum"],
"black fruit":["blackcurrant","blackberry","blueberry","black cherry","black plum"],
"herbaceous":["green bell pepper","grass","tomato leaf","asparagus"],
"herbal":["eucalyptus","mint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","tarragon","verbena"],
"spice":["black pepper","white pepper","liquorice","cinnamon"],
"fruit ripeness":["unripe","ripe","dried","cooked"],
"other":["simple","wet stones","candy","kirsch"]}

secondary_aromas = {"yeast (lees, autolysis, flor)":["biscuit","pastry","bread","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","coffee"]}

tertiary_aromas = {"red wine":["dried fruit (e.g. prune, raisin, fig)","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","mushroom","meat","tobacco","wet leaves","forest floor","caramel"],
"white wine":["dried fruit (e.g. dried apricot, raisin)","orange marmalade","petrol (gasoline)","cinnamon","ginger","nutmeg","almond","hazelnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","coffee","caramel"]}

# Extended mapping group name -> aromas. This is the dict the notebook actually
# uses: its keys become the per-group frequency columns and its values decide
# which group(s) an aroma counts for.
# An aroma may belong to several groups (e.g. "cherry" and "plum" are listed
# under both "red fruits" and "black fruits"), so it can count for each of them.
# NOTE(review): "verbena" is listed in primary_aromas["herbal"] but not in
# grouped_aromas["herbs"] - confirm whether that omission is intended.
grouped_aromas = {"flowers":["blossom","elderflower","honeysuckle","hawthorn","acacia","lily of the valley","basswood","jasmine","rose","violet","geranium","poppy","lavender","orange blossom","dried flowers","red flowers","white flowers"],
"green fruits":["apple","pear","quince","gooseberry","grape"],
"citrus fruits":["grapefruit","lemon","lime","orange","bergamot","blood orange","clementine","mandarin","pomelo"],
"stone fruits":["peach","yellow peach","white peach","apricot","nectarine","mirabelle plum"],
"tropical fruits":["banana","lychee","mango","melon","yellow melon","passion fruit","pineapple","guava","fig","papaya"],
"red fruits":["redcurrant","cranberry","raspberry","strawberry","red cherry","cherry","red plum","plum","pomegranate"],
"black fruits":["cassis","blackcurrant","blackberry","blueberry","black cherry","black plum","cherry","damson plum","plum"],
"herbaceous":["bell pepper","green bell pepper","grass","fern","tomato leaf","boxtree","asparagus"],
"herbs":["eucalyptus","mint","peppermint","fennel","dill","dried herbs","thyme","oregano","rosemary","sage","parsley","basil","coriander","tarragon","anise","laurel","medicinal herbs"],
"spices":["pepper","black pepper","white pepper","green pepper","liquorice","cinnamon","saffron","cumin"],
"fruit ripeness":["unripe","ripe","over-ripe","dried","cooked","jam","compote"],
"other":["simple","wet stones","mineral","stone","salt","candy","turkish delight","kirsch","chambord","delicatessen","flint","silex","cardboard","tar","steel"],
"yeast (lees, autolysis, flor)":["biscuit","pastry","brioche","bread","toast","toasted bread","bread dough","cheese","yogurt","acetaldehyde"],
"malolactic conversion":["butter","cream","cheese","goat cheese"],
"oak":["vanilla","cloves","coconut","cedar","charred wood","smoke","chocolate","cocoa","coffee","mocha"],
"red wine":["dried fruit","prune","raisin","dried fig","date","cooked fruit (e.g. cooked plum, cooked cherry)","leather","earth","dirt","mushroom","meat","blood","game","tobacco","wet leaves","forest floor","caramel"],
"white wine":["candied lemon","candied melon","candied orange","candied peach","dried fruit","dried apricot","dried mango","prune","raisin","dried fig","date","orange marmalade","petrol","gasoline","petroleum","cinnamon","ginger","nutmeg","almond","hazelnut","cashew","chestnut","honey","caramel"],
"deliberately oxidised wines":["almond","hazelnut","walnut","chocolate","cocoa","coffee","mocha","caramel"]}

#### Diff between our aromas and all aromas

In [None]:
# Aromas found in the notes that are neither a group name nor listed in any group
diff_aromas = [
    aroma
    for aroma in all_aromas_dict
    if aroma not in grouped_aromas
    and all(aroma not in members for members in grouped_aromas.values())
]
diff_aromas

['apple jam',
 'apple pie',
 'applesauce',
 'biscuit rose de reims',
 'black olive',
 'bruised apple',
 'cacao',
 'chicory',
 'cider',
 'crème brûlée',
 'dust',
 'forest raspberry',
 'fresh strawberry',
 'hay',
 'juicy strawberry',
 'marzipan',
 'molasses',
 'nail polish remover',
 'orange zest',
 'resin',
 'ripe cherry',
 'roasted pineapple',
 'scrubland',
 'sesame',
 'strawberry jam',
 'turpentine',
 'verbena',
 'white chocolate',
 'white flower',
 'yellow apple',
 'yellow plum']

### Aromas to number

In [None]:
# Function to create decimal number from aromas in tasting note
def aromas_to_number(tasting_note_aromas, all_aromas_dict):
  """Encode the aromas of a tasting note as a single integer bit mask.

  The vocabulary is read as a binary number with one digit per entry, the
  aroma at index 0 being the most significant digit. A digit is 1 when the
  corresponding aroma is present in the note.

  Args:
    tasting_note_aromas: iterable of aroma names found in the note.
    all_aromas_dict: maps each aroma name to its index; indices are assumed
      to be 0..len(all_aromas_dict)-1, as produced by enumerate().

  Returns:
    The mask as an int. 0 when no aroma is set, including for an empty
    vocabulary (which previously raised ValueError from int('', 2)).

  Raises:
    KeyError: if an aroma is not part of all_aromas_dict.
  """
  highest_bit = len(all_aromas_dict) - 1
  number = 0
  for aroma in tasting_note_aromas:
    # OR keeps a repeated aroma from being counted twice
    number |= 1 << (highest_bit - all_aromas_dict[aroma])
  return number

print(aromas_to_number(['acacia', 'anise'], all_aromas_dict))

1870722095783555735300716585876842265159593655009280


In [None]:
# Function to create a dict with frequency of aromas in each group
def computeFrequency(row, aroma_group, groups=None):
  """Share of a note's aromas that belong to one aroma group.

  Args:
    row: mapping (e.g. a DataFrame row) whose 'aromas' entry is a string of
      aroma names separated by ',' or ';'.
    aroma_group: name of the group to measure; must be a key of `groups`.
    groups: optional mapping group name -> list of aromas. Defaults to the
      notebook-level `grouped_aromas`.

  Returns:
    Number of aromas in the note that are listed in the group (or equal to
    the group name itself) divided by the number of aromas in the note.
    0.0 when the note has no aroma (empty or missing text).

  Raises:
    KeyError: if aroma_group is not a key of `groups`.
  """
  if groups is None:
    groups = grouped_aromas

  raw_aromas = row['aromas']
  if not isinstance(raw_aromas, str):
    return 0.0

  # Accept ';' as well as ',' - the aroma vocabulary is split on both - and
  # ignore empty tokens (e.g. a trailing separator) so they do not inflate
  # the denominator.
  row_aromas = [aroma.strip() for aroma in raw_aromas.lower().replace(";", ",").split(",")]
  row_aromas = [aroma for aroma in row_aromas if aroma]
  if not row_aromas:
    return 0.0

  group_members = groups[aroma_group]
  nb = sum(1 for aroma in row_aromas if aroma in group_members or aroma == aroma_group)
  return nb/len(row_aromas)

# Add one frequency column per aroma group
for aroma_group in grouped_aromas:
  tasting_notes_df[aroma_group] = tasting_notes_df.apply(computeFrequency, axis=1, args=(aroma_group,))

# Guess wine colour

In [None]:
# Frame used for colour prediction: the tasting notes minus the columns listed below
tasting_to_colour_df = tasting_notes_df.drop(
    labels=['clean', 'isVintage', 'tanninNature', 'varieties', 'wineName', 'wineSubColour', 'year'],
    axis='columns',
)

## ML - classification for colour based on whole notes

### ML - Guess colour from whole tasting notes

In [None]:
# Metrics for Evaluation of model Accuracy and F1-score
from sklearn.metrics import f1_score,accuracy_score

#Importing the Decision Tree from scikit-learn library
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# For splitting of data into train and test set
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into target and features.
# y is the target, stored in the "wineColourId" column of the dataframe.
# X holds the other columns, minus the raw 'aromas' and 'flavours' text.
# tasting_to_colour_df itself is left untouched so this cell can be re-run:
# the former in-place drops raised KeyError on a second execution.
y = tasting_to_colour_df['wineColourId']
X = tasting_to_colour_df.drop(columns=['wineColourId', 'aromas', 'flavours'])

In [None]:
print(X)

     acidity  alcohol  appearanceIntensity  bodyLevel  bottleAgeingSuitable  \
1          3        1                    1          3                  True   
2          3        1                    1          3                  True   
3          4        1                    1          3                  True   
4          1        1                    1          2                 False   
5          2        1                    2          4                  True   
..       ...      ...                  ...        ...                   ...   
341        4        1                    1          2                  True   
342        2        1                    0          1                  True   
343        4        1                    1          0                  True   
344        2        1                    0          2                  True   
345        4        1                    0          0                  True   

     drinkNow  finish  flavourIntensity  noseIntens

In [None]:
# Split the dataset into a train part (75%) and a test part (25%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
print("X train :" + str(len(X_train)))
print("X test :" + str(len(X_test)))
print("y train :" + str(len(y_train)))
print("y test :" + str(len(y_test)))

# Fit a decision tree on the train part.
# The tree is seeded as well as the split, so that Restart & Run All
# reproduces the same model and the same scores.
DT = DecisionTreeClassifier(random_state=2)
DT.fit(X_train,y_train)

# Predict the colour of the held-out notes
pred=DT.predict(X_test)

# Print expected and predicted labels
print("Test:")
print(y_test)
print("Pred:")
print(pred)

# Classification metrics: accuracy and F1 score.
# NOTE(review): with one label per sample, micro-averaged F1 is the same number
# as accuracy - consider average='macro' if per-class behaviour matters.
print(accuracy_score(y_test,pred))
print(f1_score(y_test,pred,average='micro'))

X train :196
X test :66
y train :196
y test :66
Test:
113    0
90     1
127    1
57     1
4      1
      ..
20     0
304    1
336    0
139    1
27     1
Name: wineColourId, Length: 66, dtype: int64
Pred:
[0 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 2 0 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0
 0 1 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 2 1 0 0 0 0 1 0 1 1]
1.0
1.0


### ML - Guess colour from aromas (grouped)

In [None]:
# Keep only the aroma-group frequency columns as features.
# A list is passed as indexer rather than a dict_keys view.
# y is the colour target defined for the previous model.
X = tasting_to_colour_df[list(grouped_aromas)]
print(X.columns)

# Split the dataset into a train part (75%) and a test part (25%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
print("X train :" + str(len(X_train)))
print("X test :" + str(len(X_test)))
print("y train :" + str(len(y_train)))
print("y test :" + str(len(y_test)))

# Fit a decision tree on the train part.
# The tree is seeded as well as the split, so that Restart & Run All
# reproduces the same model and the same scores.
DT = DecisionTreeClassifier(random_state=2)
DT.fit(X_train,y_train)

# Predict the colour of the held-out notes
pred=DT.predict(X_test)

# Print expected labels, test features and predicted labels
print("Test:")
print(y_test)
print(X_test)
print("Pred:")
print(pred)

# Classification metrics: accuracy and F1 score.
# NOTE(review): with one label per sample, micro-averaged F1 is the same number
# as accuracy - consider average='macro' if per-class behaviour matters.
print(accuracy_score(y_test,pred))
print(f1_score(y_test,pred,average='micro'))

Index(['flowers', 'green fruits', 'citrus fruits', 'stone fruits',
       'tropical fruits', 'red fruits', 'black fruits', 'herbaceous', 'herbs',
       'spices', 'fruit ripeness', 'other', 'yeast (lees, autolysis, flor)',
       'malolactic conversion', 'oak', 'red wine', 'white wine',
       'deliberately oxidised wines'],
      dtype='object')
X train :196
X test :66
y train :196
y test :66
Test:
113    0
90     1
127    1
57     1
4      1
      ..
20     0
304    1
336    0
139    1
27     1
Name: wineColourId, Length: 66, dtype: int64
      flowers  green fruits  citrus fruits  stone fruits  tropical fruits  \
113  0.000000      0.000000       0.000000      0.000000         0.000000   
90   0.250000      0.000000       0.250000      0.250000         0.000000   
127  0.000000      0.285714       0.142857      0.000000         0.000000   
57   0.000000      0.166667       0.000000      0.333333         0.166667   
4    0.000000      0.166667       0.000000      0.166667         0.1

# Guess wine varieties

In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Frame used for variety prediction: the tasting notes minus the columns listed
# below ('varieties' is kept, it is processed in the following cells)
tasting_to_varieties_df = tasting_notes_df.drop(
    labels=['clean', 'isVintage', 'tanninNature', 'wineName', 'wineSubColour', 'year'],
    axis='columns',
)
tasting_to_varieties_df.columns

Index(['acidity', 'alcohol', 'appearanceIntensity', 'aromas', 'bodyLevel',
       'bottleAgeingSuitable', 'drinkNow', 'finish', 'flavourIntensity',
       'flavours', 'noseIntensity', 'sparkling', 'sweetness', 'tannin',
       'varieties', 'wineColourId', 'qualityLevelId', 'flowers',
       'green fruits', 'citrus fruits', 'stone fruits', 'tropical fruits',
       'red fruits', 'black fruits', 'herbaceous', 'herbs', 'spices',
       'fruit ripeness', 'other', 'yeast (lees, autolysis, flor)',
       'malolactic conversion', 'oak', 'red wine', 'white wine',
       'deliberately oxidised wines'],
      dtype='object')

## Data formatting & preprocessing

### Varieties

In [None]:
# Process varieties
## Build the vocabulary of every distinct variety found in the tasting notes
all_varieties = tasting_to_varieties_df['varieties'].str.lower()
# NOTE(review): the pattern is a character class, so besides ',' and ';' it also
# splits on '(', '?', ':', '|' and ')'. Left unchanged because it defines the
# current vocabulary - confirm whether splitting on parentheses is intended.
all_varieties = all_varieties.str.split(r'[(?:,|;)]\s*')
all_varieties = all_varieties.dropna()
all_varieties = all_varieties.to_numpy()

# Strip every token, drop the empty ones and de-duplicate in a single pass.
# (sum(lists, []) re-copied the accumulated list for every note and the numpy
# string helpers failed when there was no note at all.)
# all_varieties_unique is now a sorted list of plain str.
all_varieties_unique = sorted(
    {variety.strip() for note_varieties in all_varieties for variety in note_varieties} - {''}
)

# variety name -> position in the sorted vocabulary
all_varieties_dict = {variety: position for position, variety in enumerate(all_varieties_unique)}
print(all_varieties_dict)

{'alicante': 0, 'alicante bouschet': 1, 'aligoté': 2, 'altesse': 3, 'alvarinho': 4, 'avesso': 5, 'biancu ghjentile': 6, 'brancellao': 7, 'brustianu': 8, 'cabernet franc': 9, 'cabernet sauvignon': 10, 'caiño tinto': 11, 'carcaghjolu biancu': 12, 'carignan': 13, 'carmenère': 14, 'carricante': 15, 'chardonnay': 16, 'chenin': 17, 'chenin blanc': 18, 'cinsault': 19, 'cinsaut': 20, 'cortese': 21, 'cualtacciu': 22, 'cudiverta': 23, 'côt': 24, 'ferrol': 25, 'furmint': 26, 'gamay': 27, 'garnacha tinta': 28, 'genovese': 29, 'gewurztraminer': 30, 'ghjenuvese': 31, 'glera': 32, 'godello': 33, 'graciano': 34, 'greco': 35, 'grenache': 36, 'grenache blanc': 37, 'grenache noir': 38, 'loureiro': 39, 'macabeo': 40, 'malbec': 41, 'malvasia puntinata': 42, 'marsanne': 43, 'mauzac': 44, 'mazuela': 45, 'melon de bourgogne': 46, 'merlot': 47, 'meunier': 48, 'monastrell': 49, 'mourvèdre': 50, 'muscadelle': 51, 'muscaris': 52, 'muscat': 53, 'muscat blanc à petits grains': 54, 'muscat à petits grains blancs': 5

In [None]:
# Translate the varieties text of a note into a list of variety ids
def computeVarieties(row, varieties_dict=None):
  """Convert the 'varieties' text of a tasting note into a list of ids.

  Args:
    row: mapping (e.g. a DataFrame row) whose 'varieties' entry is a string
      of variety names separated by ',' or ';'.
    varieties_dict: optional mapping lower-case variety name -> id. Defaults
      to the notebook-level `all_varieties_dict`.

  Returns:
    The ids of the varieties in the order they are written in the note;
    an empty list when the text is empty or missing.

  Raises:
    KeyError: if a variety name is not a key of `varieties_dict`.
  """
  if varieties_dict is None:
    varieties_dict = all_varieties_dict

  raw_varieties = row['varieties']
  if not isinstance(raw_varieties, str):
    return []

  # Accept ';' as well as ',' - the variety vocabulary is split on both -
  # and skip empty tokens such as the one left by a trailing separator.
  names = (name.strip() for name in raw_varieties.lower().replace(";", ",").split(","))
  return [varieties_dict[name] for name in names if name]

# Replace each note's varieties text with the list of matching ids from all_varieties_dict
tasting_to_varieties_df['varieties'] = tasting_to_varieties_df.apply(computeVarieties, axis=1)
tasting_to_varieties_df['varieties'].head()

Unnamed: 0,varieties
1,[91]
2,[16]
3,"[16, 71, 70]"
4,[76]
5,"[10, 47, 9, 66]"


Idées de modélisation pour trouver cépage(s):


*   Réduction à la détection du cépage majoritaire : remplacer la cible par le « cépage majoritaire » (nouvelle colonne)
*   Transformer en problème de régression : sortie = représentation numérique bijective de l'assemblage
*   **Problème :** trop peu de données et trop de classes ==> se limiter à x cépages pour tester, supprimer les lignes qui ne contiennent pas ces cépages et restreindre all_varieties_dict à ces cépages uniquement





In [None]:
# Keep only first variety
def keepOnlyFirstVariety(row):
  """Return the first entry of row['varieties'], or None when that list is empty."""
  varieties = row['varieties']
  if len(varieties) > 0:
    return varieties[0]
  return None

# Reduce each list of variety ids to its first element
tasting_to_varieties_df['varieties'] = tasting_to_varieties_df.apply(keepOnlyFirstVariety, axis=1)
tasting_to_varieties_df['varieties']

Unnamed: 0,varieties
1,91.0
2,16.0
3,16.0
4,76.0
5,10.0
...,...
341,73.0
342,73.0
343,73.0
344,73.0


In [None]:
# Count the tasting notes per variety id, sort the counts in descending order
# and keep the ids (the index of the result) of the 5 most frequent varieties
most_used_varieties = tasting_to_varieties_df.groupby(['varieties'])['varieties'].count().sort_values(ascending=False).head(5).index
most_used_varieties

Index([16.0, 73.0, 71.0, 79.0, 47.0], dtype='float64', name='varieties')

In [None]:
tasting_to_varieties_df.varieties

Unnamed: 0,varieties
1,91.0
2,16.0
3,16.0
4,76.0
5,10.0
...,...
341,73.0
342,73.0
343,73.0
344,73.0


In [None]:
# Restrict the notes to those whose variety is one of the most used ones
in_top_varieties = tasting_to_varieties_df['varieties'].isin(most_used_varieties)
tasting_to_varieties_df = tasting_to_varieties_df.loc[in_top_varieties]
tasting_to_varieties_df.varieties

Unnamed: 0,varieties
2,16.0
3,16.0
11,47.0
12,71.0
13,16.0
...,...
341,73.0
342,73.0
343,73.0
344,73.0


In [None]:
# Keep only the notes whose 'sparkling' flag is False
is_still_wine = tasting_to_varieties_df['sparkling'].eq(False)
tasting_to_varieties_df = tasting_to_varieties_df.loc[is_still_wine]

In [None]:
# Split the data into target and features.
# y is the target, stored in the "varieties" column of the dataframe.
# X holds the other columns, minus the raw 'aromas' and 'flavours' text.
# tasting_to_varieties_df itself is left untouched so this cell can be re-run:
# the former in-place drops raised KeyError on a second execution.
y = tasting_to_varieties_df['varieties']
#mlb = MultiLabelBinarizer(classes=list(all_varieties_dict.values()))
#y = pd.DataFrame(mlb.fit_transform(tasting_to_varieties_df.varieties), columns=mlb.classes_, index=tasting_to_varieties_df.index)
X = tasting_to_varieties_df.drop(columns=['varieties', 'aromas', 'flavours'])

In [None]:
def predict_ml(X, y, model):
  match model:
    case "random_forest":
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1, shuffle=True)
      clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
    case "neural_network":
      X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=1, shuffle=True)
      clf = MLPClassifier(solver='lbfgs', random_state=1, max_iter=3000).fit(X_train, y_train)

  print("X train length :", len(X_train))
  print("X test length :", len(X_test))
  print("y train length :", len(y_train))
  print("y test length :", len(y_test))

  print("X train: \n", X_train)
  print("Y train:\n", y_train)

  pred = clf.predict(X_test)

  # Print pred and test
  print("Y Test:\n", y_test.values)
  print("Pred:\n", pred)

  # Scores
  print("Score train:", clf.score(X_train, y_train))
  print("Accuracy test:", accuracy_score(y_test,pred))
  print("F1 score test:", f1_score(y_test,pred,average='weighted'))

In [None]:
predict_ml(X,y,"neural_network")

X train length : 78
X test length : 27
y train length : 78
y test length : 27
X train: 
      acidity  alcohol  appearanceIntensity  bodyLevel  bottleAgeingSuitable  \
11         3        2                    2          4                  True   
239        3        1                    0          3                  True   
57         4        1                    1          2                  True   
142        4        1                    1          3                  True   
51         4        1                    0          0                  True   
..       ...      ...                  ...        ...                   ...   
303        4        1                    0          1                 False   
258        4        1                    0          3                  True   
342        2        1                    0          1                  True   
315        4        1                    0          1                  True   
2          3        1                    1

# Guess country

In [None]:
# get data

# Guess region

In [None]:
# get data

# Guess wine vintage



In [None]:
# To guess the vintage, we first have to guess the age of the wine when it was tasted, so we need to compute the difference between the tasting date and the vintage of the wine