In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [3]:
df = pd.read_csv('../data/df_nonans.csv', index_col = 0)

In [4]:
def custom_tokenizer(text):
    '''
    used to filter out unwanted words, punctuation, and so on
    '''
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [57]:
with open('../models/bern/bow.p', 'rb') as f:
    bow = pickle.load(f)

with open('../models/bern/m.p', 'rb') as f:
    m = pickle.load(f)

In [90]:
query = []

In [91]:
style = input('Do you prefer sweet or dry wine: ')

Do you prefer sweet or dry wine: dry


In [92]:
if style == 'sweet':
    query.append('sweet')
    query.append('sweetness')
    query.append('sugar')
elif style == 'dry':
    query.append('dry')
    query.append('savory')
    query.append('bitter')

In [93]:
print('Which best describes your preferred wine style:')
print('1. Light and refreshing')
print('2. Smooth and balanced')
print('3. Full and rich')
style_2 = input('Enter 1, 2, or 3: ')

Which best describes your preferred wine style:
1. Light and refreshing
2. Smooth and balanced
3. Full and rich
Enter 1, 2, or 3: 1


In [94]:
if style_2 == '1':
    query.append('light')
    query.append('refreshing')
    query.append('bright')
    query.append('racy')
elif style_2 == '2':
    query.append('smooth')
    query.append('balanced')
    query.append('polished')
elif style_2 == '3':
    query.append('full')
    query.append('rich')
    query.append('dense')
else:
    style_2 = input('Enter 1, 2, or 3: ')
    #continue

In [95]:
appealing = input('Which is more appealing fruity or earthy: ')

Which is more appealing fruity or earthy: earthy


In [96]:
if appealing == 'fruity':
    query.append('fruity')
    query.append('fruitiness')
    query.append('jam')
    query.append('jammy')
elif appealing == 'earthy':
    query.append('earthy')
    query.append('earth')
    query.append('soil')
    query.append('minerality')
    query.append('graphite')
    
else:
    appealing = input('Which is more appealing fruity or earthy: ')

In [97]:
flavor = input('Do you like flavors that remind you of caramel, toast and spice (yes or no)? ')

Do you like flavors that remind you of caramel, toast and spice (yes or no)? yes


In [98]:
if flavor == 'yes':
    query.append('carmel')
    query.append('toast')
    query.append('spice')
    query.append('oak')
    query.append('oaky')
    query.append('smokey')
    query.append('toasty')
    query.append('smoke')
    query.append('cedar')
    query.append('tea')

In [99]:
query

['dry',
 'savory',
 'bitter',
 'light',
 'refreshing',
 'bright',
 'racy',
 'earthy',
 'earth',
 'soil',
 'minerality',
 'graphite',
 'carmel',
 'toast',
 'spice',
 'oak',
 'oaky',
 'smokey',
 'toasty',
 'smoke',
 'cedar',
 'tea']

In [100]:
query = [' '.join(query)]

In [101]:
query

['dry savory bitter light refreshing bright racy earthy earth soil minerality graphite carmel toast spice oak oaky smokey toasty smoke cedar tea']

In [102]:
m.predict(bow.transform(query))

array(['Pinot Noir'], dtype='<U29')

In [103]:
m.predict_proba(bow.transform(query)).max()

0.8496554683880507

In [35]:
m.predict_proba(bow.transform(query))

array([[0.01034809, 0.00470898, 0.00546808, 0.00850742, 0.03661328,
        0.00288867, 0.0527767 , 0.00791998, 0.00848632, 0.00407253,
        0.0617867 , 0.00700638, 0.00873859, 0.00977464, 0.01394966,
        0.00379503, 0.00402035, 0.00756899, 0.04817632, 0.15379753,
        0.00396906, 0.00266176, 0.00829603, 0.05450378, 0.00431853,
        0.2936543 , 0.01981345, 0.02120059, 0.00299243, 0.0362555 ,
        0.00483388, 0.02129705, 0.00860254, 0.00550886, 0.0064714 ,
        0.01670488, 0.02851172]])

In [21]:
m.classes_ 

array(['Barbera', 'Bordeaux-style Red Blend',
       'Bordeaux-style White Blend', 'Cabernet Franc',
       'Cabernet Sauvignon', 'Carmenère', 'Chardonnay', 'Chenin Blanc',
       'Corvina, Rondinella, Molinara', 'Gamay', 'Gewürztraminer',
       'Grenache', 'Grüner Veltliner', 'Malbec', 'Merlot', 'Nebbiolo',
       'Petite Sirah', 'Pinot Grigio', 'Pinot Gris', 'Pinot Noir', 'Port',
       'Portuguese Red', 'Portuguese White', 'Red Blend',
       'Rhône-style Red Blend', 'Riesling', 'Rosé', 'Sangiovese',
       'Sangiovese Grosso', 'Sauvignon Blanc', 'Shiraz', 'Syrah',
       'Tempranillo', 'Tempranillo Blend', 'Viognier', 'White Blend',
       'Zinfandel'], dtype='<U29')

In [104]:
df_pred = pd.DataFrame(m.predict_proba(bow.transform(query)).T, index=m.classes_ ,columns=['probability'])

In [105]:
df_pred.sort_values(by='probability', ascending=False)

Unnamed: 0,probability
Pinot Noir,0.8496555
Barbera,0.05726378
Red Blend,0.03012012
Syrah,0.02123221
Cabernet Franc,0.01568268
Cabernet Sauvignon,0.0103637
Tempranillo,0.004463769
Merlot,0.003777201
Tempranillo Blend,0.003011224
Grenache,0.001249015
