In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [4]:
df_red = pd.read_csv('../data/df_red.csv', index_col = 0)
df_w_r = pd.read_csv('../data/df_w_r.csv', index_col = 0)
df_white = pd.read_csv('../data/df_white.csv', index_col = 0)
df_all = pd.read_csv('../data/df_nonans.csv', index_col = 0)

In [5]:
def custom_tokenizer(text):
    '''
    used to filter out unwanted words, punctuation, and so on
    '''
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [52]:
with open('../models/random_forest/bow_all_rf.p', 'rb') as f:
    bow_all = pickle.load(f)
with open('../models/random_forest/bow_red_rf.p', 'rb') as f:
    bow_red = pickle.load(f)
with open('../models/random_forest/bow_w_r_rf.p', 'rb') as f:
    bow_w_r = pickle.load(f)
with open('../models/random_forest/bow_white_rf.p', 'rb') as f:
    bow_white = pickle.load(f)

In [53]:
with open('../models/random_forest/m_all_rf.p', 'rb') as f:
    m_all = pickle.load(f)
with open('../models/random_forest/m_red_rf.p', 'rb') as f:
    m_red = pickle.load(f)
with open('../models/random_forest/m_w_r_rf.p', 'rb') as f:
    m_w_r = pickle.load(f)
with open('../models/random_forest/m_white_rf.p', 'rb') as f:
    m_white = pickle.load(f)

In [61]:
query = []
model = []

In [62]:
wine_type = input('Would you like red, white or either: ')

Would you like red, white or either: red


In [63]:
if wine_type == 'red':
    model.append('red')
elif wine_type == 'white':
    rose = input('Should we include Rosé wine too? ')
    if rose == 'yes': 
        model.append('white_rose')
    elif rose == 'no':
        model.append('white')
elif wine_type == 'either':
    model.append('all')
else:
    wine_type = input('Would you like red, white or either:')

In [64]:
model

['red']

In [65]:
style = input('Do you prefer sweet or dry wine: ')

Do you prefer sweet or dry wine: dry


In [66]:
if style == 'sweet':
    query.append('sweet')
    query.append('sweetness')
    query.append('sugar')
elif style == 'dry':
    query.append('dry')
    query.append('savory')
    query.append('bitter')

In [67]:
print('Which best describes your preferred wine style:')
print('1. Light and refreshing')
print('2. Smooth and balanced')
print('3. Full and rich')
style_2 = input('Enter 1, 2, or 3: ')

Which best describes your preferred wine style:
1. Light and refreshing
2. Smooth and balanced
3. Full and rich
Enter 1, 2, or 3: 2


In [68]:
if style_2 == '1':
    query.append('light')
    query.append('refreshing')
    query.append('bright')
    query.append('racy')
elif style_2 == '2':
    query.append('smooth')
    query.append('balanced')
    query.append('polished')
elif style_2 == '3':
    query.append('full')
    query.append('rich')
    query.append('dense')
else:
    style_2 = input('Enter 1, 2, or 3: ')
    #continue

In [69]:
appealing = input('Which is more appealing fruity or earthy: ')

Which is more appealing fruity or earthy: earthy


In [70]:
if appealing == 'fruity':
    query.append('fruity')
    query.append('fruitiness')
    query.append('jam')
    query.append('jammy')
elif appealing == 'earthy':
    query.append('earthy')
    query.append('earth')
    query.append('soil')
    query.append('minerality')
    query.append('graphite')
    
else:
    appealing = input('Which is more appealing fruity or earthy: ')

In [71]:
flavor = input('Do you like flavors that remind you of caramel, toast and spice (yes or no)? ')

Do you like flavors that remind you of caramel, toast and spice (yes or no)? yes


In [72]:
if flavor == 'yes':
    query.append('carmel')
    query.append('toast')
    query.append('spice')
    query.append('oak')
    query.append('oaky')
    query.append('smokey')
    query.append('toasty')
    query.append('smoke')
    query.append('cedar')
    query.append('tea')

In [73]:
query

['dry',
 'savory',
 'bitter',
 'smooth',
 'balanced',
 'polished',
 'earthy',
 'earth',
 'soil',
 'minerality',
 'graphite',
 'carmel',
 'toast',
 'spice',
 'oak',
 'oaky',
 'smokey',
 'toasty',
 'smoke',
 'cedar',
 'tea']

In [74]:
query = [' '.join(query)]

In [75]:
query

['dry savory bitter smooth balanced polished earthy earth soil minerality graphite carmel toast spice oak oaky smokey toasty smoke cedar tea']

In [76]:
model

['red']

In [77]:
if model[0] == 'all':
    m = m_all
    bow = bow_all
elif model[0] == 'red':
    m = m_red
    bow = bow_red
elif model[0] == 'white':
    m = m_white
    bow = bow_white
elif model[0] == 'white_rose':
    m = m_w_r
    bow = bow_w_r

In [78]:
m.predict(bow.transform(query))

array(['Pinot Noir'], dtype=object)

In [79]:
m.predict_proba(bow.transform(query)).max()

0.43

In [270]:
m.predict_proba(bow.transform(query))

array([[1.27281263e-04, 9.69788223e-01, 4.86765005e-03, 2.94074304e-03,
        6.00228584e-07, 1.66726902e-05, 6.49000158e-03, 9.87192201e-06,
        2.11563546e-04, 1.49513001e-05, 1.20285465e-04, 1.07941131e-02,
        4.61804316e-03]])

In [271]:
m.classes_ 

array(['Bordeaux-style White Blend', 'Chardonnay', 'Chenin Blanc',
       'Gewürztraminer', 'Grüner Veltliner', 'Pinot Grigio', 'Pinot Gris',
       'Portuguese White', 'Riesling', 'Rosé', 'Sauvignon Blanc',
       'Viognier', 'White Blend'], dtype='<U26')

In [38]:
df_pred = pd.DataFrame(m.predict_proba(bow.transform(query)).T, index=m.classes_ ,columns=['probability'])

In [39]:
df_pred.sort_values(by='probability', ascending=False)

Unnamed: 0,probability
Pinot Noir,0.841128
Syrah,0.04634929
Red Blend,0.04139851
Cabernet Sauvignon,0.02230502
Cabernet Franc,0.01639877
Barbera,0.01329293
Merlot,0.008907045
Tempranillo,0.004042321
Tempranillo Blend,0.003573694
Malbec,0.001906774
