In [1]:
import pickle
import spacy
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "textcat", "ner"])

In [3]:
df_red = pd.read_csv('../data/df_red.csv', index_col = 0)
df_w_r = pd.read_csv('../data/df_w_r.csv', index_col = 0)
df_white = pd.read_csv('../data/df_white.csv', index_col = 0)
df_all = pd.read_csv('../data/df_nonans.csv', index_col = 0)

In [4]:
def custom_tokenizer(text):
    '''
    used to filter out unwanted words, punctuation, and so on
    '''
    tokens = []
    for t in nlp(text):
        if not(len(t) < 2 or t.is_stop or t.like_num or 
               t.is_punct or not t.is_alpha):
            tokens.append(t.lemma_)
    return tokens 

In [5]:
with open('../models/logreg/bow.p', 'rb') as f:
    bow_all = pickle.load(f)
with open('../models/logreg/bow_red.p', 'rb') as f:
    bow_red = pickle.load(f)
with open('../models/logreg/bow_w_r.p', 'rb') as f:
    bow_w_r = pickle.load(f)
with open('../models/logreg/bow_white.p', 'rb') as f:
    bow_white = pickle.load(f)

In [6]:
with open('../models/logreg/m.p', 'rb') as f:
    m_all = pickle.load(f)
with open('../models/logreg/m_red.p', 'rb') as f:
    m_red = pickle.load(f)
with open('../models/logreg/m_w_r.p', 'rb') as f:
    m_w_r = pickle.load(f)
with open('../models/logreg/m_white.p', 'rb') as f:
    m_white = pickle.load(f)

In [26]:
query = []
model = []
#remake models with class_weights='balanced'

In [27]:
wine_type = input('Would you like red, white or either: ')

Would you like red, white or either: red


In [28]:
if wine_type == 'red':
    model.append('red')
elif wine_type == 'white':
    rose = input('Should we include Rosé wine too? ')
    if rose == 'yes': 
        model.append('white_rose')
    elif rose == 'no':
        model.append('white')
elif wine_type == 'either':
    model.append('all')
else:
    wine_type = input('Would you like red, white or either:')

In [29]:
style = input('Do you prefer sweet or dry wine: ')

Do you prefer sweet or dry wine: sweet


In [30]:
if style == 'sweet':
    query.append('sweet')
    query.append('sweetness')
    query.append('sugar')
elif style == 'dry':
    query.append('dry')
    query.append('savory')
    query.append('bitter')

In [31]:
print('Which best describes your preferred wine style:')
print('1. Light and refreshing')
print('2. Smooth and balanced')
print('3. Full and rich')
style_2 = input('Enter 1, 2, or 3: ')

Which best describes your preferred wine style:
1. Light and refreshing
2. Smooth and balanced
3. Full and rich
Enter 1, 2, or 3: 3


In [32]:
if style_2 == '1':
    query.append('light')
    query.append('refreshing')
    query.append('bright')
    query.append('racy')
elif style_2 == '2':
    query.append('smooth')
    query.append('balanced')
    query.append('polished')
    query.append('silky')
elif style_2 == '3':
    query.append('full')
    query.append('rich')
    query.append('dense')
    query.append('tannin')
    query.append('tannic')
    query.append('chewy')
    query.append('heavy')
else:
    style_2 = input('Enter 1, 2, or 3: ')
    #continue

In [33]:
appealing = input('Which is more appealing fruity or earthy: ')

Which is more appealing fruity or earthy: fruity


In [34]:
if appealing == 'fruity':
    query.append('fruity')
    query.append('fruitiness')
    query.append('jam')
    query.append('jammy')
elif appealing == 'earthy':
    query.append('earthy')
    query.append('earth')
    query.append('soil')
    query.append('minerality')
    query.append('graphite')    
else:
    appealing = input('Which is more appealing fruity or earthy: ')

In [46]:
print('Which do you prefer: ')
print('1. Wines that finish soft and light.')
print('2. Wines that linger on your palate after they have finished.')
finish = input('Enter 1 or 2: ')

#this question asked with a 1 seems to help pinot nior. that good or bad?

In [None]:
if appealing == '1':
    query.append('soft')
    query.append('light')
    query.append('fine')
    query.append('round')
elif appealing == '2':
    query.append('long')
    query.append('concentrated')
    query.append('structure')
    query.append('linger')
    query.append('firm')  

In [36]:
flavor = input('Do you like flavors that remind you of caramel, toast and spice (yes or no)? ')
#finds out if the user likes wine aged in an oak barrel or stainless steel

Do you like flavors that remind you of caramel, toast and spice (yes or no)? yes


In [37]:
if flavor == 'yes':
    query.append('carmel')
    query.append('toast')
    query.append('spice')
    query.append('oak')
    query.append('oaky')
    query.append('smokey')
    query.append('toasty')
    query.append('smoke')
    query.append('cedar')
    query.append('tea')

In [38]:
query

['sweet',
 'sweetness',
 'sugar',
 'full',
 'rich',
 'dense',
 'tannin',
 'tannic',
 'chewy',
 'heavy',
 'fruity',
 'fruitiness',
 'jam',
 'jammy',
 'carmel',
 'toast',
 'spice',
 'oak',
 'oaky',
 'smokey',
 'toasty',
 'smoke',
 'cedar',
 'tea']

In [39]:
query = [' '.join(query)]

In [40]:
query

['sweet sweetness sugar full rich dense tannin tannic chewy heavy fruity fruitiness jam jammy carmel toast spice oak oaky smokey toasty smoke cedar tea']

In [41]:
if model[0] == 'all':
    m = m_all
    bow = bow_all
elif model[0] == 'red':
    m = m_red
    bow = bow_red
elif model[0] == 'white':
    m = m_white
    bow = bow_white
elif model[0] == 'white_rose':
    m = m_w_r
    bow = bow_w_r

In [42]:
m.predict(bow.transform(query))

array(['Pinot Noir'], dtype=object)

In [43]:
m.predict_proba(bow.transform(query)).max()

0.20566230860084825

In [270]:
m.predict_proba(bow.transform(query))

array([[1.27281263e-04, 9.69788223e-01, 4.86765005e-03, 2.94074304e-03,
        6.00228584e-07, 1.66726902e-05, 6.49000158e-03, 9.87192201e-06,
        2.11563546e-04, 1.49513001e-05, 1.20285465e-04, 1.07941131e-02,
        4.61804316e-03]])

In [271]:
m.classes_ 

array(['Bordeaux-style White Blend', 'Chardonnay', 'Chenin Blanc',
       'Gewürztraminer', 'Grüner Veltliner', 'Pinot Grigio', 'Pinot Gris',
       'Portuguese White', 'Riesling', 'Rosé', 'Sauvignon Blanc',
       'Viognier', 'White Blend'], dtype='<U26')

In [44]:
df_pred = pd.DataFrame(m.predict_proba(bow.transform(query)).T, index=m.classes_ ,columns=['probability'])

In [45]:
df_pred.sort_values(by='probability', ascending=False)

Unnamed: 0,probability
Pinot Noir,0.205662
Cabernet Sauvignon,0.169051
Red Blend,0.131865
Merlot,0.071164
Zinfandel,0.069245
Malbec,0.044211
"Corvina, Rondinella, Molinara",0.043317
Port,0.040714
Bordeaux-style Red Blend,0.03739
Petite Sirah,0.025069
