In [368]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel 

# Building a Wine Recommendation System

Creating a content-based recommendation system through using NLP modelinng on sommellier reviews.

In [369]:
df = pd.read_csv('../../Data/wine_data.csv')

In [370]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB


In [371]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
96722,96722,US,"Made from Cabernet Sauvignon, Merlot and Petit...",Trilogy,91,65.0,California,Napa Valley,Napa,Bordeaux-style Red Blend,Flora Springs
108942,108942,France,"While it has very bready and yeasty aromas, yo...",,94,120.0,Champagne,Champagne,,Champagne Blend,Jacquesson et Fils
40902,40902,Italy,This playful blend of air-dried white grapes (...,Alba Chiara Passito 500ml,83,12.0,Veneto,Veneto,,White Blend,Pizzolato
134975,134975,Italy,"There are some tertiary aromas here of cola, M...",Carlin Burel,85,18.0,Piedmont,Dolcetto d'Alba,,Dolcetto,Rivetti Massimo
83561,83561,Italy,"Very informal and light, this shows fresh frui...",Pizzo del Vento,84,12.0,Southern Italy,Salento,,Negroamaro,Miali
110519,110519,US,Good fruit from the Riverbend vineyard brings ...,Shameless Hussy,87,20.0,Washington,Wahluke Slope,Columbia Valley,Syrah,Hard Row To Hoe
117431,117431,US,"With 6 Rhône varieties, including Viognier, th...",Rhône Red Wine,87,40.0,California,Central Coast,Central Coast,Rhône-style Red Blend,Clos La Chance
77216,77216,Greece,Exotic pineapple and melon fruit aromas lead o...,,87,30.0,Peloponnese,,,Petroulianos,Vatistas
93550,93550,Italy,San Polo has shown very impressive work lately...,,94,75.0,Tuscany,Brunello di Montalcino,,Sangiovese Grosso,San Polo
123069,123069,US,The 2007 vintage was a fabulous one for Willia...,Papera Vineyard,94,48.0,California,Russian River Valley,Sonoma,Zinfandel,Williams Selyem


In [372]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [373]:
df.drop('region_2', axis=1, inplace=True)

In [374]:
df.sample(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery
21082,France,91-93. Barrel sample. This has an herbal chara...,Barrel Sample,92,,Bordeaux,Pomerol,Bordeaux-style Red Blend,Château Providence
63836,US,"Shows the bright, forward cherry flavors and z...",,84,15.0,California,San Benito County,Sangiovese,Ca' del Solo
8810,US,"Dark in the glass, this blend of 75% Grenache ...",First Born King,91,34.0,California,Paso Robles,Rhône-style Red Blend,Rendarrio Vineyards
4507,Morocco,"From the first whiff of cherry, cranberry and ...",,90,17.0,Zenata,,Syrah,Ouled Thaleb
137773,Argentina,"Tight, concentrated and deep, with rubbery aro...",Don David Reserve,89,17.0,Other,Cafayate,Syrah,Michel Torino
112361,US,Powerful aromatics lead to powerful fruity fla...,Lotus Lot #45,87,28.0,California,San Pasqual,Rhône-style White Blend,Orfila
45905,Australia,Good luck finding a Cabernet for $15 that offe...,Thomas Hyland,89,15.0,South Australia,South Australia,Cabernet Sauvignon,Penfolds
37803,Spain,"Pungent and sweaty smelling on first blush, al...",,87,22.0,Galicia,Rías Baixas,Albariño,Don Olegario
969,US,"Raisin and white pepper aromas, ripe fruit, fl...",Whole Cluster,90,44.0,California,Anderson Valley,Pinot Noir,Stemmler
65269,US,"Rustic in country-style tannins, with a hint o...",,84,9.0,California,California,Merlot,Eastwood


In [375]:
predictors = df[['country', 'description', 'designation', 'province', 'region_1', 'variety', 'winery']]

In [376]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   country      150925 non-null  object
 1   description  150930 non-null  object
 2   designation  105195 non-null  object
 3   province     150925 non-null  object
 4   region_1     125870 non-null  object
 5   variety      150930 non-null  object
 6   winery       150930 non-null  object
dtypes: object(7)
memory usage: 8.1+ MB


## Missing Data

For the first iteration of this recommender system, I will drop observations with missing values across the board instead of being more selective. This cuts the available data in half. Next iterations could try modelling using fewer features, but more observations.

In [377]:
predictors.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictors.dropna(inplace=True)


In [378]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85614 entries, 0 to 150928
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      85614 non-null  object
 1   description  85614 non-null  object
 2   designation  85614 non-null  object
 3   province     85614 non-null  object
 4   region_1     85614 non-null  object
 5   variety      85614 non-null  object
 6   winery       85614 non-null  object
dtypes: object(7)
memory usage: 5.2+ MB


In [379]:
predictors = predictors.reset_index()

In [380]:
predictors.description.duplicated().value_counts()

False    55449
True     30165
Name: description, dtype: int64

In [381]:
predictors.duplicated().value_counts()

False    85614
dtype: int64

In [382]:
predictors[(predictors.description.duplicated() == True)]

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
203,300,US,This standout Rocks District wine brings earth...,The Funk Estate,Washington,Walla Walla Valley (WA),Syrah,Saviah
289,423,US,"The aromas on this wine are quite light, conve...",Weinbau,Washington,Wahluke Slope,Grenache,Sol Stone
290,424,Spain,"A mix of smoke and toast blends with fresh, cr...",Yá Cuvée 23 Brut Rosé,Catalonia,Cava,Sparkling Blend,Sumarroca
316,480,US,Made from what Californians call the Pommard c...,Charles Vineyard Clone O5,California,Anderson Valley,Pinot Noir,Foursight
520,810,Italy,Here's a lively Moscato made in a dry style th...,Bianco Dry,Sicily & Sardinia,Noto,Moscato,Planeta
...,...,...,...,...,...,...,...,...
85609,150923,France,"Rich and toasty, with tiny bubbles. The bouque...",Demi-Sec,Champagne,Champagne,Champagne Blend,Jacquart
85610,150924,France,"Really fine for a low-acid vintage, there's an...",Diamant Bleu,Champagne,Champagne,Champagne Blend,Heidsieck & Co Monopole
85611,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,Champagne,Champagne,Champagne Blend,H.Germain
85612,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,Southern Italy,Fiano di Avellino,White Blend,Terredora


In [383]:
predictors.sample(10)

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
58382,101874,Italy,"This darkly saturated Riserva has an inky, rip...",Riserva Castello di Monna Lisa,Tuscany,Chianti Classico,Sangiovese,Villa Vignamaggio
63466,110560,Italy,Bortolotti makes a well-priced Non-DOC Prosecc...,Single Extra Dry,Veneto,Prosecco del Veneto,Prosecco,Bortolotti
28428,48919,France,"Pure black berry juice, ripe and solid, this i...",Chatons du Cèdre,Southwest France,Cahors,Malbec,Château du Cèdre
56155,97929,US,"Crisp acidity gives this Pinot a tart bite, bu...",Sleepy Hollow Vineyard,California,Santa Lucia Highlands,Pinot Noir,Twin Oaks
47299,82652,Argentina,"An apple aroma comes with a tinge of mild oak,...",1300,Mendoza Province,Uco Valley,Chardonnay,Andeluna
8799,14957,France,"The cherry, leather and chocolate aromas are a...",La Devèze,Rhône Valley,Côtes du Rhône,Rhône-style Red Blend,Domaine de Dionysos
21935,37394,Italy,Petruccino is a delightful and elegant wine wi...,Petruccino,Tuscany,Orcia,Red Blend,Podere Forte
35399,60742,US,"A soft entry quickly leads into a short, tart,...",Boushey Vineyards,Washington,Yakima Valley,Syrah,Three Rivers
83320,146088,Australia,"Like the label says, no oak has touched this w...",Unoaked,Victoria,Mornington Peninsula,Chardonnay,Willow Creek
2906,4868,US,Fig and fennel star in this block-designated w...,Block 4SB20,California,Sonoma Valley,Sauvignon Blanc,Kunde


## Feature Engineering & Unique Names for Wines

In [384]:
## Creating a more detailed name for each wine by combining Winery and Designation

predictors['name'] = predictors['winery'] + ', ' + predictors['designation']

In [385]:
predictors.drop('index', axis=1, inplace=True)

In [386]:
predictors.sample(10)

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name
16148,US,The somewhat prominent herb aromas don't seem ...,Pepper Bridge Vineyard,Washington,Walla Walla Valley (WA),Merlot,Two Vintners,"Two Vintners, Pepper Bridge Vineyard"
55976,Spain,"Roll out the barrel, or the staves, or the oak...",Veleta,Andalucia,Vino de la Tierra Contraviesa Alpujarra,Chardonnay,Dominio Buenavista,"Dominio Buenavista, Veleta"
84508,US,"Powerful, youthful and still a bit raw, the ne...",Chaleur Estate,Washington,Columbia Valley (WA),Red Blend,DeLille,"DeLille, Chaleur Estate"
64053,France,This wine has a smoky character with aromas of...,Les Caillottes,Loire Valley,Pouilly-Fumé,Sauvignon Blanc,Fournier Père et Fils,"Fournier Père et Fils, Les Caillottes"
76061,France,91-93 Barrel sample. There is a good structure...,Barrel sample,Bordeaux,Pomerol,Bordeaux-style Red Blend,Château Nenin,"Château Nenin, Barrel sample"
3712,US,A 100% Cabernet Sauvignon from the M5 block of...,Stagecoach Vineyard M5,California,Napa Valley,Cabernet Sauvignon,Krupp Brothers,"Krupp Brothers, Stagecoach Vineyard M5"
40785,US,Laurent Montalieu is the winemaker behind the ...,Spruce Goose,Oregon,Willamette Valley,Riesling,Evergreen Vineyards,"Evergreen Vineyards, Spruce Goose"
31443,US,"A harshly sharp, rather green wine, with scour...",Estate,California,El Dorado,Cabernet Sauvignon,Wofford Acres,"Wofford Acres, Estate"
36817,US,"A very nice effort at this price, it's smooth ...",Kamiakin Red,Washington,Yakima Valley,Red Blend,Sheridan Vineyard,"Sheridan Vineyard, Kamiakin Red"
31171,Italy,"From the far-away 1994 vintage, this aged meto...",Brut Riserva,Lombardy,Franciacorta,Sparkling Blend,Cornaleto,"Cornaleto, Brut Riserva"


In [387]:
## Creating a UID for each wine by combining all data into one variable

predictors['uid'] = predictors['winery'] + ', ' + predictors['designation'] + ', ' + predictors['country'] + ', ' + predictors['description'] + ', ' + predictors['province'] + ', ' + predictors['region_1'] + ', ' + predictors['variety']

In [388]:
predictors.sample(10)

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name,uid
68781,US,"Silky and dry, but sharp and herbal, with not ...",Coastal Series,California,California,Pinot Noir,Jenica Peak,"Jenica Peak, Coastal Series","Jenica Peak, Coastal Series, US, Silky and dry..."
42100,US,"Easy to like for its orange and pear soda, van...",Artisan Collection,California,California,Chardonnay,HandCraft,"HandCraft, Artisan Collection","HandCraft, Artisan Collection, US, Easy to lik..."
19738,US,"A mix of Cabernet and Merlot, with small amoun...",Proprietary Red,Washington,Columbia Valley (WA),Bordeaux-style Red Blend,Swiftwater Cellars,"Swiftwater Cellars, Proprietary Red","Swiftwater Cellars, Proprietary Red, US, A mix..."
5285,US,"This is a delicate wine, true to variety with ...",Pishon Bloc,Oregon,Umpqua Valley,Pinot Noir,Reustle,"Reustle, Pishon Bloc","Reustle, Pishon Bloc, US, This is a delicate w..."
43722,US,This shows the streamlined acidity and mineral...,Taylor Ridge Vineyard,California,Sonoma Coast,Chardonnay,Boheme,"Boheme, Taylor Ridge Vineyard","Boheme, Taylor Ridge Vineyard, US, This shows ..."
29278,France,"A classic and fun Nouveau, this has an extra s...",Nouveau,Beaujolais,Beaujolais,Gamay,Thorin,"Thorin, Nouveau","Thorin, Nouveau, France, A classic and fun Nou..."
47178,Italy,"This offers soft, slightly sweet aromas of che...",Montipagano,Central Italy,Montepulciano d'Abruzzo,Montepulciano,Umani Ronchi,"Umani Ronchi, Montipagano","Umani Ronchi, Montipagano, Italy, This offers ..."
13801,US,"Racy and bright, with high acidity framing dry...",Reserve,California,Russian River Valley,Sauvignon Blanc,Frei Brothers,"Frei Brothers, Reserve","Frei Brothers, Reserve, US, Racy and bright, w..."
53632,US,"A blend of grapes from three vineyard sites, t...",Dry,New York,Finger Lakes,Riesling,Heart & Hands,"Heart & Hands, Dry","Heart & Hands, Dry, US, A blend of grapes from..."
16988,France,"86–88. Barrel sample. With a chemical aroma, t...",Barrel Sample,Bordeaux,Barsac,Bordeaux-style White Blend,Château Suau,"Château Suau, Barrel Sample","Château Suau, Barrel Sample, France, 86–88. Ba..."


In [361]:
predictors.winery.duplicated().value_counts()

True     46024
False     9437
Name: winery, dtype: int64

In [389]:
## Region and Variety

predictors['des_reg_var'] = predictors['designation'] + ', ' + predictors['region_1'] + ', ' + predictors['variety']

In [390]:
predictors.des_reg_var.duplicated().value_counts()

True     50896
False    34718
Name: des_reg_var, dtype: int64

In [392]:
predictors.sample(10).des_reg_var

21242              Jenkins Ranch, Sonoma Coast, Pinot Noir
74645          Àn, Vi de la Terra Illes Balears, Red Blend
6514               Hatschbourg Grand Cru, Alsace, Riesling
23008               Cool Climate, Sonoma Coast, Pinot Noir
1353                             Colarej, Barolo, Nebbiolo
52466    Camp 4 Vineyard, Santa Ynez Valley, Grenache B...
82482    Diamond Mountain Ranch, Napa Valley, Cabernet ...
13594         Extra Dry, Columbia Valley (WA), White Blend
39320        Reserve, Alexander Valley, Cabernet Sauvignon
82931    Le Serre Nuove, Bolgheri, Cabernet Sauvignon-M...
Name: des_reg_var, dtype: object

### Removing Duplicate Values

In [324]:
predictors.drop_duplicates(inplace=True)

In [325]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55461 entries, 0 to 85141
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      55461 non-null  object
 1   description  55461 non-null  object
 2   designation  55461 non-null  object
 3   province     55461 non-null  object
 4   region_1     55461 non-null  object
 5   variety      55461 non-null  object
 6   winery       55461 non-null  object
 7   name         55461 non-null  object
 8   uid          55461 non-null  object
dtypes: object(9)
memory usage: 4.2+ MB


In [326]:
predictors.country.value_counts()

US           26157
Italy        10537
France        9761
Spain         4317
Argentina     2428
Australia     2147
Canada         114
Name: country, dtype: int64

In [328]:
predictors.uid.duplicated().value_counts()

False    55461
Name: uid, dtype: int64

In [360]:
predictors.name.duplicated().value_counts()

False    33549
True     21912
Name: name, dtype: int64

## Reducing Scope of Wines

In order to be able to model locally, I will reduce the scope of the project to focus only on winde producing regions outside of the United States.

In [329]:
non_usa = predictors[(predictors.country != 'US')]

In [346]:
non_usa.reset_index(inplace=True)

In [348]:
non_usa.drop('index', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [349]:
non_usa

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name,uid
0,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez,"Bodega Carmen Rodríguez, Carodorum Selección E...","Bodega Carmen Rodríguez, Carodorum Selección E..."
1,France,"This is the top wine from La Bégude, named aft...",La Brûlade,Provence,Bandol,Provence red blend,Domaine de la Bégude,"Domaine de la Bégude, La Brûlade","Domaine de la Bégude, La Brûlade, France, This..."
2,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,Northern Spain,Toro,Tinta de Toro,Numanthia,"Numanthia, Numanthia","Numanthia, Numanthia, Spain, Deep, dense and p..."
3,Spain,Slightly gritty black-fruit aromas include a s...,San Román,Northern Spain,Toro,Tinta de Toro,Maurodos,"Maurodos, San Román","Maurodos, San Román, Spain, Slightly gritty bl..."
4,Spain,Lush cedary black-fruit aromas are luxe and of...,Carodorum Único Crianza,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez,"Bodega Carmen Rodríguez, Carodorum Único Crianza","Bodega Carmen Rodríguez, Carodorum Único Crian..."
...,...,...,...,...,...,...,...,...,...
29299,Argentina,Standard aromas in the berry and beet range is...,Silver Reserve,Other,Famatina Valley,Syrah,Raza,"Raza, Silver Reserve","Raza, Silver Reserve, Argentina, Standard arom..."
29300,Australia,"Starts off with grassy, fresh herbal aromas an...",Zeepaard,Western Australia,Western Australia,Sauvignon Blanc,West Cape Howe,"West Cape Howe, Zeepaard","West Cape Howe, Zeepaard, Australia, Starts of..."
29301,Argentina,"Yellowish in color, with a heavy, somewhat cre...",Organic,Mendoza Province,Tupungato,Chardonnay,Domaine Jean Bousquet,"Domaine Jean Bousquet, Organic","Domaine Jean Bousquet, Organic, Argentina, Yel..."
29302,Argentina,"Dark and dense, with extracted black cherry, l...",Organic,Mendoza Province,Tupungato,Malbec,Domaine Jean Bousquet,"Domaine Jean Bousquet, Organic","Domaine Jean Bousquet, Organic, Argentina, Dar..."


## Vectorizing With Tfidf

In [350]:
vectors = TfidfVectorizer(min_df = 3,
                         max_features = None,
                         strip_accents = 'unicode',
                         analyzer = 'word',
                         token_pattern = '\w{2,}',
                         ngram_range = (1,3),
                         stop_words = 'english')

In [351]:
vectors_matrix = vectors.fit_transform(non_usa['description'])

In [352]:
vectors_matrix.shape

(29304, 65978)

## Calculating Similarity

In [353]:
sig_kern = sigmoid_kernel(vectors_matrix, vectors_matrix)

In [354]:
sig_kern

array([[0.76160052, 0.76159418, 0.76159482, ..., 0.76159436, 0.76159444,
        0.76159421],
       [0.76159418, 0.76160052, 0.76159418, ..., 0.76159416, 0.76159416,
        0.76159416],
       [0.76159482, 0.76159418, 0.76160052, ..., 0.76159419, 0.76159429,
        0.76159421],
       ...,
       [0.76159436, 0.76159416, 0.76159419, ..., 0.76160052, 0.76159427,
        0.76159419],
       [0.76159444, 0.76159416, 0.76159429, ..., 0.76159427, 0.76160052,
        0.76159418],
       [0.76159421, 0.76159416, 0.76159421, ..., 0.76159419, 0.76159418,
        0.76160052]])

In [355]:
index = pd.Series(non_usa.index, index=non_usa['winery']).drop_duplicates()

In [359]:
index['Maurodos']

winery
Maurodos        3
Maurodos     2364
Maurodos     5939
Maurodos    10949
Maurodos    13269
Maurodos    21319
Maurodos    24576
Maurodos    26911
dtype: int64

In [357]:
def recommend_wine(winery, sig_kern=sig_kern):
    indx = index[winery]
    sigmoid_score = list(enumerate(sig_kern[indx]))
    sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True)
    sigmoid_score = sigmoid_score[1:3]
    wines = [i[0] for i in sigmoid_score]
    return non_usa.iloc[wines]

In [358]:
recommend_wine('Maurodos')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()