In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel 

# Building a Wine Recommendation System

Creating a content-based recommendation system by applying NLP modeling to sommelier reviews.

In [2]:
df = pd.read_csv('../../Data/wine_data.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB


In [4]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
117329,117329,US,"This is a pleasant bottle, with pretty varieta...",Del-Mar Private Reserve,87,32.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Evergreen
133826,133826,Greece,"The lush nose of Viognier—fresh flowers, honey...",,86,45.0,Epanomi,,,Viognier,Domaine Gerovassiliou
7205,7205,Austria,"Although restrained on the nose, this wine lea...",Austrian Cherry,90,14.0,Niederösterreich,,,Zweigelt,The Dot
142953,142953,France,"95-97 Barrel sample. A hugely dense wine, pack...",Barrel sample,96,,Bordeaux,Saint-Estèphe,,Bordeaux-style Red Blend,Château Cos d'Estournel
51881,51881,US,Smells sharp and jammy and tastes tart and aci...,Fióre Marcheschi,81,24.0,California,Alexander Valley,Sonoma,Sangiovese,Irish Monkey Cellars
131667,131667,US,"Voluptuous, with decadently rich flavors of ca...",À Côté Santa Barbara Highlands Vineyard South ...,92,29.0,California,Santa Barbara County,Central Coast,Syrah,La Fenêtre
78961,78961,New Zealand,"Riper in style than Ara's Composite bottling, ...",Resolute,86,26.0,Marlborough,,,Sauvignon Blanc,Ara
74060,74060,US,"A massive wine, decadent and splendid, and a w...",Family Reserve,96,135.0,California,Napa Valley,Napa,Cabernet Sauvignon,Venge
90313,90313,US,"Clean, brisk and balanced, with good varietal ...",,87,13.0,California,Napa Valley,Napa,Chardonnay,Irony
121213,121213,US,"Sweet, soft and simple, with fruit stand flavors.",Red Shorts,81,12.0,California,Lodi,Central Valley,Red Blend,Peirano


In [5]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
df.drop('region_2', axis=1, inplace=True)

In [7]:
df.sample(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery
78647,US,"If you like that bite of gooseberry and dry, g...",,87,18.0,California,Happy Canyon of Santa Barbara,Sauvignon Blanc,3CV
27191,US,"Soft and layered, this wine has firmness and d...",Van der Kamp Vineyard,91,49.0,California,Sonoma Mountain,Pinot Noir,Siduri
87033,US,This is a standout in this price range. The we...,JV Estate,89,25.0,Oregon,Dundee Hills,Pinot Noir,Stoller
139761,Italy,This is a gorgeous and very smooth wine that b...,Piaggia Riserva,92,,Tuscany,Carmignano,Red Blend,Mauro Vannucci
8832,France,Jaboulet's Condrieu program takes a huge step ...,Domaine de Grands Amandiers,93,100.0,Rhône Valley,Condrieu,Viognier,Paul Jaboulet Aîné
42188,US,"A solid Cabernet, dry and full bodied, that sh...",Vintner's Reserve,87,20.0,California,Sonoma-Napa-Mendocino,Cabernet Sauvignon,Kendall-Jackson
61363,South Africa,"With its mélange of Verdelho, Viognier, Grenac...",Fair Maiden,87,15.0,Coastal Region,,White Blend,Bellingham
26097,US,"Sweet and fizzy, with orange soda and vanilla ...",,85,13.0,California,California,Moscato,Allure
128326,US,"Too acidic, even for Petite Sirah, and the aci...",Jonquil Vineyard,85,35.0,California,Napa Valley,Petite Sirah,Hopper Creek
103266,Portugal,"Soft, with strawberry flavors and only light a...",Dona Helena,83,,Setubal,,Rosé,Wines & Winemakers


In [8]:
predictors = df[['country', 'description', 'designation', 'province', 'region_1', 'variety', 'winery']]

In [9]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   country      150925 non-null  object
 1   description  150930 non-null  object
 2   designation  105195 non-null  object
 3   province     150925 non-null  object
 4   region_1     125870 non-null  object
 5   variety      150930 non-null  object
 6   winery       150930 non-null  object
dtypes: object(7)
memory usage: 8.1+ MB


## Missing Data

For the first iteration of this recommender system, I will drop observations with missing values across the board instead of being more selective. This cuts the available data nearly in half (from 150,930 to 85,614 rows). Later iterations could try modelling with fewer features but more observations.

In [10]:
# Rebind to the filtered frame instead of dropping in place: `predictors` was
# selected from `df`, and mutating it in place triggers pandas'
# SettingWithCopyWarning.
predictors = predictors.dropna()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [11]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85614 entries, 0 to 150928
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      85614 non-null  object
 1   description  85614 non-null  object
 2   designation  85614 non-null  object
 3   province     85614 non-null  object
 4   region_1     85614 non-null  object
 5   variety      85614 non-null  object
 6   winery       85614 non-null  object
dtypes: object(7)
memory usage: 5.2+ MB


In [12]:
predictors = predictors.reset_index()

In [13]:
predictors.description.duplicated().value_counts()

False    55449
True     30165
Name: description, dtype: int64

In [14]:
predictors.duplicated().value_counts()

False    85614
dtype: int64

In [15]:
predictors[(predictors.description.duplicated() == True)]

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
203,300,US,This standout Rocks District wine brings earth...,The Funk Estate,Washington,Walla Walla Valley (WA),Syrah,Saviah
289,423,US,"The aromas on this wine are quite light, conve...",Weinbau,Washington,Wahluke Slope,Grenache,Sol Stone
290,424,Spain,"A mix of smoke and toast blends with fresh, cr...",Yá Cuvée 23 Brut Rosé,Catalonia,Cava,Sparkling Blend,Sumarroca
316,480,US,Made from what Californians call the Pommard c...,Charles Vineyard Clone O5,California,Anderson Valley,Pinot Noir,Foursight
520,810,Italy,Here's a lively Moscato made in a dry style th...,Bianco Dry,Sicily & Sardinia,Noto,Moscato,Planeta
...,...,...,...,...,...,...,...,...
85609,150923,France,"Rich and toasty, with tiny bubbles. The bouque...",Demi-Sec,Champagne,Champagne,Champagne Blend,Jacquart
85610,150924,France,"Really fine for a low-acid vintage, there's an...",Diamant Bleu,Champagne,Champagne,Champagne Blend,Heidsieck & Co Monopole
85611,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,Champagne,Champagne,Champagne Blend,H.Germain
85612,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,Southern Italy,Fiano di Avellino,White Blend,Terredora


In [16]:
predictors.sample(10)

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
34767,59571,US,A Cab-based Bordeaux blend with a little Syrah...,Bad Boy,California,Yountville,Red Blend,Rocca
43743,76158,France,A very fine wine with some bottle age that has...,Orbe Noir,Southwest France,Cahors,Malbec,Domaine le Bout du Lieu
84611,148478,US,"This is a tart, racy, exciting wine, showing j...",Big Easy,Washington,Columbia Valley (WA),Cabernet Sauvignon,Stevens
50023,87179,US,"Tough in acidity and tannins, this Pinot is no...",Olivet Grange Vineyard,California,Russian River Valley,Pinot Noir,Inman Family
57077,99499,Italy,Here's a vineyard-designate Riserva distinguis...,Vigneto Bucerchiale Riserva,Tuscany,Chianti Rufina,Sangiovese,Selvapiana
30858,53115,Italy,The Preda Sarmassa Barolo Riserva offers an in...,Preda Sarmassa Riserva,Piedmont,Barolo,Nebbiolo,Virna Borgogno
15963,27173,US,An herbally prolific blend of 46% Cabernet Fra...,Farmer Mickey's Limited Edition,California,Paso Robles,Red Blend,Four Sisters Ranch
58986,103000,US,"A little too soft and sweet for balance, with ...",HMR Estate,California,Paso Robles,Chardonnay,Adelaida
20916,35587,France,"A lean, high acid wine that has a dry edge to ...",Bourgogne A. Rodet,Burgundy,Bourgogne,Pinot Noir,Antonin Rodet
79430,138738,Italy,"Offers measured peach, melon and grass aromas....",Stemmari,Sicily & Sardinia,Sicilia,Grillo,Feudo Arancio


## Feature Engineering & Unique Names for Wines

In [17]:
# Build a more descriptive label for each wine: "<winery>, <designation>".
predictors['name'] = predictors['winery'].str.cat(predictors['designation'], sep=', ')

In [18]:
predictors.drop('index', axis=1, inplace=True)

In [19]:
predictors.sample(10)

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name
1921,Italy,"Fragrances suggest hay, crushed tomato vine an...",Kirchleiten,Northeastern Italy,Alto Adige,Sauvignon,Tiefenbrunner,"Tiefenbrunner, Kirchleiten"
44338,Italy,"Slightly more subdued in aromatic intensity, t...",Vignassa,Piedmont,Barbera d'Asti Superiore Nizza,Barbera,Cascina La Ghersa,"Cascina La Ghersa, Vignassa"
17199,Argentina,"This is an unusual, exotic take on Malbec. The...",Altísimo,Mendoza Province,Uco Valley,Malbec,Ricominciare,"Ricominciare, Altísimo"
33027,US,Some high-toned floral highlights liven up the...,Final-Final,Washington,Columbia Valley (WA),Cabernet-Syrah,Efeste,"Efeste, Final-Final"
9957,Italy,Hearty Negroamaro with 15% Malvasia Nera produ...,Schiaccianoci,Southern Italy,Salento,Negroamaro,Vigne & Vini,"Vigne & Vini, Schiaccianoci"
47276,US,"Dark and jellied, with raspberry and vanilla f...",Grenache Mourvedre,California,Santa Barbara County,Rosé,Tercero,"Tercero, Grenache Mourvedre"
73804,US,"This blend of Zin, Petite Sirah and Mourvèdre ...",Phantom Old Vine,California,California,Red Blend,Bogle,"Bogle, Phantom Old Vine"
23983,Italy,This expression of Trebbiano and Chardonnay is...,Pergliamici,Tuscany,Toscana,White Blend,Casa Sola,"Casa Sola, Pergliamici"
50927,US,If you like your Chardonnay on the fruity side...,Artisan Collection,California,California,Chardonnay,HandCraft,"HandCraft, Artisan Collection"
19470,US,"This is the third release of Figgins, a single...",Estate,Washington,Walla Walla Valley (WA),Bordeaux-style Red Blend,Figgins,"Figgins, Estate"


In [20]:
## Leaving this in for future ideas

## Creating a UID for each wine by combining all data into one variable

#predictors['uid'] = predictors['winery'] + ', ' + predictors['designation'] + ', ' + predictors['country'] + ', ' + predictors['description'] + ', ' + predictors['province'] + ', ' + predictors['region_1'] + ', ' + predictors['variety']

### Removing Duplicate Values

In [21]:
# Keep only the first occurrence of each fully identical row.
predictors = predictors.drop_duplicates()

In [22]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55461 entries, 0 to 85141
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      55461 non-null  object
 1   description  55461 non-null  object
 2   designation  55461 non-null  object
 3   province     55461 non-null  object
 4   region_1     55461 non-null  object
 5   variety      55461 non-null  object
 6   winery       55461 non-null  object
 7   name         55461 non-null  object
dtypes: object(8)
memory usage: 3.8+ MB


In [23]:
predictors.country.value_counts()

US           26157
Italy        10537
France        9761
Spain         4317
Argentina     2428
Australia     2147
Canada         114
Name: country, dtype: int64

In [24]:
predictors.name.duplicated().value_counts()

False    33549
True     21912
Name: name, dtype: int64

In order for the end user to receive recommendations using this model and approach, they need to enter a unique name for a wine they like. With so many duplicates this becomes tricky. For now I will drop duplicated wine names, which significantly reduces the volume of data but solves the uniqueness issue. There is almost certainly a better way around this!

In [25]:
# One row per wine name; when a name repeats, the last occurrence is kept.
predictors = predictors.drop_duplicates(subset=['name'], keep='last')

In [26]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33549 entries, 1 to 85141
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      33549 non-null  object
 1   description  33549 non-null  object
 2   designation  33549 non-null  object
 3   province     33549 non-null  object
 4   region_1     33549 non-null  object
 5   variety      33549 non-null  object
 6   winery       33549 non-null  object
 7   name         33549 non-null  object
dtypes: object(8)
memory usage: 2.3+ MB


In [27]:
predictors.reset_index(inplace=True)

In [28]:
predictors.drop('index', axis=1, inplace=True)

## Vectorizing With Tfidf

In [29]:
# TF-IDF over the review text: word tokens of 2+ characters, accents stripped,
# English stop words removed, uni- to tri-grams, min_df of 3.
vectors = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    # Raw string: '\w' in a plain string literal is an invalid escape sequence.
    token_pattern=r'\w{2,}',
    ngram_range=(1, 3),
    stop_words='english',
)

In [30]:
vectors_matrix = vectors.fit_transform(predictors['description'])

In [31]:
vectors_matrix.shape

(33549, 73445)

## Calculating Similarity

In [32]:
# Pairwise sigmoid-kernel similarity between every pair of TF-IDF rows.
# NOTE(review): every score in the matrix displayed below is ~0.7616, i.e.
# tanh(1). Presumably sigmoid_kernel's defaults (gamma = 1 / n_features,
# coef0 = 1) make the dot-product term negligible -- confirm against the
# scikit-learn docs and consider passing gamma explicitly or using linear_kernel.
# NOTE(review): the result is a dense n_wines x n_wines array; with the 33549
# rows reported by vectors_matrix.shape that is ~1.1e9 entries -- check memory.
sig_kern = sigmoid_kernel(vectors_matrix, vectors_matrix)

In [33]:
sig_kern

array([[0.76159987, 0.76159421, 0.76159418, ..., 0.76159416, 0.76159422,
        0.76159418],
       [0.76159421, 0.76159987, 0.76159418, ..., 0.76159417, 0.76159417,
        0.76159417],
       [0.76159418, 0.76159418, 0.76159987, ..., 0.76159418, 0.76159418,
        0.76159418],
       ...,
       [0.76159416, 0.76159417, 0.76159418, ..., 0.76159987, 0.76159417,
        0.76159423],
       [0.76159422, 0.76159417, 0.76159418, ..., 0.76159417, 0.76159987,
        0.76159419],
       [0.76159418, 0.76159417, 0.76159418, ..., 0.76159423, 0.76159419,
        0.76159987]])

In [34]:
# Lookup table: wine name -> row label of that wine in `predictors`.
# Series.drop_duplicates() compares the values (row labels), not the names,
# so it cannot guard against repeated names and is not used here.
index = pd.Series(predictors.index, index=predictors['name'])
# recommend_wine uses index[name] as a single row selector, so each name must
# resolve to exactly one row.
assert index.index.is_unique, "wine names must be unique"

In [35]:
index.to_csv('sig_wines.csv')

## The Recommender

In [92]:
def recommend_wine(name, sig_kern=sig_kern, n=3):
    """Return the wines whose descriptions score as most similar to a given wine.

    Parameters
    ----------
    name : str
        A label of the `index` lookup Series (built from `predictors['name']`,
        i.e. "<winery>, <designation>").
    sig_kern : array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of wine ``i``
        against every row of `predictors`. Defaults to the module-level
        `sig_kern` as it exists when this cell is run.
    n : int, optional
        Number of recommendations to return (default 3).

    Returns
    -------
    pandas.DataFrame
        Up to `n` rows of `predictors`, highest score first, never including
        the query wine itself.

    Raises
    ------
    KeyError
        If `name` is not a label in `index`.
    """
    try:
        indx = index[name]
    except KeyError:
        raise KeyError(f"No wine named {name!r} in the lookup index") from None

    scores = np.asarray(sig_kern[indx])
    # Stable descending sort: tied scores keep row order, which matches what
    # sorted(..., reverse=True) over enumerate() would produce.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query wine explicitly rather than assuming it sorts first:
    # a wine with an identical description and a lower row index ties with it
    # and would otherwise push the query wine into its own recommendations.
    ranked = ranked[ranked != indx][:n]
    return predictors.iloc[ranked]

In [99]:
recommend_wine('Castelli del Grevepesa, Riserva Castelgreve')

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name
19711,Italy,"Made with Sangiovese, this shows ripe berry an...",Terra Rossa Riserva,Tuscany,Chianti Colli Senesi,Sangiovese,Tenuta di Trecciano,"Tenuta di Trecciano, Terra Rossa Riserva"
30485,Argentina,This wine shows molasses and Boston baked bean...,Paris Goulart Reserva,Mendoza Province,Mendoza,Malbec-Cabernet Sauvignon,Bodega Goulart,"Bodega Goulart, Paris Goulart Reserva"
22443,Italy,"From the Classico zone of Amarone, this shows ...",Corte Vaona,Veneto,Amarone della Valpolicella Classico,"Corvina, Rondinella, Molinara",Novaia,"Novaia, Corte Vaona"
