In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, sigmoid_kernel 

# Building a Wine Recommendation System

Creating a content-based recommendation system through using NLP modelinng on sommellier reviews.

In [31]:
df = pd.read_csv('../../Data/wine_data.csv')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   150930 non-null  int64  
 1   country      150925 non-null  object 
 2   description  150930 non-null  object 
 3   designation  105195 non-null  object 
 4   points       150930 non-null  int64  
 5   price        137235 non-null  float64
 6   province     150925 non-null  object 
 7   region_1     125870 non-null  object 
 8   region_2     60953 non-null   object 
 9   variety      150930 non-null  object 
 10  winery       150930 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 12.7+ MB


In [33]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
60752,60752,Australia,"Overly green, with herbal flavors that overpow...",,82,8.0,New South Wales,New South Wales,,Merlot,Amaroo
22811,22811,France,"Approaching maturity, here is a Riesling that ...",,88,25.0,Alsace,Alsace,,Riesling,Hugel
50665,50665,Argentina,"Big and woody to start, with heavily toasted b...",Crios,88,15.0,Mendoza Province,Mendoza,,Cabernet Sauvignon,Dominio del Plata
112319,112319,US,"Once again, this bottling delivers a big, lush...",Belle Canyon,91,35.0,California,Dry Creek Valley,Sonoma,Zinfandel,Bella
131416,131416,France,"Good density here, with tannins that spring fr...",,88,17.0,Bordeaux,Haut-Médoc,,Bordeaux-style Red Blend,Château Lieujean
40088,40088,France,A wine that certainly benefits from the warm 2...,Clos St Landelin,85,75.0,Alsace,Alsace,,Pinot Noir,René Muré
37730,37730,US,"A new wine for Nottingham, this is a crisp, te...",,88,28.0,California,Arroyo Seco,Central Coast,Viognier,Nottingham Cellars
10241,10241,US,This blend of 60% Pinot Noir and 40% Chardonna...,Brut Cuvée,90,45.0,California,Sta. Rita Hills,Central Coast,Sparkling Blend,Fesstivity
69858,69858,US,"Made in the Pride style, this is a big, powerf...",,93,56.0,California,Sonoma-Napa,Napa-Sonoma,Merlot,Pride Mountain
110710,110710,Portugal,"From old vines, on pre-phylloxera terraces, th...",Quinta do Malho,91,90.0,Douro,,,Portuguese Red,J. & F. Lurton


In [34]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [35]:
df.drop('region_2', axis=1, inplace=True)

In [36]:
df.sample(10)

Unnamed: 0,country,description,designation,points,price,province,region_1,variety,winery
45,Italy,"A blend of 90% Sangiovese and 10% Canaiolo, th...",Vigneto Odoardo Beccari Riserva,90,30.0,Tuscany,Chianti Classico,Red Blend,Vignavecchia
81257,Italy,"This is a solid, medium-bodied wine that shoul...",Colle dei Venti,87,19.0,Central Italy,Montepulciano d'Abruzzo,Montepulciano,Caldora Vini
64466,Portugal,"A tight, juicy wine, with powerful new wood ar...",Quinta da Pellada Reserva,91,,Dão,,Portuguese Red,Alvaro Castro
30252,Italy,This simple Pinot Grigio has aromas of pears a...,Pulvernai,85,,Northeastern Italy,Alto Adige,Pinot Grigio,Castel Sallegg
119519,US,"A massive wine, just tremendous. Made from 97%...",The Narrow & Straight,94,60.0,California,California,White Blend,Sanguis
60025,New Zealand,"Like all of the Kumeu River Chardonnays, this ...",Estate,88,33.0,Kumeu,,Chardonnay,Kumeu River
112545,Italy,"Honey, peach and apricot characterize the nose...",,88,17.0,Northeastern Italy,Colli Orientali del Friuli,Friulano,Ermacora
107884,Austria,"Like many naturally delicate wines in 2003, th...",Zöbinger Heiligenstein,88,46.0,Kamptal,,Riesling,Hirsch
42617,Australia,This wine's flamboyant aromas of passion fruit...,Siblings,87,20.0,Western Australia,Margaret River,Sauvignon Blanc-Semillon,Leeuwin Estate
5821,Italy,"Intense aromas of tropical fruit, banana and w...",Rocca dei Leoni,88,16.0,Southern Italy,Campania,Falanghina,Villa Matilde


In [37]:
predictors = df[['country', 'description', 'designation', 'province', 'region_1', 'variety', 'winery']]

In [38]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150930 entries, 0 to 150929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   country      150925 non-null  object
 1   description  150930 non-null  object
 2   designation  105195 non-null  object
 3   province     150925 non-null  object
 4   region_1     125870 non-null  object
 5   variety      150930 non-null  object
 6   winery       150930 non-null  object
dtypes: object(7)
memory usage: 8.1+ MB


## Missing Data

For the first iteration of this recommender system, I will drop observations with missing values across the board instead of being more selective. This cuts the available data in half. Next iterations could try modelling using fewer features, but more observations.

In [39]:
predictors.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predictors.dropna(inplace=True)


In [40]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 85614 entries, 0 to 150928
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      85614 non-null  object
 1   description  85614 non-null  object
 2   designation  85614 non-null  object
 3   province     85614 non-null  object
 4   region_1     85614 non-null  object
 5   variety      85614 non-null  object
 6   winery       85614 non-null  object
dtypes: object(7)
memory usage: 5.2+ MB


In [41]:
predictors = predictors.reset_index()

In [42]:
predictors.description.duplicated().value_counts()

False    55449
True     30165
Name: description, dtype: int64

In [43]:
predictors.duplicated().value_counts()

False    85614
dtype: int64

In [44]:
predictors[(predictors.description.duplicated() == True)]

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
203,300,US,This standout Rocks District wine brings earth...,The Funk Estate,Washington,Walla Walla Valley (WA),Syrah,Saviah
289,423,US,"The aromas on this wine are quite light, conve...",Weinbau,Washington,Wahluke Slope,Grenache,Sol Stone
290,424,Spain,"A mix of smoke and toast blends with fresh, cr...",Yá Cuvée 23 Brut Rosé,Catalonia,Cava,Sparkling Blend,Sumarroca
316,480,US,Made from what Californians call the Pommard c...,Charles Vineyard Clone O5,California,Anderson Valley,Pinot Noir,Foursight
520,810,Italy,Here's a lively Moscato made in a dry style th...,Bianco Dry,Sicily & Sardinia,Noto,Moscato,Planeta
...,...,...,...,...,...,...,...,...
85609,150923,France,"Rich and toasty, with tiny bubbles. The bouque...",Demi-Sec,Champagne,Champagne,Champagne Blend,Jacquart
85610,150924,France,"Really fine for a low-acid vintage, there's an...",Diamant Bleu,Champagne,Champagne,Champagne Blend,Heidsieck & Co Monopole
85611,150926,France,"Offers an intriguing nose with ginger, lime an...",Cuvée Prestige,Champagne,Champagne,Champagne Blend,H.Germain
85612,150927,Italy,This classic example comes from a cru vineyard...,Terre di Dora,Southern Italy,Fiano di Avellino,White Blend,Terredora


In [45]:
predictors.sample(10)

Unnamed: 0,index,country,description,designation,province,region_1,variety,winery
43187,75262,France,"A wine that is now showing some maturity, the ...",Seigneurs d'Aiguilhe,Bordeaux,Côtes de Castillon,Bordeaux-style Red Blend,Château d'Aiguilhe
1460,2419,Italy,The nose is rather closed on this 80% Sangiove...,Gran Selezione,Tuscany,Chianti Classico,Red Blend,Livernano
74923,130863,Argentina,"A rough, harsh nose of mustard and burnt stems...",Premium,Mendoza Province,Mendoza,Chardonnay,Sur de los Andes
16002,27233,Italy,"Creamy and fresh, this sparkling Lugana shows ...",Brut 36,Lombardy,Lugana,Turbiana,Citari
63998,111471,US,"Soft and generous in fruity flavor, this has p...",Rockin' One,California,Paso Robles,Rhône-style Red Blend,Cass
72101,125727,US,The first of Morgan's '07 Pinots to be release...,Twelve Clones,California,Santa Lucia Highlands,Pinot Noir,Morgan
58570,102244,US,Cooked or burnt black currants and caramelized...,Reserve,California,Sierra Foothills,Syrah,Naggiar
34172,58498,Italy,Poggio Salvi's Brunello Riserva offers genuine...,Riserva,Tuscany,Brunello di Montalcino,Sangiovese Grosso,Villa Poggio Salvi
65209,113962,US,"The blend is mostly Merlot, with Syrah and Cab...",Red Table Wine,Washington,Columbia Valley (WA),Red Blend,Magnificent Wine Company
60577,105920,France,"92-94 Barrel sample. A powerful, but velvet te...",Les Forts de Latour Barrel sample,Bordeaux,Pauillac,Bordeaux-style Red Blend,Château Latour


## Feature Engineering & Unique Names for Wines

In [47]:
## Creating a more detailed name for each wine by combining Winery and Designation

predictors['name'] = predictors['winery'] + ', ' + predictors['designation']

In [48]:
predictors.drop('index', axis=1, inplace=True)

In [49]:
predictors.sample(10)

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name
5572,Australia,Although sourced from the warm northern end of...,Paradox,South Australia,Barossa,Shiraz,Yalumba,"Yalumba, Paradox"
42098,Spain,"Pinched smelling at first, with a note of turp...",Tio Pepe Fino Muy Seco,Andalucia,Jerez,Palomino,González Byass,"González Byass, Tio Pepe Fino Muy Seco"
56772,Italy,This beautiful sparkler rested on its lees for...,Casa delle Colonne Zero Riserva Millesimato,Lombardy,Franciacorta,Sparkling Blend,Fratelli Berlucchi,"Fratelli Berlucchi, Casa delle Colonne Zero Ri..."
46120,France,Clos des Myglands is a vineyard wholly owned b...,Clos des Myglands Premier Cru,Burgundy,Mercurey,Pinot Noir,Domaine Faiveley,"Domaine Faiveley, Clos des Myglands Premier Cru"
47712,France,A red cherry flavor and a rich texture show th...,Les Champs Pimont Premier Cru,Burgundy,Beaune,Pinot Noir,Domaine Champy,"Domaine Champy, Les Champs Pimont Premier Cru"
58979,Argentina,Mild varnish and lacquer aromas stem from the ...,Estate Bottled,Mendoza Province,Mendoza,Chardonnay,Pascual Toso,"Pascual Toso, Estate Bottled"
42775,US,Vanilla and cookie dough flavors dominate. The...,H3,Washington,Horse Heaven Hills,Merlot,Columbia Crest,"Columbia Crest, H3"
62085,US,A letdown after the recent string of good vint...,Estate,California,Chalk Hill,Merlot,Chalk Hill,"Chalk Hill, Estate"
65510,US,"Grown in the cool heart of the valley, but in ...",Saralee's Vineyard,California,Russian River Valley,Pinot Noir,Lost Canyon,"Lost Canyon, Saralee's Vineyard"
19764,Italy,"An excellent pizza or pasta wine, San Lorenzo ...",San Lorenzo,Central Italy,Rosso Conero,Montepulciano,Umani Ronchi,"Umani Ronchi, San Lorenzo"


In [387]:
## Leaving this in for future ideas

## Creating a UID for each wine by combining all data into one variable

#predictors['uid'] = predictors['winery'] + ', ' + predictors['designation'] + ', ' + predictors['country'] + ', ' + predictors['description'] + ', ' + predictors['province'] + ', ' + predictors['region_1'] + ', ' + predictors['variety']

### Removing Duplicate Values

In [50]:
predictors.drop_duplicates(inplace=True)

In [51]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55461 entries, 0 to 85141
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      55461 non-null  object
 1   description  55461 non-null  object
 2   designation  55461 non-null  object
 3   province     55461 non-null  object
 4   region_1     55461 non-null  object
 5   variety      55461 non-null  object
 6   winery       55461 non-null  object
 7   name         55461 non-null  object
dtypes: object(8)
memory usage: 3.8+ MB


In [52]:
predictors.country.value_counts()

US           26157
Italy        10537
France        9761
Spain         4317
Argentina     2428
Australia     2147
Canada         114
Name: country, dtype: int64

In [54]:
predictors.name.duplicated().value_counts()

False    33549
True     21912
Name: name, dtype: int64

In order for the end user to recieve recommendations, using this model and approach, they need to enter a unique name for a wine they like. With so many duplicates this becomes tricky. For now I will drop duplicated wine values, which significantly reduces the volume of data but solved the uniqueness issue. There is almost certainly a better way around this!

In [57]:
predictors.drop_duplicates(subset='name', keep='last', inplace=True)

In [58]:
predictors.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33549 entries, 1 to 85141
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   country      33549 non-null  object
 1   description  33549 non-null  object
 2   designation  33549 non-null  object
 3   province     33549 non-null  object
 4   region_1     33549 non-null  object
 5   variety      33549 non-null  object
 6   winery       33549 non-null  object
 7   name         33549 non-null  object
dtypes: object(8)
memory usage: 2.3+ MB


In [80]:
predictors.reset_index(inplace=True)

In [82]:
predictors.drop('index', axis=1, inplace=True)

## Vectorizing With Tfidf

In [83]:
vectors = TfidfVectorizer(min_df = 3,
                         max_features = None,
                         strip_accents = 'unicode',
                         analyzer = 'word',
                         token_pattern = '\w{2,}',
                         ngram_range = (1,3),
                         stop_words = 'english')

In [84]:
vectors_matrix = vectors.fit_transform(predictors['description'])

In [85]:
vectors_matrix.shape

(33549, 73445)

## Calculating Similarity

In [86]:
sig_kern = sigmoid_kernel(vectors_matrix, vectors_matrix)

In [87]:
sig_kern

array([[0.76159987, 0.76159421, 0.76159418, ..., 0.76159416, 0.76159422,
        0.76159418],
       [0.76159421, 0.76159987, 0.76159418, ..., 0.76159417, 0.76159417,
        0.76159417],
       [0.76159418, 0.76159418, 0.76159987, ..., 0.76159418, 0.76159418,
        0.76159418],
       ...,
       [0.76159416, 0.76159417, 0.76159418, ..., 0.76159987, 0.76159417,
        0.76159423],
       [0.76159422, 0.76159417, 0.76159418, ..., 0.76159417, 0.76159987,
        0.76159419],
       [0.76159418, 0.76159417, 0.76159418, ..., 0.76159423, 0.76159419,
        0.76159987]])

In [88]:
index = pd.Series(predictors.index, index=predictors['name']).drop_duplicates()

In [89]:
index['Sobon Estate, Fiddletown']

21347

In [92]:
def recommend_wine(name, sig_kern=sig_kern):
    indx = index[name]
    sigmoid_score = list(enumerate(sig_kern[indx]))
    sigmoid_score = sorted(sigmoid_score, key = lambda x:x[1], reverse = True)
    sigmoid_score = sigmoid_score[1:4]
    position = [i[0] for i in sigmoid_score]
    return predictors.iloc[position]

In [93]:
recommend_wine('Sobon Estate, Fiddletown')

Unnamed: 0,country,description,designation,province,region_1,variety,winery,name
26069,Italy,"A pleasant, traditionally made Dolcetto, the R...",Colombè,Piedmont,Dolcetto d'Alba,Dolcetto,Renato Ratti,"Renato Ratti, Colombè"
10765,US,"A lightly tawny straw color, this is scented w...",Winemakers 25th Anniversary,Oregon,Oregon,Chardonnay,Oak Knoll,"Oak Knoll, Winemakers 25th Anniversary"
4480,US,"Soft, round and textured, this is a minty Zinf...","Romanzo Moon, Shoup Vineyard",California,Lodi,Zinfandel,Lapis Luna,"Lapis Luna, Romanzo Moon, Shoup Vineyard"
