In [None]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np

In [None]:
wine_table = pd.read_csv('winemag-data_first150k.csv')
pd.set_option("max_columns", None)
pd.set_option("max_rows", None)
wine_table.head()


## Creating numerical points for non-numerical data

In [90]:
# Finding unique varieties of wine in data 

print('Variety:')

varieties = list(set(wine_table['variety'].to_list()))
wine_table['variety'].sort_values().unique()


# Displaying a sample of varieties
print(varieties[0:50])


Variety:
['Vernaccia', 'Pied de Perdrix', 'Tempranillo-Cabernet Sauvignon', 'Antão Vaz', 'Sangiovese Cabernet', 'Gelber Traminer', 'Sauvignon Blanc-Verdejo', 'Prié Blanc', 'Merlot-Malbec', 'Catalanesca', 'Teran', 'Auxerrois', 'Gamay Noir', 'Pinot Bianco', 'Portuguese Red', 'Cortese', 'Macabeo-Chardonnay', 'Zweigelt', 'Okuzgozu', 'Tokay', 'Claret', "Cesanese d'Affile", 'Cabernet Franc-Merlot', 'Roter Traminer', 'Gros Manseng', 'Negroamaro', 'Trepat', 'Roussanne-Grenache Blanc', 'Romorantin', 'Vidadillo', 'Alvarinho-Chardonnay', 'Pansa Blanca', 'Pinot Blanc-Pinot Noir', 'Rhône-style Red Blend', 'Syrah-Carignan', 'Petite Syrah', 'Pinot Blanc', 'Shiraz-Viognier', 'Monastrell-Syrah', 'Enantio', 'Fer Servadou', 'Gragnano', 'Petit Meslier', 'Syrah-Petite Sirah', 'Morillon', 'Malvasia-Viura', 'Malvar', 'Charbono', 'Ciliegiolo', 'Silvaner']


In [91]:
# We will be using the index number from this set to act as the unique number identifier for each variety

print(varieties.index('Malbec'))

304


## Creating a 3-d point to represent a specific wine

Here in the data frame below, one can see two Malbec wines with very similar points and prices, our goal is to be able to find the most similar wine to our search wine as possible.

First we need to create an (x,y,z) coordinate to represent the (variety,price,points)

In [84]:
# Given a specific wine, we want to be able to predict the wine closest or most similar to the specified wine based on variety, price and score

is_Malbec =  wine_table['variety']=='Malbec'
malbec_table = wine_table[is_Malbec]

# Checking to see that filter worked
print('Malbec:')
print(malbec_table['variety'].sort_values().unique())
malbec_table.head()



Malbec:
['Malbec']


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
18,18,France,Coming from a seven-acre vineyard named after ...,Le Pigeonnier,95,290.0,Southwest France,Cahors,,Malbec,Château Lagrézette
60,60,Argentina,"Concentrated, ripe blackberry and cassis aroma...",The Apple Doesn't Fall Far From The Tree,91,30.0,Mendoza Province,Mendoza,,Malbec,Matias Riccitelli
62,62,Argentina,Smoky aromas of fresh-cut wood blend with berr...,Alegoría Gran Reserva,91,25.0,Mendoza Province,Mendoza,,Malbec,Navarro Correas
115,115,Argentina,"Aromas of prune, raisin and black plum are ful...",Reserva,86,15.0,Mendoza Province,Valle de Uco,,Malbec,Viñalba
190,190,Argentina,Dark-berry aromas are crisp and show a note of...,Reserve,87,15.0,Mendoza Province,Tupungato,,Malbec,Tupun


In [139]:
# Creating points from wine rows to find euclidean distance
# Taking the example of an under 20 dollar bottle of Malbec (This corresponds to row 115 in the data set)
wine = wine_table.loc[115]
print(wine)
print("===================================")
print(wine['variety'])
print(varieties.index(wine['variety']))
print(int(wine['price']))
print(wine['points'])
print("===================================")

variety_id = varieties.index(wine['variety'])

# intializing point for search wine
search  = np.array(((varieties.index(wine['variety'])), (int(wine['price'])), (wine['points']))) 

print("search element:")
print(search)

Unnamed: 0                                                   115
country                                                Argentina
description    Aromas of prune, raisin and black plum are ful...
designation                                              Reserva
points                                                        86
price                                                         15
province                                        Mendoza Province
region_1                                            Valle de Uco
region_2                                                     NaN
variety                                                   Malbec
winery                                                   Viñalba
Name: 115, dtype: object
Malbec
304
15
86
search element:
[304  15  86]


# Calculating Similarity

Here we use the Euclidean distance between our search wine and any other wine to see which one is the most similar.
As you can see, from the two wines below, a wine of the same variety and similar price  and points will have a much smaller "distance" from our search wine than that of a wine of different variety, and less similar price and points

A more basic example using simple integers can be found at: https://www.w3resource.com/python-exercises/math/python-math-exercise-79.php

In [138]:
# Calculating Euclidean distance for two random wines

elem1 = wine_table.loc[86]
print("first random wine: ")
print(elem1)
element1 = np.array(((varieties.index(elem1['variety'])), (int(elem1['price'])), (elem1['points'])))
print("")
print("first random wine point: ")
print(element1)
print("====================================================================================")


elem2 = wine_table.loc[190]
print("second random wine: ")
print(elem2)
element2 = np.array(((varieties.index(elem2['variety'])), (int(elem2['price'])), (elem2['points'])))
print("")
print("second random wine point: ")
print(element2)
print("====================================================================================")
print("")

print("search element:")
print(search)
print("====================================================================================")

# printing Euclidean distance 
print("")
print("Difference between first random wine and our wine")
dist1 = np.linalg.norm(search - element1) 
print(dist1)
print("")
print("Difference between second random wine and our wine")
dist2 = np.linalg.norm(search - element2) 
print(dist2)

first random wine: 
Unnamed: 0                                                    86
country                                                   France
description    This delicious, refreshing wine is textured, t...
designation                                Montmains Premier Cru
points                                                        91
price                                                         45
province                                                Burgundy
region_1                                                 Chablis
region_2                                                     NaN
variety                                               Chardonnay
winery                                  Domaine Gérard Duplessis
Name: 86, dtype: object

first random wine point: 
[265  45  91]
second random wine: 
Unnamed: 0                                                   190
country                                                Argentina
description    Dark-berry aromas are crisp and sh

Here we see the difference between wines, using this we can find the wine with the smallest difference from our search wine, not including itself

In [137]:
# looping through data-frame to find most similar wine and recording the wine with the smallest difference

print("Search Element:")
print("")
print(wine['variety'])
print(varieties.index(wine['variety']))
print(int(wine['price']))
print(wine['points'])
print("")
print(search)
print("")
print("====================================================================================")



print("Total number of Wines:")
print(len(wine_table.index))

print("")
# For the sake of saving calculation time and space I will be running the distance algorithm on a filtered set of wines
# This set is the set of wines of the same variety
print("Total number of Malbec wines:")
print(len(malbec_table.index))


Search Element:

Malbec
304
15
86

[304  15  86]

Total number of Wines:
150930

Total number of Malbec wines:
3208


In [133]:


# Here we could have used the entire table but for the sake of the tutorial and calculation time we will use the filered table
def findSimilar():
    min_row = 0
    min_dist = 100.0
    
    for x in malbec_table.dropna().index:
        elem = wine_table.loc[x]
        point = np.array(((varieties.index(elem['variety'])), (int(elem['price'])), (elem['points'])))
        dist = np.linalg.norm(search - point) 
        if dist < min_dist and dist > 0:
            min_dist = dist
            min_row = x
            
    return min_row
##
similar = findSimilar()

print("Wine most similar to our input row is:")
most_sim = wine_table.loc[similar]
print(wine_table.loc[similar])


Wine most similar to our input row is:
Unnamed: 0                                                  8500
country                                                       US
description    The generous fruit flavors in this medium-bodi...
designation                              The Heritage Collection
points                                                        86
price                                                         14
province                                              California
region_1                                                    Lodi
region_2                                          Central Valley
variety                                                   Malbec
winery                                                   Peirano
Name: 8500, dtype: object


In [140]:
print("Search Wine:")
print("")
print(wine['variety'])
print(int(wine['price']))
print(wine['points'])
print("")
print(wine)
print("______________________________")
print("Most Similar Wine:")
print("")
print(most_sim['variety'])
print(int(most_sim['price']))
print(most_sim['points'])
print("")
print(most_sim)
print("")

Search Wine:

Malbec
15
86

Unnamed: 0                                                   115
country                                                Argentina
description    Aromas of prune, raisin and black plum are ful...
designation                                              Reserva
points                                                        86
price                                                         15
province                                        Mendoza Province
region_1                                            Valle de Uco
region_2                                                     NaN
variety                                                   Malbec
winery                                                   Viñalba
Name: 115, dtype: object
______________________________
Most Similar Wine:

Malbec
14
86

Unnamed: 0                                                  8500
country                                                       US
description    The generous fruit fla