In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
# This is a list from winemag, through kaggle.com
# The file had to be converted into an Excel file to reduce the memory
# to upload into GitHub

# Read csv into pandas
wm_df=pd.read_csv('winemag-data-130k-v2.csv')

# Add source column and fill with 'WM'
wm_df['source']='WM'

# Extract vintage year from title (not all wines have a vintage provided)
info_vin=wm_df["title"]
vint=[]
for v in info_vin:
    vin_yr=''
    for i in v.split():
        if i.isdigit():
            vin_yr=vin_yr+i
    if vin_yr=='':
        vint.append('NaN')
    else:
        vint.append(int(vin_yr))
wm_df['vintage']=vint

wm_df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,source,vintage
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,WM,2013
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,WM,2011
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,WM,2013
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,WM,2013
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,WM,2012


In [3]:
# wines.xlsx is from dataworld

# Wines.csv has non-UTF-8 encoding, so we had to read the excel version
wine_df = pd.read_excel('Wines.xlsx')

# Add source column and fill with 'DM'
wine_df['source']='DM'

# Change the vintage from datetime or string to interger year (both formats were in the excel file)
vin=wine_df["Vintage"]
vin_year=[]
for v in vin:
    if type(v)==datetime.datetime:
        vin_year.append(int(v.year))
    else:
        vin_year.append(int(v[4:8]))
wine_df["Vintage"]=vin_year
wine_df.head()

Unnamed: 0,Vintage,Country,County,Designation,Points,Price,Province,Title,Variety,Winery,source
0,1919,Spain,Cava,1919 Brut Selecció,88,$13.00,Catalonia,L'Arboc NV 1919 Brut Selecció Sparkling (Cava),Sparkling Blend,L'Arboc,DM
1,1929,Italy,Vernaccia di San Gimignano,,87,$14.00,Tuscany,Guidi 1929 2015 Vernaccia di San Gimignano,Vernaccia,Guidi 1929,DM
2,1929,Italy,Sangiovese di Romagna Superiore,Prugneto,84,$15.00,Central Italy,Poderi dal Nespoli 1929 2011 Prugneto (Sangiov...,Sangiovese,Poderi dal Nespoli 1929,DM
3,1934,Portugal,,Reserva Velho,93,$495.00,Colares,Adega Viuva Gomes 1934 Reserva Velho Red (Cola...,Ramisco,Adega Viuva Gomes,DM
4,1945,France,Rivesaltes,Legend Vintage,95,$350.00,Languedoc-Roussillon,Gérard Bertrand 1945 Legend Vintage Red (Rives...,Red Blend,Gérard Bertrand,DM


In [4]:
# Columns to keep (wm_df is lower case, wine_df is capitalized):
# title, vintage, country, winery, designation, points, price, source

In [5]:
# Clean the Winemag-130k-v2 File

#Set the First Column Name as ID
wm_df = wm_df.rename(columns={'Unnamed: 0':'id'})

#Drop Unneeded Columns
wm_df = wm_df.drop(columns=['id', 'description', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'variety'])
wm_df


Unnamed: 0,country,designation,points,price,title,winery,source,vintage
0,Italy,Vulkà Bianco,87,,Nicosia 2013 Vulkà Bianco (Etna),Nicosia,WM,2013
1,Portugal,Avidagos,87,15.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Quinta dos Avidagos,WM,2011
2,US,,87,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Rainstorm,WM,2013
3,US,Reserve Late Harvest,87,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,St. Julian,WM,2013
4,US,Vintner's Reserve Wild Child Block,87,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Sweet Cheeks,WM,2012
...,...,...,...,...,...,...,...,...
129966,Germany,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Dr. H. Thanisch (Erben Müller-Burggraef),WM,2013
129967,US,,90,75.0,Citation 2004 Pinot Noir (Oregon),Citation,WM,2004
129968,France,Kritt,90,30.0,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Domaine Gresser,WM,2013
129969,France,,90,32.0,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Domaine Marcel Deiss,WM,2012


In [6]:
#Keep only Rows not Containing NaN Values in the Designated Columns
wm_df = wm_df[wm_df['country'].notna()]
wm_df = wm_df[wm_df['points'].notna()]
wm_df = wm_df[wm_df['price'].notna()]
wm_df = wm_df[wm_df['winery'].notna()]
wm_df = wm_df[wm_df['source'].notna()]
wm_df = wm_df[wm_df['vintage'].notna()]
wm_df

Unnamed: 0,country,designation,points,price,title,winery,source,vintage
1,Portugal,Avidagos,87,15.0,Quinta dos Avidagos 2011 Avidagos Red (Douro),Quinta dos Avidagos,WM,2011
2,US,,87,14.0,Rainstorm 2013 Pinot Gris (Willamette Valley),Rainstorm,WM,2013
3,US,Reserve Late Harvest,87,13.0,St. Julian 2013 Reserve Late Harvest Riesling ...,St. Julian,WM,2013
4,US,Vintner's Reserve Wild Child Block,87,65.0,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Sweet Cheeks,WM,2012
5,Spain,Ars In Vitro,87,15.0,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tandem,WM,2011
...,...,...,...,...,...,...,...,...
129966,Germany,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Dr. H. Thanisch (Erben Müller-Burggraef),WM,2013
129967,US,,90,75.0,Citation 2004 Pinot Noir (Oregon),Citation,WM,2004
129968,France,Kritt,90,30.0,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Domaine Gresser,WM,2013
129969,France,,90,32.0,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Domaine Marcel Deiss,WM,2012


In [7]:
# Group by Title and Return Averages of Points & Price
wm_df_group = wm_df.groupby('title').mean()

# Order the Data By Points to Obtain the Top 5 & Bottom 5 Scoring Wine Titles
wm_df_group = wm_df_group.sort_values('points', ascending=False)
wm_df_group

Unnamed: 0_level_0,points,price
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Avignonesi 1995 Occhio di Pernice (Vin Santo di Montepulciano),100.0,210.0
Cardinale 2006 Cabernet Sauvignon (Napa Valley),100.0,200.0
Quinta do Noval 2011 Nacional Vintage (Port),100.0,650.0
Château Léoville Las Cases 2010 Saint-Julien,100.0,359.0
Louis Roederer 2008 Cristal Vintage Brut (Champagne),100.0,250.0
...,...,...
Cristobal 1492 2009 1492 White White (Mendoza),80.0,8.0
Cristalino NV Extra Dry (Penedès),80.0,9.0
Flock by Smoking Loon 2007 Merlot (Napa Valley),80.0,15.0
Stella Mia NV Rosso e Dolce Brachetto (Italy),80.0,11.0
