In [1]:
import pandas as pd
# Cap how many rows pandas prints for any frame or series shown below.
pd.options.display.max_rows = 5

# Load the wine reviews; the first CSV column becomes the row index.
reviews = pd.read_csv(
    "../datasets/winemag-data-130k-v2.csv",
    index_col=0,
)

# learntools hooks (presumably the exercise checker), left disabled here:
#from learntools.core import binder; binder.bind(globals())
#from learntools.pandas.summary_functions_and_maps import *
print("Setup complete.")

# Preview the first five rows of the loaded frame.
reviews.head(n=5)

Setup complete.


Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [2]:
# The `dtype` attribute of a single column reports that column's data type.
reviews["price"].dtype

dtype('float64')

In [4]:
# The `dtypes` attribute of a DataFrame reports the data type of every column at once,
# as a Series keyed by column name.
reviews.dtypes

country        object
description    object
                ...  
variety        object
winery         object
Length: 13, dtype: object

In [None]:
# One peculiarity to keep in mind (and on display very clearly in the output above)
# is that columns consisting entirely of strings do not get their own dtype; they are instead given the `object` dtype.

In [5]:
# astype() converts a column from one dtype to another wherever such a
# conversion makes sense; here the `points` column is cast to float64.
reviews["points"].astype(dtype="float64")

0         87.0
1         87.0
          ... 
129969    90.0
129970    90.0
Name: points, Length: 129971, dtype: float64

In [7]:
# Missing entries are represented by NaN ("Not a Number").
# For technical reasons NaN values are always of the float64 dtype.

# pandas ships helpers dedicated to missing data: pd.isnull() flags NaN entries
# and its companion pd.notnull() flags the non-missing ones.

# Keep only the rows whose price is missing.
reviews.loc[pd.isnull(reviews["price"])]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
13,Italy,This is dominated by oak and oak-driven aromas...,Rosso,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Masseria Setteporte 2012 Rosso (Etna),Nerello Mascalese,Masseria Setteporte
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129893,Italy,"Aromas of passion fruit, hay and a vegetal not...",Corte Menini,91,,Veneto,Soave Classico,,Kerin O’Keefe,@kerinokeefe,Le Mandolare 2015 Corte Menini (Soave Classico),Garganega,Le Mandolare
129964,France,"Initially quite muted, this wine slowly develo...",Domaine Saint-Rémy Herrenweg,90,,Alsace,Alsace,,Roger Voss,@vossroger,Domaine Ehrhart 2013 Domaine Saint-Rémy Herren...,Gewürztraminer,Domaine Ehrhart


In [None]:
# Replacing missing values is a common operation, and fillna() is the handy
# pandas method for it. It supports a few different strategies.
# The simplest, shown below, replaces every NaN with a fixed placeholder ("Unknown").
# Another option (not used here) is the backfill strategy: fill each missing value
# with the first non-null value that appears after it in the data (see Series.bfill()).
reviews["region_2"].fillna(value="Unknown")

0         Unknown
1         Unknown
           ...   
129969    Unknown
129970    Unknown
Name: region_2, Length: 129971, dtype: object

In [9]:
# Sometimes the value to swap out is not missing at all. Suppose that, since this
# dataset was published, reviewer Kerin O'Keefe changed her Twitter handle from
# @kerinokeefe to @kerino. The replace() method is one way to reflect that.

reviews["taster_twitter_handle"].replace(to_replace="@kerinokeefe", value="@kerino")

0            @kerino
1         @vossroger
             ...    
129969    @vossroger
129970    @vossroger
Name: taster_twitter_handle, Length: 129971, dtype: object

In [3]:
# Exercise: what are the most common wine-producing regions?
# Build a Series counting how often each value occurs in `region_1`.
# The field is often missing, so NaN is replaced with "Unknown" before counting,
# and the counts are then sorted in descending order.

reviews_per_region = (
    reviews["region_1"]
    .fillna("Unknown")
    .value_counts()
    # value_counts() already returns counts in descending order; the explicit
    # sort is kept to mirror the task statement.
    .sort_values(ascending=False)
)
print(reviews_per_region)

region_1
Unknown                21247
Napa Valley             4480
                       ...  
Bardolino Superiore        1
Paestum                    1
Name: count, Length: 1230, dtype: int64
