# Wine Magazine Reviews

In [None]:
import pandas as pd

# Load the reviews, using the first CSV column as the DataFrame index.
wine_reviews = pd.read_csv(
    "../Wine Reviews/winemag-data-130k-v2.csv",
    index_col=0,
)

### shape
`shape` is an attribute (accessed without parentheses) that returns a `(rows, cols)` tuple.

In [None]:
# `shape` is read as an attribute, so there are no call parentheses.
wine_reviews.shape

### head()

By default, `head()` returns the first five rows.
Pass an integer to return that many rows instead.

In [None]:
# First five rows, with the default row count spelled out.
wine_reviews.head(n=5)

In [None]:
# First ten rows.
wine_reviews.head(n=10)

### Native Accessors

In [None]:
# Select the `country` column as a Series (bracket form of the attribute access).
wine_reviews['country']

In [None]:
# Country of the row labelled 0, fetched with one label-based lookup.
wine_reviews.loc[0, 'country']

In [None]:
# Pick four rows by label and four location-related columns by name.
df = wine_reviews.loc[
    [0, 1, 10, 100],
    ["country", "province", "region_1", "region_2"],
]
df

In [None]:
# First row by position, with every column, returned as a Series.
wine_reviews.iloc[0, :]

In [None]:
# First column for the rows at positions 0, 1 and 2.
wine_reviews.iloc[0:3, 0]

In [None]:
# Positional slice: rows at positions 1 and 2 (the stop, 3, is excluded) of the first column.
wine_reviews.iloc[1:3, 0]

In [None]:
# Label-based slice: unlike `iloc`, the end label (10) is included.
wine_reviews.loc[
    :10,
    ['taster_name', 'taster_twitter_handle', 'points'],
]

In [None]:
# The first ten review descriptions.
first_descriptions = wine_reviews['description'].head(10)
first_descriptions

### Conditional Selection

In [None]:
# Boolean Series: True where the review's country is Italy.
wine_reviews['country'].eq('Italy')

In [None]:
# Italian wines scoring 90 points or fewer; both conditions must hold.
wine_reviews.loc[
    wine_reviews['country'].eq('Italy') & wine_reviews['points'].le(90)
]

### isin()
lets you select rows whose value matches any item in a given list of values.

In [None]:
# Reviews of wines from either Italy or France.
wine_reviews[wine_reviews['country'].isin(['Italy', 'France'])]

In [None]:
# Keep only the rows whose index label is one of the listed values.
sample_reviews = wine_reviews[wine_reviews.index.isin([1, 2, 3, 5, 8])]
sample_reviews

In [None]:
# Wines from Australia or New Zealand that scored at least 95 points.
top_oceania_wines = wine_reviews.loc[
    wine_reviews['points'].ge(95)
    & wine_reviews['country'].isin(['Australia', 'New Zealand'])
]


### notnull()

In [None]:
# Rows that have a price recorded.
wine_reviews[wine_reviews['price'].notna()]

### isnull()

In [None]:
# Rows where the price is missing.
wine_reviews[wine_reviews['price'].isna()]

### Summary Functions

#### describe()
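generates a high-level summary of the given column; the statistics reported depend on the column's data type (compare the numeric `points` column with the text `country` column below).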

In [None]:
# Summary statistics for the `points` column.
wine_reviews['points'].describe()

In [None]:
# Summary of the `country` column.
wine_reviews['country'].describe()

In [None]:
# Average score across all reviews.
wine_reviews['points'].mean()

In [None]:
# Distinct values that appear in the `country` column.
wine_reviews['country'].unique()

In [None]:
# Number of reviews of wines from each country.
wine_reviews['country'].value_counts()

In [None]:
# Number of reviews written by each named taster.
wine_reviews['taster_name'].value_counts()

In [None]:
# Median review score.
median_points = wine_reviews['points'].median()
median_points

In [None]:
# Price expressed as a deviation from the mean price.
centered_price = wine_reviews['price'].sub(wine_reviews['price'].mean())
centered_price

In [None]:
# Title of the wine with the highest points-to-price ratio.
bargain_wine = wine_reviews.loc[
    wine_reviews['points'].div(wine_reviews['price']).idxmax(),
    'title',
]
bargain_wine

In [None]:
# Count the reviews whose description mentions each descriptor.
# The vectorised `.str.contains` replaces a per-row Python lambda:
#   * regex=False keeps it a plain, case-sensitive substring test (what `in` did);
#   * na=False treats a missing description as "no match" instead of raising.
n_trop = wine_reviews.description.str.contains("tropical", regex=False, na=False).sum()
n_fruity = wine_reviews.description.str.contains("fruity", regex=False, na=False).sum()
descriptor_counts = pd.Series([n_trop, n_fruity], index=['tropical', 'fruity'])
descriptor_counts

A score of 95 or higher counts as 3 stars, a score of at least 85 but less than 95 is 2 stars. Any other score is 1 star.

Also, the Canadian Vintners Association bought a lot of ads on the site, so any wines from Canada should automatically get 3 stars, regardless of points.

In [None]:
def stars(row):
    """Return the star rating (1, 2 or 3) for a single review row.

    Canadian wines always get 3 stars; otherwise 95+ points earns 3 stars,
    85-94 points earns 2 stars, and anything lower earns 1 star.
    """
    # The country override is checked first, then the top score band.
    if row.country == 'Canada' or row.points >= 95:
        return 3
    if row.points >= 85:
        return 2
    return 1

# Apply `stars` to every row (axis=1 hands each row to the function).
star_ratings = wine_reviews.apply(stars, axis=1)
star_ratings

In [None]:
# Number of reviews at each score.
wine_reviews['points'].value_counts()

Another way to get the same counts is with `groupby()`:

### groupby()

In [None]:
# Group the reviews by score and count the entries in each group.
wine_reviews.groupby('points')['points'].count()

In [None]:
# Per-country number of reviews plus the lowest and highest price.
# 'min' and 'max' are passed by name because handing the Python builtins to
# `agg` is deprecated in recent pandas and raises a FutureWarning.
# `len` stays a callable so the first column is still labelled 'len' and
# still counts reviews whose price is missing.
wine_reviews.groupby(['country']).price.agg([len, 'min', 'max'])

In [None]:
# For each (country, province) pair, keep the row with the highest score.
wine_reviews.groupby(['country', 'province']).apply(
    lambda group: group.loc[group['points'].idxmax()]
)


In [None]:
# Same best-per-province lookup, restricted to Canadian wines.
wine_reviews[wine_reviews['country'].eq('Canada')].groupby(['country', 'province']).apply(
    lambda group: group.loc[group['points'].idxmax()]
)

In [None]:
# Number of descriptions per (country, province); the result column is named 'len'.
countries_reviewed = (
    wine_reviews
    .groupby(['country', 'province'])['description']
    .agg([len])
)
countries_reviewed

### Sorting

> Ascending

In [None]:
# Fewest reviews first (ascending is the default order).
countries_reviewed.sort_values('len')

> Descending

In [None]:
# Most reviews first.
countries_reviewed.sort_values('len', ascending=False)

In [None]:
# Sort by the `country` index level first, then by review count within each country.
countries_reviewed.sort_values(['country', 'len'])

In [None]:
# Number of reviews per Twitter handle.
# `wine_reviews.groupby('taster_twitter_handle').size()` is an alternative spelling.
wine_reviews.groupby('taster_twitter_handle')['taster_twitter_handle'].count()

In [None]:
# Best score seen at each price, ordered by price.
wine_reviews.groupby('price')['points'].max().sort_index()

In [None]:
# Lowest and highest price for each grape variety.
price_extremes = wine_reviews.groupby('variety')['price'].agg(['min', 'max'])
price_extremes

In [None]:
# Highest minimum price first; ties are broken by the highest maximum price.
price_extremes.sort_values(['min', 'max'], ascending=[False, False])

In [49]:
# Average score given by each named taster.
reviewer_mean_ratings = wine_reviews.groupby('taster_name')['points'].mean()
reviewer_mean_ratings

taster_name
Alexander Peartree    85.855422
Anna Lee C. Iijima    88.415629
Anne Krebiehl MW      90.562551
Carrie Dykes          86.395683
Christina Pickard     87.833333
Fiona Adams           86.888889
Jeff Jenssen          88.319756
Jim Gordon            88.626287
Joe Czerwinski        88.536235
Kerin O’Keefe         88.867947
Lauren Buzzeo         87.739510
Matt Kettmann         90.008686
Michael Schachner     86.907493
Mike DeSimone         89.101167
Paul Gregutt          89.082564
Roger Voss            88.708003
Sean P. Sullivan      88.755739
Susan Kostrzewa       86.609217
Virginie Boone        89.213379
Name: points, dtype: float64

In [51]:
# Review count for every (country, variety) pair, most common first.
(
    wine_reviews
    .groupby(['country', 'variety'])
    .size()
    .sort_values(ascending=False)
)

country  variety                 
US       Pinot Noir                  9885
         Cabernet Sauvignon          7315
         Chardonnay                  6801
France   Bordeaux-style Red Blend    4725
Italy    Red Blend                   3624
                                     ... 
Mexico   Cinsault                       1
         Grenache                       1
         Merlot                         1
         Rosado                         1
Uruguay  White Blend                    1
Length: 1612, dtype: int64

### Data Types

In [52]:
# Show the dtype of every column (`dtypes` is an attribute, so no parentheses).
wine_reviews.dtypes

country                   object
description               object
designation               object
points                     int64
price                    float64
province                  object
region_1                  object
region_2                  object
taster_name               object
taster_twitter_handle     object
title                     object
variety                   object
winery                    object
dtype: object

### Missing Values

In [55]:
# Rows where the country is missing.
wine_reviews[wine_reviews['country'].isnull()]

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
913,,"Amber in color, this wine has aromas of peach ...",Asureti Valley,87,30.0,,,,Mike DeSimone,@worldwineguys,Gotsa Family Wines 2014 Asureti Valley Chinuri,Chinuri,Gotsa Family Wines
3131,,"Soft, fruity and juicy, this is a pleasant, si...",Partager,83,,,,,Roger Voss,@vossroger,Barton & Guestier NV Partager Red,Red Blend,Barton & Guestier
4243,,"Violet-red in color, this semisweet wine has a...",Red Naturally Semi-Sweet,88,18.0,,,,Mike DeSimone,@worldwineguys,Kakhetia Traditional Winemaking 2012 Red Natur...,Ojaleshi,Kakhetia Traditional Winemaking
9509,,This mouthwatering blend starts with a nose of...,Theopetra Malagouzia-Assyrtiko,92,28.0,,,,Susan Kostrzewa,@suskostrzewa,Tsililis 2015 Theopetra Malagouzia-Assyrtiko W...,White Blend,Tsililis
9750,,This orange-style wine has a cloudy yellow-gol...,Orange Nikolaevo Vineyard,89,28.0,,,,Jeff Jenssen,@worldwineguys,Ross-idi 2015 Orange Nikolaevo Vineyard Chardo...,Chardonnay,Ross-idi
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124176,,This Swiss red blend is composed of four varie...,Les Romaines,90,30.0,,,,Jeff Jenssen,@worldwineguys,Les Frères Dutruy 2014 Les Romaines Red,Red Blend,Les Frères Dutruy
129407,,Dry spicy aromas of dusty plum and tomato add ...,Reserve,89,22.0,,,,Michael Schachner,@wineschach,El Capricho 2015 Reserve Cabernet Sauvignon,Cabernet Sauvignon,El Capricho
129408,,El Capricho is one of Uruguay's more consisten...,Reserve,89,22.0,,,,Michael Schachner,@wineschach,El Capricho 2015 Reserve Tempranillo,Tempranillo,El Capricho
129590,,"A blend of 60% Syrah, 30% Cabernet Sauvignon a...",Shah,90,30.0,,,,Mike DeSimone,@worldwineguys,Büyülübağ 2012 Shah Red,Red Blend,Büyülübağ


In [57]:
# The raw `region_1` column, missing values included.
wine_reviews['region_1']

0                        Etna
1                         NaN
2           Willamette Valley
3         Lake Michigan Shore
4           Willamette Valley
                 ...         
129966                    NaN
129967                 Oregon
129968                 Alsace
129969                 Alsace
129970                 Alsace
Name: region_1, Length: 129971, dtype: object

In [56]:
# Replace missing regions with the placeholder "Unknown".
# This returns a new Series; `wine_reviews` itself is not modified.
wine_reviews['region_1'].fillna(value="Unknown")

0                        Etna
1                     Unknown
2           Willamette Valley
3         Lake Michigan Shore
4           Willamette Valley
                 ...         
129966                Unknown
129967                 Oregon
129968                 Alsace
129969                 Alsace
129970                 Alsace
Name: region_1, Length: 129971, dtype: object

In [58]:
# Swap one Twitter handle for another wherever it appears.
# This returns a new Series; `wine_reviews` itself is not modified.
wine_reviews['taster_twitter_handle'].replace(
    to_replace="@kerinokeefe",
    value="@kerino",
)

0             @kerino
1          @vossroger
2         @paulgwine 
3                 NaN
4         @paulgwine 
             ...     
129966            NaN
129967    @paulgwine 
129968     @vossroger
129969     @vossroger
129970     @vossroger
Name: taster_twitter_handle, Length: 129971, dtype: object

In [64]:
# Scores converted from integers to strings.
point_strings = wine_reviews['points'].astype(str)
point_strings

0         87
1         87
2         87
3         87
4         87
          ..
129966    90
129967    90
129968    90
129969    90
129970    90
Name: points, Length: 129971, dtype: object

In [65]:
# Number of reviews with no price; summing the boolean mask counts the True values.
n_missing_prices = wine_reviews['price'].isna().sum()
n_missing_prices

8996

Equivalent ways to compute the same count:

`n_missing_prices = pd.isnull(wine_reviews.price).sum()`

or

`missing_price_reviews = wine_reviews[wine_reviews.price.isnull()]`

`n_missing_prices = len(missing_price_reviews)`

In [70]:
# Reviews per region, with missing regions grouped under "Unknown", largest first.
reviews_per_region = (
    wine_reviews['region_1']
    .fillna('Unknown')
    .value_counts()
    .sort_values(ascending=False)
)
reviews_per_region

region_1
Unknown                 21247
Napa Valley              4480
Columbia Valley (WA)     4124
Russian River Valley     3091
California               2629
                        ...  
Offida Rosso                1
Corton Perrières            1
Isle St. George             1
Geelong                     1
Paestum                     1
Name: count, Length: 1230, dtype: int64