# Wine words

In [1]:
import pandas as pd
import numpy as np
import string

In [2]:
df = pd.read_csv('../data/winemag-150k-reviews.csv',
                 usecols=['country',
                          'province',
                          'description',
                          'variety'])

1. lower the strings of the serise
2. use .splite method > it will break the string to lists
3. .explode will make those lists to a single words per row
4. clear the punctuations

now return what the function is made for. 

In [3]:
def top_10_words(s):
    common_wine_words = ['flavors',
                         'pslste',
                         'finish',
                         'drink',
                         'aromas']
    
    words = s.str.lower().str.split().explode().str.strip(string.punctuation)

    return_ = words.loc[(words.str.len() >= 5) & (~words.isin(common_wine_words))].value_counts().head(10)
    return return_

In [4]:
top_10_words(df['description'])

description
fruit      56496
acidity    32603
tannins    32186
cherry     30685
palate     28945
black      24599
spice      22658
sweet      21284
notes      19606
fresh      17662
Name: count, dtype: int64

 10 most common words used in French wine reviews

In [5]:
top_10_words(df.loc[df['country'] == 'France', 'description'])

description
fruit        8722
acidity      8641
tannins      6509
fruits       5459
fresh        4217
character    3499
black        3124
texture      3074
years        2957
crisp        2879
Name: count, dtype: int64

In [6]:
top_10_words(df.loc[df['province'] != 'California', 'description'])

description
fruit      46517
palate     25497
acidity    22325
tannins    22007
cherry     19473
spice      18572
black      17785
notes      16591
fresh      16221
berry      15499
Name: count, dtype: int64

find the words most commonly associated with wines made outside of California

In [7]:
top_10_words(df.loc[df['province'] != 'California', 'description'])

description
fruit      46517
palate     25497
acidity    22325
tannins    22007
cherry     19473
spice      18572
black      17785
notes      16591
fresh      16221
berry      15499
Name: count, dtype: int64

**NOTE:** Words used most often for white, red and rose wine are Chardonny, Sauvigon Blance, Riesling.

In [9]:
top_10_words(df.loc[df['variety'].isin(['Chardonnay', 'Sauvignon Blanc', 'Riesling']), 'description'])

description
fruit         9156
acidity       8367
apple         5893
citrus        5384
palate        5244
chardonnay    4914
crisp         4908
green         4191
notes         4030
pineapple     3859
Name: count, dtype: int64

In [12]:
top_10_words(df.loc[df['variety'].isin(['Pinot Noir',
                                        'Cabernet Sauvignon',
                                        'Syrah', 
                                        'Merlot',
                                        'Zinfandel']), 'description'])

description
fruit         15081
cherry        14041
tannins       13173
black          9551
blackberry     6777
acidity        6353
pinot          6342
sweet          5984
palate         5625
cherries       5378
Name: count, dtype: int64

In [13]:
top_10_words(df.loc[df['variety'] == 'Rosé', 'description'])

description
acidity       1135
fruit          697
crisp          672
fresh          622
strawberry     534
light          518
raspberry      510
cherry         470
fruity         428
fruits         420
Name: count, dtype: int64

10 most common words for the five most commonly mentioned wine varieties

In [15]:
df['variety'].value_counts().head()

variety
Chardonnay                  14482
Pinot Noir                  14291
Cabernet Sauvignon          12800
Red Blend                   10062
Bordeaux-style Red Blend     7347
Name: count, dtype: int64

In [18]:
five_most_mentioned_wines = df['variety'].value_counts().head(5).index


# finding description of thoes 5 mentions type of wines
df.loc[df['variety'].isin(five_most_mentioned_wines),'description']

0         This tremendous 100% varietal wine hails from ...
3         This spent 20 months in 30% new French oak, an...
8         This re-named vineyard was formerly bottled as...
9         The producer sources from two blocks of the vi...
11        From 18-year-old vines, this supple well-balan...
                                ...                        
150908    Another premier cru from Michel Gros, this one...
150909    This is a lovely, fragrant Burgundy, with a sm...
150910    Scents of graham cracker and malted milk choco...
150911    This needs a good bit of breathing time, then ...
150912    The nose is dominated by the attractive scents...
Name: description, Length: 58982, dtype: object

In [19]:
top_10_words(df.loc[df['variety'].isin(five_most_mentioned_wines),'description'])

description
fruit       22859
tannins     16007
cherry      13991
acidity     12526
black       11236
cabernet     9480
palate       9222
spice        7910
sweet        7881
blend        7597
Name: count, dtype: int64

In [33]:
df_ = pd.read_csv('../data/winemag-150k-reviews.csv',
                  usecols= ['country', 'points',
                            'province',
                            'description',
                            'variety'])

df_['points'] = df_['points'].astype(np.float32)

Which country’s wines got the highest average score?

In [34]:
df_.groupby('country')['points'].mean().sort_values(ascending= False).head(10)

country
England     92.888885
Austria     89.276741
France      88.925873
Germany     88.626427
Italy       88.413666
Canada      88.239799
Slovenia    88.234039
Morocco     88.166664
Turkey      88.096153
Portugal    88.057686
Name: points, dtype: float32

Create a pivot table in which the index contains countries, the columns contain varieties, and the cells contain mean scores. Include only the top 10 varieties.

In [38]:
(
    df_.loc[df['variety']
           .isin(df['variety']
                 .value_counts()
                 .head(10)
                 .index)]
            .pivot_table(index='country', 
                         columns='variety', 
                         values='points')
)

variety,Bordeaux-style Red Blend,Cabernet Sauvignon,Chardonnay,Merlot,Pinot Noir,Red Blend,Riesling,Sauvignon Blanc,Syrah,Zinfandel
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Argentina,89.57547,85.527748,84.17749,84.341972,85.058334,88.19706,85.0,83.295303,85.232391,
Australia,88.841461,88.115501,86.727951,85.258827,86.405266,87.816177,87.790207,86.624062,91.952377,88.199997
Austria,91.625,87.75,90.016396,89.0,88.753845,88.890511,90.583954,88.694214,87.0,
Brazil,86.0,81.0,,83.199997,,84.0,,,,
Bulgaria,,84.8125,86.875,84.36364,87.400002,89.0,83.75,84.400002,90.0,
Canada,89.0,88.666664,88.653847,87.875,89.111115,89.5,87.564514,87.75,89.666664,
Chile,89.754715,86.561562,85.24601,84.939186,85.827271,88.683167,85.714287,85.895805,87.506737,85.0
China,,,82.0,,,,,,,
Croatia,,,85.0,,84.0,86.0,,82.0,,
Cyprus,,88.0,,,,85.714287,,,,


What is the correlation between the number of wines offered by a country, and the mean score for that country? That is: If a country enters more wines, does its average score in reviews go up?

In [40]:
(
    df_
    .groupby('country')['points']
    .agg(['count', 'mean'])
    .corr()
)

Unnamed: 0,count,mean
count,1.0,0.236117
mean,0.236117,1.0
