# STAT29000 Project 3 Examples

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read in the wine dataset
data = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv")

# get the first few values
data.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [3]:
from pandas import read_csv

# works pretty much the same
data = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-28/winemag-data-130k-v2.csv")

# see
data.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [4]:
# let's make things a little more obvious
data.rename(columns={'province':'location'}, inplace=True)

# much better
data.head(3)

# lets do this too
data.rename(columns={'Unnamed: 0':'wine_id'}, inplace=True)

In [5]:
# what do the points look like again?
data['points'].describe()

count    129971.000000
mean         88.447138
std           3.039730
min          80.000000
25%          86.000000
50%          88.000000
75%          91.000000
max         100.000000
Name: points, dtype: float64

In [6]:
# thats interesting lets write a function that accepts a wine id
# and returns the normalized point score. Let's pass the data
# to the function as well.
def normalized_points(data, wine_id):
    stats = data['points'].describe()
    numerator = data['points'].loc[data['wine_id']==wine_id] - stats['min']
    denominator = stats['max'] - stats['min']
    return numerator / denominator

print(normalized_points(data, 0))
normalized_points(data, 100)

0    0.35
Name: points, dtype: float64


100    0.4
Name: points, dtype: float64

In [7]:
# are there any wines from Indiana?
data[data['location']=="Indiana"]

# not in this list

Unnamed: 0,wine_id,country,description,designation,points,price,location,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery


In [8]:
# let's write a function that returns the wine_id of 
# every smooth wine: wines with points > 95
# or ( | means or, & means and ) ones described as such
def smooth_wines():
    # assuming data is available in the scope of this function
    return data['wine_id'][(data['points']>95) | (data['description'].str.contains('smooth'))].to_list()
    
smooth_wines()

[1,
 25,
 38,
 66,
 104,
 126,
 127,
 137,
 183,
 188,
 203,
 208,
 272,
 284,
 294,
 312,
 325,
 332,
 334,
 345,
 346,
 347,
 348,
 349,
 350,
 351,
 352,
 353,
 354,
 355,
 362,
 390,
 396,
 411,
 419,
 443,
 465,
 481,
 486,
 517,
 533,
 536,
 546,
 571,
 635,
 646,
 658,
 667,
 678,
 713,
 719,
 784,
 801,
 806,
 894,
 915,
 921,
 923,
 935,
 940,
 942,
 944,
 954,
 964,
 1008,
 1010,
 1035,
 1044,
 1056,
 1097,
 1119,
 1157,
 1187,
 1192,
 1203,
 1204,
 1220,
 1232,
 1239,
 1326,
 1397,
 1406,
 1439,
 1478,
 1496,
 1513,
 1516,
 1524,
 1529,
 1542,
 1552,
 1556,
 1557,
 1558,
 1559,
 1560,
 1561,
 1562,
 1563,
 1564,
 1565,
 1566,
 1567,
 1568,
 1569,
 1570,
 1571,
 1572,
 1573,
 1574,
 1575,
 1576,
 1577,
 1578,
 1603,
 1643,
 1663,
 1676,
 1688,
 1693,
 1694,
 1695,
 1701,
 1715,
 1726,
 1740,
 1756,
 1792,
 1860,
 1872,
 1878,
 1884,
 1885,
 1903,
 1945,
 1954,
 1960,
 1999,
 2015,
 2044,
 2047,
 2072,
 2074,
 2083,
 2084,
 2085,
 2090,
 2098,
 2102,
 2104,
 2106,
 2152,
 2158

In [9]:
# a useful trick sometimes
import string
print(string.ascii_letters)
print(list(string.ascii_letters))
set.intersection(set(string.ascii_letters), set(string.hexdigits))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


{'A', 'B', 'C', 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f'}

In [10]:
# lets write a function that returns a dict of country: list(variety)
from collections import defaultdict
def afunction():
    result = defaultdict()
    for idx, row in data.iterrows():
        # the get() method will return None if the key doesn't exist
        if result.get(row[1]):
            result[row[1]].append(row[len(row)-1])
        else:
            result[row[1]] = [row[len(row)-1]]
        
    return dict(result)

wines = afunction()

In [20]:
big_wines = [str(key).upper() for key, value in wines.items()]
big_wines

# with any sort of tuple or list result you can separate parts of the result 
# into different variables

# here, we are packing values into big_wines
sample_wine, *big_wines = [str(key).upper() for key, value in wines.items()]

print(f'Our sample: {sample_wine}')
print(f'The rest: {big_wines}')

sample_wine, *big_wines, last_wine = [str(key).upper() for key, value in wines.items()]

print(f'Our sample: {sample_wine}')
print(f'The rest: {big_wines}')
print(f'Last wine: {last_wine}')

# similarly, unpacking values is when you take a list or tuple and 
# separate it into individual arguments, look at the difference
print(*big_wines)
print(big_wines)

Our sample: ITALY
The rest: ['PORTUGAL', 'US', 'SPAIN', 'FRANCE', 'GERMANY', 'ARGENTINA', 'CHILE', 'AUSTRALIA', 'AUSTRIA', 'SOUTH AFRICA', 'NEW ZEALAND', 'ISRAEL', 'HUNGARY', 'GREECE', 'ROMANIA', 'MEXICO', 'CANADA', 'NAN', 'TURKEY', 'CZECH REPUBLIC', 'SLOVENIA', 'LUXEMBOURG', 'CROATIA', 'GEORGIA', 'URUGUAY', 'ENGLAND', 'LEBANON', 'SERBIA', 'BRAZIL', 'MOLDOVA', 'MOROCCO', 'PERU', 'INDIA', 'BULGARIA', 'CYPRUS', 'ARMENIA', 'SWITZERLAND', 'BOSNIA AND HERZEGOVINA', 'UKRAINE', 'SLOVAKIA', 'MACEDONIA', 'CHINA', 'EGYPT']
Our sample: ITALY
The rest: ['PORTUGAL', 'US', 'SPAIN', 'FRANCE', 'GERMANY', 'ARGENTINA', 'CHILE', 'AUSTRALIA', 'AUSTRIA', 'SOUTH AFRICA', 'NEW ZEALAND', 'ISRAEL', 'HUNGARY', 'GREECE', 'ROMANIA', 'MEXICO', 'CANADA', 'NAN', 'TURKEY', 'CZECH REPUBLIC', 'SLOVENIA', 'LUXEMBOURG', 'CROATIA', 'GEORGIA', 'URUGUAY', 'ENGLAND', 'LEBANON', 'SERBIA', 'BRAZIL', 'MOLDOVA', 'MOROCCO', 'PERU', 'INDIA', 'BULGARIA', 'CYPRUS', 'ARMENIA', 'SWITZERLAND', 'BOSNIA AND HERZEGOVINA', 'UKRAINE', 'SLOV

In [12]:
# in pandas, if a function has an argument called "inplace", you can run the function inplace, 
# which means you can just call the function and the data will be changed, you don't have to 
# re-assign the part of the data you are changing. 
dat = {'col1': [1, 2], 'col2': [3, np.nan]}
df = pd.DataFrame(data=dat)

# this will not change df
df.dropna()
df

# this will change the data
df = df.dropna()

# reset
dat = {'col1': [1, 2], 'col2': [3, np.nan]}
df = pd.DataFrame(data=dat)

# this will also change the data
df.dropna(inplace=True)
df

Unnamed: 0,col1,col2
0,1,3.0
