In [67]:
import pandas as pd
import numpy as np

In [68]:
# Relative location of the tourism CSV, and the subset of its columns to load.
path = '../../pandas-workout-data/data/oecd_tourism.csv'
columns = [
    'LOCATION',
    'SUBJECT',
    'TIME',
    'Value',
]

In [69]:
# Read the CSV at `path`, keeping only the columns listed in `columns`.
tourism_df = pd.read_csv(path, usecols=columns)

In [70]:
# Preview the first five rows of the loaded frame.
tourism_df.iloc[:5]

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,31159.8
1,AUS,INT_REC,2009,29980.7
2,AUS,INT_REC,2010,35165.5
3,AUS,INT_REC,2011,38710.1
4,AUS,INT_REC,2012,38003.7


Find the five countries that received the greatest amount of tourist dollars, on average, across years in the data set.

In [71]:
# Distinct SUBJECT labels. The two labels use different separators
# ('INT_REC' with an underscore, 'INT-EXP' with a hyphen), so filters on
# SUBJECT must spell each label exactly.
tourism_df.loc[:, 'SUBJECT'].unique()

array(['INT_REC', 'INT-EXP'], dtype=object)

In [72]:
# Non-null counts per column for the USA rows, across both SUBJECT labels.
(
    tourism_df
    .groupby('LOCATION')
    .count()
    .loc['USA']
)

SUBJECT    24
TIME       24
Value      24
Name: USA, dtype: int64

In [73]:
# Total INT_REC value per location over all years; keep the five largest totals.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('LOCATION')['Value']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)  # Sum

LOCATION
USA    2419362.000
ESP     766213.991
FRA     715696.693
DEU     587494.277
GBR     569273.000
Name: Value, dtype: float64

In [74]:
# Per-column non-null counts for the USA rows, restricted to the INT_REC rows.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('LOCATION')
    .count()
    .loc['USA']
)

SUBJECT    12
TIME       12
Value      12
Name: USA, dtype: int64

In [75]:
# Mean INT_REC value per location across years; keep the five largest means.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('LOCATION')['Value']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)  # Mean

LOCATION
USA    201613.500000
ESP     69655.817364
FRA     65063.335727
DEU     53408.570636
GBR     51752.090909
Name: Value, dtype: float64

Find the five countries whose citizens spent the least amount of tourist dollars, on average, across years in the data set.

In [76]:
# Mean INT-EXP value per location across years; keep the five smallest means.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT-EXP']
    .groupby('LOCATION')['Value']
    .mean()
    .sort_values()  # ascending is the default order
    .head(5)
)

LOCATION
MLT     387.801667
CRI     867.075000
LVA     919.545455
ISL    1072.819636
HRV    1115.628083
Name: Value, dtype: float64

The separate CSV file oecd_locations.csv has two columns: one contains the three-letter abbreviated location name from the first CSV file, and the second is the full country name. Load this into a data frame, using the abbreviated data as an index.

In [77]:
# Location of the locations lookup CSV.
# NOTE: this rebinds `path`, which previously held the tourism CSV location, so
# any earlier cell that reads from `path` will see this file if re-run afterwards.
path = '../../pandas-workout-data/data/oecd_locations.csv'

In [78]:
# The read passes header=None, so the column names are supplied explicitly;
# the LOCATION column then becomes the index of the resulting frame.
df_locations = pd.read_csv(
    path,
    header=None,
    names=['LOCATION', 'NAME'],
    index_col='LOCATION',
)

In [79]:
# Display the locations lookup frame (LOCATION code as index, NAME as the only column).
df_locations

Unnamed: 0_level_0,NAME
LOCATION,Unnamed: 1_level_1
AUS,Australia
AUT,Austria
BEL,Belgium
CAN,Canada
DNK,Denmark
FIN,Finland
FRA,France
DEU,Germany
HUN,Hungary
ITA,Italy


Join these two data frames together into a new one. In the new data frame, there is no LOCATION column. Instead, there is a name column with the full name of the country.

In [80]:
# Peek at the first two rows before the index is changed.
tourism_df.iloc[:2]

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,31159.8
1,AUS,INT_REC,2009,29980.7


In [81]:
# Move LOCATION into the index so the frame can be joined with df_locations,
# which is also indexed by LOCATION.
# The guard keeps this cell idempotent: once LOCATION is the index it is no
# longer a column, and calling set_index('LOCATION') again would raise KeyError.
if 'LOCATION' in tourism_df.columns:
    tourism_df = tourism_df.set_index('LOCATION')

In [82]:
# Peek at the last two rows now that LOCATION is the index.
tourism_df.iloc[-2:]

Unnamed: 0_level_0,SUBJECT,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SRB,INT-EXP,2018,1837.317
SRB,INT-EXP,2019,1999.313


In [83]:
# Index-on-index left join (the default for DataFrame.join, spelled out here):
# rows are driven by df_locations and matched to tourism rows sharing the same
# LOCATION index value.
df = df_locations.join(tourism_df, how='left')

In [84]:
# Display the joined frame: NAME from df_locations plus SUBJECT, TIME and Value.
df

Unnamed: 0_level_0,NAME,SUBJECT,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,Australia,INT_REC,2008,31159.8
AUS,Australia,INT_REC,2009,29980.7
AUS,Australia,INT_REC,2010,35165.5
AUS,Australia,INT_REC,2011,38710.1
AUS,Australia,INT_REC,2012,38003.7
...,...,...,...,...
ISR,Israel,INT-EXP,2015,7507.0
ISR,Israel,INT-EXP,2016,8210.3
ISR,Israel,INT-EXP,2017,8986.0
ISR,Israel,INT-EXP,2018,9974.7


In [85]:
# Count rows with a missing NAME. NAME comes from df_locations, the left
# (calling) side of the join that produced df.
df['NAME'].isna().sum()

np.int64(0)

In [86]:
# Distinct index labels (LOCATION codes) of rows whose NAME is missing.
df.loc[df['NAME'].isna()].index.unique()

Index([], dtype='object', name='LOCATION')

In [None]:
# Number of distinct LOCATION codes among rows whose NAME is missing.
df.loc[df['NAME'].isna()].index.nunique()

0

In [89]:
# Mean INT_REC value per country name in the joined frame; five largest means.
(
    df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('NAME')['Value']
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

NAME
United States     201613.500000
France             65063.335727
Germany            53408.570636
United Kingdom     51752.090909
Italy              44930.211545
Name: Value, dtype: float64

In [90]:
# Mean INT-EXP value per country name in the joined frame; five smallest means.
(
    df
    .loc[lambda frame: frame['SUBJECT'] == 'INT-EXP']
    .groupby('NAME')['Value']
    .mean()
    .sort_values()  # ascending is the default order
    .head(5)
)

NAME
Hungary     2918.390182
Finland     5877.080909
Israel      6726.524833
Denmark    11326.169636
Austria    11934.563636
Name: Value, dtype: float64

In [91]:
# Mean INT_REC value per country name in the joined frame; five smallest means.
(
    df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('NAME')['Value']
    .mean()
    .sort_values()  # ascending is the default order
    .head(5)
)

NAME
Finland    4700.236273
Brazil     6321.476083
Israel     6542.383250
Hungary    7299.353000
Denmark    9398.957636
Name: Value, dtype: float64

# Beyond the exercise

What happens if you perform the join in the other direction? That is, what if you invoke `join` on `tourism_df`, passing it `df_locations` as the argument? Do you get the same result?

In [93]:
# First five rows of the locations lookup (the left-hand frame of the earlier join).
df_locations.iloc[:5]

Unnamed: 0_level_0,NAME
LOCATION,Unnamed: 1_level_1
AUS,Australia
AUT,Austria
BEL,Belgium
CAN,Canada
DNK,Denmark


In [94]:
# First five rows of the tourism frame, which is indexed by LOCATION at this point.
tourism_df.iloc[:5]

Unnamed: 0_level_0,SUBJECT,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AUS,INT_REC,2008,31159.8
AUS,INT_REC,2009,29980.7
AUS,INT_REC,2010,35165.5
AUS,INT_REC,2011,38710.1
AUS,INT_REC,2012,38003.7


In [95]:
# Reverse direction: a left join driven by tourism_df, so every tourism row is
# kept and NAME is filled in only where df_locations has a matching LOCATION.
tourism_df.join(df_locations, how='left')

Unnamed: 0_level_0,SUBJECT,TIME,Value,NAME
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,INT_REC,2008,31159.800,Australia
AUS,INT_REC,2009,29980.700,Australia
AUS,INT_REC,2010,35165.500,Australia
AUS,INT_REC,2011,38710.100,Australia
AUS,INT_REC,2012,38003.700,Australia
...,...,...,...,...
SRB,INT-EXP,2015,1253.644,
SRB,INT-EXP,2016,1351.098,
SRB,INT-EXP,2017,1549.183,
SRB,INT-EXP,2018,1837.317,


In [97]:
# When a tourism row's LOCATION has no counterpart in df_locations (the right
# side), the joined NAME is null; count how many rows that affects.
(
    tourism_df
    .join(df_locations, how='left')
    .loc[:, 'NAME']
    .isna()
    .sum()
)

np.int64(870)

Get the mean tourism income per year rather than by country. Do you see any evidence of less tourism income during the time of the Great Recession, which started in 2008?

In [99]:
# Mean INT_REC value per year (averaged over locations), in chronological order.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('TIME')['Value']
    .mean()
    .sort_index()
)

TIME
2008    16841.151327
2009    15081.294774
2010    16003.938556
2011    17788.743759
2012    18216.112815
2013    19296.536037
2014    20198.824148
2015    19301.865907
2016    19574.941796
2017    20763.391981
2018    22436.338296
2019    23005.937500
Name: Value, dtype: float64

In [101]:
# Yes: ranked from highest to lowest yearly mean, 2008, 2009 and 2010 take the
# last three positions.
(
    tourism_df
    .loc[lambda frame: frame['SUBJECT'] == 'INT_REC']
    .groupby('TIME')['Value']
    .mean()
    .sort_values(ascending=False)
)

TIME
2019    23005.937500
2018    22436.338296
2017    20763.391981
2014    20198.824148
2016    19574.941796
2015    19301.865907
2013    19296.536037
2012    18216.112815
2011    17788.743759
2008    16841.151327
2010    16003.938556
2009    15081.294774
Name: Value, dtype: float64