# Inner and Outer Joins

- **Inner join** – intersection: keep only the index labels that are present in every table.
- **Outer join** – union: keep all index labels, inserting `NaN` wherever a table has no value for a given label.

In [1]:
import pandas as pd
import numpy as np
from glob import glob

# Load the three per-medal tables, each indexed by its `Country` column.
bronze, silver, gold = (
    pd.read_csv(medal_csv, index_col='Country')
    for medal_csv in (
        './data/Summer Olympic medals/Bronze.csv',
        './data/Summer Olympic medals/Silver.csv',
        './data/Summer Olympic medals/Gold.csv',
    )
)

In [2]:
# Keep only the `Total` column (as a one-column DataFrame) and the first five rows.
bronze = bronze.loc[:, ['Total']].iloc[:5]
bronze

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1052.0
Soviet Union,584.0
United Kingdom,505.0
France,475.0
Germany,454.0


In [3]:
# Reduce to a one-column frame holding `Total`, limited to the first five rows.
silver = silver[['Total']].head(5)
silver

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,1195.0
Soviet Union,627.0
United Kingdom,591.0
France,461.0
Germany,350.0


In [4]:
# Select the `Total` column as a DataFrame, then truncate to the first five rows.
gold = gold.loc[:, ['Total']].head(5)
gold

Unnamed: 0_level_0,Total
Country,Unnamed: 1_level_1
United States,2088.0
Soviet Union,838.0
United Kingdom,498.0
France,378.0
Germany,407.0


In [8]:
# Place the three medal tables side by side, keeping only the index labels
# shared by all of them (inner join). `keys` adds an outer column level that
# records which table each `Total` column came from.
medal_tables = [bronze, silver, gold]
medal_labels = ['bronze', 'silver', 'gold']
pd.concat(medal_tables, axis=1, join='inner', keys=medal_labels)

Unnamed: 0_level_0,bronze,silver,gold
Unnamed: 0_level_1,Total,Total,Total
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
United States,1052.0,1195.0,2088.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0
France,475.0,461.0,378.0
Germany,454.0,350.0,407.0


We'll compare the historical 10-year GDP (Gross Domestic Product) growth in the US and in China. The data for the US starts in 1947 and is recorded quarterly; by contrast, the data for China starts in 1960 and is recorded annually.

We'll need to use a combination of resampling and an inner join to align the index labels. We'll also need an appropriate offset alias for resampling, and the method `.resample()` must be chained with an aggregation method (`.last()` in this case); `.pct_change()` is then applied to the resampled result to compute the growth.

In [14]:
# Read the US GDP series, using the `DATE` column as a parsed datetime index.
us = pd.read_csv(
    './data/GDP/gdp_usa.csv',
    parse_dates=True,
    index_col='DATE',
)
us.head()

Unnamed: 0_level_0,VALUE
DATE,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2


In [16]:
# Relabel by name rather than by position: map the CSV's `VALUE` column to
# `GDP` and name the index `Year`, matching the labels used by the China data.
# `rename` / `rename_axis` return a new frame, so the frame loaded from the CSV
# is not mutated in place and re-running this cell is a harmless no-op.
us = us.rename(columns={'VALUE': 'GDP'}).rename_axis('Year')
us.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2


In [18]:
# Read the China GDP series, using the `Year` column as a parsed datetime index.
china = pd.read_csv(
    './data/GDP/gdp_china.csv',
    parse_dates=True,
    index_col='Year',
)
china.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-01-01,59.184116
1961-01-01,49.55705
1962-01-01,46.685179
1963-01-01,50.097303
1964-01-01,59.062255


1. Create a new DataFrame `china_annual` by resampling the DataFrame `china` with `.resample('A').last()` (i.e., with annual frequency).
2. Chain `.pct_change(10)` to compute the percentage change with an offset of ten years.
3. Chain `.dropna()` to eliminate rows containing null values.

In [21]:
# Preview the annual resampling: `.last()` keeps the final observation in each
# yearly bin, and the bins are labelled with the year-end date.
# NOTE(review): the 'A' alias is deprecated in pandas >= 2.2 in favor of 'YE' —
# confirm the target pandas version before changing it.
china_year_end = china.resample('A').last()
china_year_end.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-12-31,59.184116
1961-12-31,49.55705
1962-12-31,46.685179
1963-12-31,50.097303
1964-12-31,59.062255


In [27]:
# Preview the 10-period percentage change on the annual series; the first ten
# rows have no value ten periods earlier to compare against, so they are NaN.
(
    china
    .resample('A')
    .last()
    .pct_change(periods=10)
    .head(15)
)

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-12-31,
1961-12-31,
1962-12-31,
1963-12-31,
1964-12-31,
1965-12-31,
1966-12-31,
1967-12-31,
1968-12-31,
1969-12-31,


In [29]:
# Annual series -> 10-period percentage change -> drop the leading NaN rows.
china_year_end = china.resample('A').last()
china_annual = china_year_end.pct_change(10).dropna()
china_annual.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1970-12-31,0.546128
1971-12-31,0.98886
1972-12-31,1.402472
1973-12-31,1.730085
1974-12-31,1.408556


Repeat the process with the `us` dataframe.

In [30]:
# Same pipeline as for China: take the last observation of each year,
# compute the 10-period percentage change, and drop rows with no value.
us_annual = (
    us
    .resample('A')
    .last()
    .pct_change(periods=10)
    .dropna()
)
us_annual.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1957-12-31,0.827507
1958-12-31,0.782686
1959-12-31,0.953137
1960-12-31,0.689354
1961-12-31,0.630959


Concatenate the two dataframes horizontally, along columns with an **inner join**.

In [37]:
# Align the two growth series on their shared dates only (inner join) and
# place them side by side. Both inputs carry a `GDP` column, so the result
# has two columns with the same label: China first, then the US.
growth_frames = [china_annual, us_annual]
gdp = pd.concat(growth_frames, join='inner', axis='columns')
gdp.head()

Unnamed: 0_level_0,GDP,GDP
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,1.017187
1971-12-31,0.98886,1.05227
1972-12-31,1.402472,1.172566
1973-12-31,1.730085,1.258858
1974-12-31,1.408556,1.295246


In [38]:
# Relabel positionally: the first column came from `china_annual`, the second
# from `us_annual`. (A name-based rename is not possible here because both
# columns are currently labelled `GDP`.)
country_labels = ['China', 'US']
gdp.columns = country_labels
gdp.head()

Unnamed: 0_level_0,China,US
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,1.017187
1971-12-31,0.98886,1.05227
1972-12-31,1.402472,1.172566
1973-12-31,1.730085,1.258858
1974-12-31,1.408556,1.295246


Resample `gdp` every decade (i.e., using `.resample('10A')`) and aggregate with the method `.last()`.

In [39]:
# Group the growth figures into 10-year bins and keep the last row of each bin.
# NOTE(review): the 'A' alias (here '10A') is deprecated in pandas >= 2.2 in
# favor of 'YE' — confirm the target pandas version before changing it.
decade_bins = gdp.resample('10A')
decade_bins.last()

Unnamed: 0_level_0,China,US
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-31,0.546128,1.017187
1980-12-31,1.072537,1.742556
1990-12-31,0.89282,1.012126
2000-12-31,2.357522,0.738632
2010-12-31,4.011081,0.454332
2020-12-31,3.789936,0.36178
