# Multiple files with `pandas`

In [36]:
import numpy as np
import pandas as pd
import matplotlib

## Merge Practice

In [37]:
a_df = pd.DataFrame({
    'Country': ['Germany', 'France', 'Belgium', 'Finland'],
    'Population (M)': [82.8, 67.2, 11.4, 5.5],
    'Capital': ['Berlin', 'Paris', 'Brussels', 'Helsinki']
})
a_df

Unnamed: 0,Country,Population (M),Capital
0,Germany,82.8,Berlin
1,France,67.2,Paris
2,Belgium,11.4,Brussels
3,Finland,5.5,Helsinki


In [38]:
b_df = pd.DataFrame({
    'Country': ['Germany', 'France', 'Belgium', 'Canada'],
    'HDI': [0.936, 0.901, 0.916, 0.926]
})
b_df

Unnamed: 0,Country,HDI
0,Germany,0.936
1,France,0.901
2,Belgium,0.916
3,Canada,0.926


### Inner Merge Practice

In [39]:
inner_merged_df = pd.merge(a_df, b_df, on='Country', how='inner')
inner_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916


#### Check your code

In [40]:
from nbresult import ChallengeResult

result = ChallengeResult('inner_merge',
    inner_merged_shape=inner_merged_df.shape,
    inner_merged_nulls=sum(inner_merged_df.isnull().sum())
)
result.write()

print(result.check())


platform linux -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /home/saikotdasjoy/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/saikotdasjoy/code/Saikot1997/data-multiple-files-with-pandas/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 2 items

test_inner_merge.py::TestInnerMerge::test_inner_merged_nulls [32mPASSED[0m[32m      [ 50%][0m
test_inner_merge.py::TestInnerMerge::test_inner_merged_shape [32mPASSED[0m[32m      [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/inner_merge.pickle

[32mgit[39m commit -m [33m'Completed inner_merge step'[39m

[32mgit[39m push origin master



### Left Merge Practice

In [41]:
left_merged_df = pd.merge(a_df, b_df, on='Country', how='left')
left_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,


#### Check your code

In [42]:
from nbresult import ChallengeResult

result = ChallengeResult('left_merge',
    left_merged_shape=left_merged_df.shape,
    left_merged_nulls=sum(left_merged_df.isnull().sum())
)
result.write()

print(result.check())


platform linux -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /home/saikotdasjoy/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/saikotdasjoy/code/Saikot1997/data-multiple-files-with-pandas/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 2 items

test_left_merge.py::TestLeftMerge::test_left_merged_df_shape [32mPASSED[0m[32m      [ 50%][0m
test_left_merge.py::TestLeftMerge::test_left_merged_nulls [32mPASSED[0m[32m         [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/left_merge.pickle

[32mgit[39m commit -m [33m'Completed left_merge step'[39m

[32mgit[39m push origin master



### Right Merge Practice

In [43]:
right_merged_df = pd.merge(a_df, b_df, how='right', on='Country')
right_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Canada,,,0.926


#### Check your code

In [44]:
from nbresult import ChallengeResult

result = ChallengeResult('right_merge',
    right_merged_shape=right_merged_df.shape,
    right_merged_nulls=sum(right_merged_df.isnull().sum())
)
result.write()

print(result.check())


platform linux -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /home/saikotdasjoy/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/saikotdasjoy/code/Saikot1997/data-multiple-files-with-pandas/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 2 items

test_right_merge.py::TestRightMerge::test_right_merged_df_shape [32mPASSED[0m[32m   [ 50%][0m
test_right_merge.py::TestRightMerge::test_right_merged_nulls [32mPASSED[0m[32m      [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/right_merge.pickle

[32mgit[39m commit -m [33m'Completed right_merge step'[39m

[32mgit[39m push origin master



### Outer Merge Practice


In [45]:
outer_merged_df = pd.merge(a_df, b_df, on='Country', how='outer')
outer_merged_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,0.936
1,France,67.2,Paris,0.901
2,Belgium,11.4,Brussels,0.916
3,Finland,5.5,Helsinki,
4,Canada,,,0.926


#### Check your code

In [46]:
from nbresult import ChallengeResult

result = ChallengeResult('outer_merge',
    outer_merged_shape=outer_merged_df.shape,
    outer_merged_nulls=sum(outer_merged_df.isnull().sum())
)
result.write()

print(result.check())


platform linux -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /home/saikotdasjoy/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/saikotdasjoy/code/Saikot1997/data-multiple-files-with-pandas/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 2 items

test_outer_merge.py::TestOuterMerge::test_outer_merged_df_shape [32mPASSED[0m[32m   [ 50%][0m
test_outer_merge.py::TestOuterMerge::test_outer_merged_nulls [32mPASSED[0m[32m      [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/outer_merge.pickle

[32mgit[39m commit -m [33m'Completed outer_merge step'[39m

[32mgit[39m push origin master



## Join Practice

In [47]:
aa_df = a_df.set_index("Country")
aa_df

Unnamed: 0_level_0,Population (M),Capital
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Germany,82.8,Berlin
France,67.2,Paris
Belgium,11.4,Brussels
Finland,5.5,Helsinki


In [48]:
bb_df = b_df.set_index("Country")
bb_df

Unnamed: 0_level_0,HDI
Country,Unnamed: 1_level_1
Germany,0.936
France,0.901
Belgium,0.916
Canada,0.926


In [49]:
aa_df.join(bb_df)

Unnamed: 0_level_0,Population (M),Capital,HDI
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Germany,82.8,Berlin,0.936
France,67.2,Paris,0.901
Belgium,11.4,Brussels,0.916
Finland,5.5,Helsinki,


## Concat Practice

In [50]:
concat_df = pd.concat([a_df, b_df], axis="index", sort=False)
concat_df

Unnamed: 0,Country,Population (M),Capital,HDI
0,Germany,82.8,Berlin,
1,France,67.2,Paris,
2,Belgium,11.4,Brussels,
3,Finland,5.5,Helsinki,
0,Germany,,,0.936
1,France,,,0.901
2,Belgium,,,0.916
3,Canada,,,0.926


## Olympic Sports and Medals, 1896-2014

In [51]:
# Load countries.csv
countries_df = pd.read_csv('dictionary.csv')

# Load summer.csv and rename columns
summer_df = pd.read_csv('summer.csv')
summer_df = summer_df.rename(columns={'Country': 'Code'})

# Load winter.csv and rename columns
winter_df = pd.read_csv('winter.csv')
winter_df = winter_df.rename(columns={'Country': 'Code'})

### Combining The Data

In [71]:
summer_countries_df = pd.merge(countries_df, summer_df, on='Code', how='inner')
summer_countries_df = summer_countries_df.assign(Season='Summer')

winter_countries_df = pd.merge(countries_df, winter_df, on='Code', how='inner')
winter_countries_df = winter_countries_df.assign(Season='Winter')

merged_df = pd.merge(summer_countries_df, winter_countries_df, on=['Code', 'Season'], how='inner')

common_cols = np.intersect1d(summer_countries_df.columns, winter_countries_df.columns)
all_df = merged_df[common_cols]


KeyError: "['Athlete', 'City', 'Country', 'Discipline', 'Event', 'GDP per Capita', 'Gender', 'Medal', 'Population', 'Sport', 'Year'] not in index"

#### Check your code

In [61]:
from nbresult import ChallengeResult

result = ChallengeResult('all_df',
    all_df_shape=all_df.shape,
    all_df_columns=set(all_df.columns)
)
result.write()

print(result.check())


platform linux -- Python 3.10.6, pytest-7.1.3, pluggy-1.0.0 -- /home/saikotdasjoy/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/saikotdasjoy/code/Saikot1997/data-multiple-files-with-pandas/tests
plugins: asyncio-0.19.0, anyio-3.6.2
asyncio: mode=strict
[1mcollecting ... [0mcollected 2 items

test_all_df.py::TestAllDf::test_all_df_columns [31mFAILED[0m[31m                    [ 50%][0m
test_all_df.py::TestAllDf::test_all_df_shape [31mFAILED[0m[31m                      [100%][0m

[31m[1m________________________ TestAllDf.test_all_df_columns _________________________[0m

self = <test_all_df.TestAllDf testMethod=test_all_df_columns>

    [94mdef[39;49;00m [92mtest_all_df_columns[39;49;00m([96mself[39;49;00m):
        cols = {
            [33m'[39;49;00m[33mAthlete[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mCity[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mCode[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mCountry[39;49;