# DataFrame examples for screencasts


This notebook contains the non-trivial examples used in the slides. In the long run 
we want to convert all DataFrame examples to tasks. 

The examples that are not in this notebook can be created trivially by calling the
`get_gapminder_sample` function with the appropriate `version` argument and executing
the code from the slides.

In [None]:
import pandas as pd

pd.options.mode.copy_on_write = True
pd.options.future.infer_string = True
pd.options.plotting.backend = "plotly"

import plotly.express as px

from epp_topics.slidev_utilities import get_html


def get_gapminder_sample(version="tiny"):
    """Return one of the gapminder subsets used in the screencast slides.

    The plotly express gapminder data is loaded and its ``lifeExp`` and
    ``gdpPercap`` columns are renamed to ``life_exp`` and ``gdp_per_cap``
    before the requested subset is taken.

    Args:
        version: Name of the subset to return.

            - ``"tiny"``: columns country, continent, year and life_exp,
              restricted to Cuba and Spain in 2002 and 2007.
            - ``"full"``: all rows and all columns.
            - ``"all_observations"``: all rows, columns country, continent,
              year and life_exp.
            - ``"income"``: columns country, year, gdp_per_cap and pop,
              restricted to Cuba and Spain in 2002 and 2007.

    Returns:
        The selected DataFrame with a freshly reset default index.

    Raises:
        ValueError: If ``version`` is not one of the names listed above.
    """
    data = px.data.gapminder().rename(
        columns={"lifeExp": "life_exp", "gdpPercap": "gdp_per_cap"},
    )

    if version not in ("tiny", "full", "all_observations", "income"):
        msg = f"Unknown version: {version}"
        raise ValueError(msg)

    # Step 1: column selection ("full" keeps every column).
    if version in ("tiny", "all_observations"):
        data = data[["country", "continent", "year", "life_exp"]]
    elif version == "income":
        data = data[["country", "year", "gdp_per_cap", "pop"]]

    # Step 2: the two small samples share the same country/year restriction.
    if version in ("tiny", "income"):
        data = data.query("country in ['Cuba', 'Spain']")
        data = data.query("year in [2002, 2007]")

    return data.reset_index(drop=True)

## Example for inspecting and summarizing data

This recreates the look of the full DataFrame in a pandas notebook at default display
settings. 

In [None]:
# Mimic the truncated notebook repr of the full data: the leading rows, one
# row of "..." placeholders, and the trailing rows.
df = get_gapminder_sample("full")
df = pd.concat([df.loc[:6], df.loc[1699:]])
# Cast to object before writing the placeholder. Assigning the string "..."
# into integer/float columns otherwise relies on silent upcasting, which
# pandas has deprecated (it emits a FutureWarning on pandas 2.1+).
df = df.astype(object)
df.loc[6] = "..."
# Show the placeholder in the index as well, not only in the cells.
df = df.rename(index={6: "..."})
get_html(df)

## Gapminder data in wide format for the normal forms / rules for data management screencast

In [None]:
# Gapminder income sample in wide format: one row per country and one column
# per (variable, year) combination.

df = get_gapminder_sample("income")

df_wide = df.pivot(index="country", columns="year", values=["gdp_per_cap", "pop"])

# Flatten the two column levels into single labels "<variable>_<year>".
df_wide.columns = [f"{variable}_{year}" for variable, year in df_wide.columns]

get_html(df_wide)

## Combining dataframes

In [None]:
# concat without axis argument: the rows of the second frame are appended
# below the rows of the first one.

df = get_gapminder_sample("tiny").set_index(["country", "year"])
top, bottom = df.query("country == 'Cuba'"), df.query("country == 'Spain'")

df = pd.concat(objs=[top, bottom])
get_html(df)

In [None]:
# concat with axis="columns": the two frames are aligned on their shared
# (country, year) index and their columns are placed side by side.

left = get_gapminder_sample("tiny").set_index(["country", "year"])
right = get_gapminder_sample("income").set_index(["country", "year"])

df = pd.concat([left, right], axis="columns")
# Keep get_html(df) as the last expression so that its result, rather than the
# plain DataFrame repr, is what the cell displays (as in the other cells).
get_html(df)

In [None]:
# 1:1 merge on (country, year).
# left holds the first three rows of the tiny sample and right holds all but
# the first row of the income sample, so the frames overlap only partly; the
# outer merge keeps the rows whose key appears in just one of them.

left = get_gapminder_sample("tiny").head(3).reset_index(drop=True)
right = get_gapminder_sample("income").iloc[1:].reset_index(drop=True)

df = left.merge(right, on=["country", "year"], how="outer")

get_html(df)

In [None]:
# m:1 merge: several country-year rows on the left are matched with the
# single row per country on the right.

left = get_gapminder_sample("tiny")[["country", "year", "life_exp"]]

# Small lookup table with one capital per country.
right = pd.DataFrame(
    {
        "country": ["Cuba", "Spain"],
        "capital": ["Havana", "Madrid"],
    },
)

df = left.merge(right, on="country")


get_html(df)

## Functional data management