In [None]:
from glob import glob
from os.path import exists  # `exists` lives in os.path; `from os import exists` raises ImportError
from random import choice, sample

import altair as alt
import pandas as pd

In [None]:
# Collect every file ending in "_data.csv" that sits exactly three directory
# levels below the current working directory.
# glob() returns matches in a filesystem-dependent order, so the list is sorted
# to make positional lookups such as country_filepaths[0] reproducible.
country_filepaths = sorted(glob("*/*/*/*_data.csv"))
print(country_filepaths)

## Testing With One Country

In [None]:
# Load the first discovered CSV as a single-file sample for the exploration below.
# NOTE(review): the name assumes the first path in country_filepaths is Poland's
# file -- confirm against the printed path list.
poland = pd.read_csv(country_filepaths[0])

In [None]:
# Preview the first rows of the raw frame to see its layout before reshaping.
poland.head()

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
poland.info()

In [None]:
# Unpivot to long form: the four identifier columns are kept as-is, and every
# remaining column becomes a (Year, Value) pair on its own row.
poland_long = pd.melt(
    poland,
    id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
    var_name='Year',
    value_name='Value',
)

In [None]:
# Keep only rows that have a Value, take the first five of them, and show the
# column-wise maximum over just those five rows.
poland_long.dropna(subset=["Value"]).head().max()

In [None]:
# Pivot back to wide form: one row per (Country Name, Country Code, Year) and
# one column per Indicator Name, filled from Value.
pivoted_df = pd.pivot_table(
    poland_long,
    values='Value',
    index=['Country Name', 'Country Code', 'Year'],
    columns='Indicator Name',
)

## Combining All Countries

Each country's data is reshaped to one row per country and year, with one column per indicator:
> [Country Name, Country Code, Year, [All Indicators]]

In [None]:
# Accumulator for the reshaped per-country frames.
df_list = list()

In [None]:
def _country_csv_to_wide(csv_path):
    """Read one country CSV and reshape it to one row per (country, year).

    The file is melted to long form, keeping the four identifier columns and
    turning every other column into a (Year, Value) pair. It is then pivoted
    so that each Indicator Name becomes its own column.

    Parameters
    ----------
    csv_path : str
        Path to a single CSV file, as found in ``country_filepaths``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country Name', 'Country Code', 'Year', followed by one
        column per indicator.
    """
    raw = pd.read_csv(csv_path)
    long_form = raw.melt(
        id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
        var_name='Year', value_name='Value'
    )
    wide = long_form.pivot_table(
        index=['Country Name', 'Country Code', 'Year'],
        columns='Indicator Name',
        values='Value'
    )
    # reset_index() turns the three index levels back into ordinary columns.
    return wide.reset_index()


# Rebuild the list from scratch on every run. Appending to a list created in
# another cell would duplicate every country each time this cell is re-run.
df_list = [_country_csv_to_wide(csv_path) for csv_path in country_filepaths]

In [None]:
# Stack the per-country frames row-wise into a single table.
combined_data = pd.concat(objs=df_list, axis=0)

In [None]:
# Discard rows that have no Year, then store Year as an integer.
combined_data = (
    combined_data
    .dropna(subset=["Year"])
    .astype({"Year": int})
)

In [None]:
# List every column whose dtype is not float, together with that dtype.
for column_name, column_dtype in combined_data.dtypes.items():
    if column_dtype != float:
        print(column_name, column_dtype)

In [None]:
# Put the three identifier columns first, followed by every other column.
cols = ['Country Name', 'Country Code', 'Year'] + [
    col for col in combined_data
    if col not in ['Country Name', 'Country Code', 'Year', "Indicator Name"]
]
# drop=True discards the old row labels rather than inserting them as a new
# "index" column. That column would otherwise end up in the saved CSV and among
# the indicator columns, and would make this cell raise on a second run.
combined_data = combined_data[cols].reset_index(drop=True)

In [None]:
# Show ten randomly selected rows of the combined table as a sanity check.
combined_data.sample(10)

## Choosing a Random Indicator to Check for a Random Sample of Countries

In [None]:
# Draw up to 7 distinct countries for the spot check. random.sample() raises
# ValueError when asked for more items than the population holds, so the
# request is capped at the number of countries actually loaded.
available_countries = combined_data['Country Name'].unique().tolist()
randomly_chosen_countries = sample(available_countries, min(7, len(available_countries)))
# Keep only the rows that belong to the sampled countries.
filtered_data = combined_data[combined_data['Country Name'].isin(randomly_chosen_countries)]
randomly_chosen_countries


In [None]:
# Pick one indicator column at random. The identifier columns are excluded, as
# is "index": a column of that name is what reset_index() inserts when it is
# called without drop=True, and it is row bookkeeping rather than an indicator.
non_indicator_columns = ['Country Name', 'Country Code', 'Year', "Indicator Name", "index"]
randomly_chosen_column = choice(
    [col for col in combined_data if col not in non_indicator_columns]
)
randomly_chosen_column

In [None]:
# Altair's shorthand treats ':' as the field/type separator, and Vega-Lite
# treats '.', '[' and ']' in a field name as nested-field access. All four are
# backslash-escaped so that a column name containing them still resolves to
# the column itself.
escaped_field = randomly_chosen_column
for special_char in (":", ".", "[", "]"):
    escaped_field = escaped_field.replace(special_char, "\\" + special_char)

# One line per country over the years. The axis title is set explicitly so the
# backslashes used for escaping do not appear in the rendered label.
alt.Chart(filtered_data).mark_line().encode(
    x='Year:O',
    y=alt.Y(escaped_field, title=f'{randomly_chosen_column}'),
    color="Country Name"
).properties(
    title=f'{randomly_chosen_column} by Year'
).show()

In [None]:
# Placeholder: tests whether a path named after the chart title (spaces
# replaced by underscores) already exists. The branch body is not implemented.
chart_path_candidate = f"{randomly_chosen_column} by Year".replace(" ", "_")
if not exists(chart_path_candidate):
    ...

In [None]:
# Write the combined table to disk without the row index.
combined_output_path = "./data/world_bank/combined_world_data.csv"
combined_data.to_csv(combined_output_path, index=False)