In [12]:
import pandas as pd
import streamlit as st

def _read_tidy_indicator(csv_path, value_name):
    """Read one wide indicator CSV and return it in tidy (long) form.

    The CSV must contain a ``country`` column; every other column is treated
    as a year. Missing cells are forward-filled along each country's own row,
    so a gap takes that country's most recent earlier value. A gap in a
    country's first year column has nothing to copy from and stays missing.

    Args:
        csv_path: Path of the CSV file to read.
        value_name: Name given to the value column of the tidy result.

    Returns:
        DataFrame with the columns ``country``, ``year`` and ``value_name``.
    """
    wide_df = pd.read_csv(csv_path).set_index("country")
    # axis=1 fills left-to-right inside a row. The default axis fills down the
    # rows instead, which would copy one country's value into the next country.
    # Moving ``country`` into the index keeps the name out of the fill.
    filled_df = wide_df.ffill(axis=1)
    return filled_df.reset_index().melt(
        id_vars=["country"], var_name="year", value_name=value_name
    )


@st.cache_data
def load_and_preprocess_data():
    """Load the three indicator CSVs and merge them into one tidy DataFrame.

    Reads ``lex.csv``, ``pop.csv`` and ``ny_gnp_pcap_pp_cd.csv`` from the
    current working directory, forward-fills gaps per country, reshapes each
    file to one row per (country, year) and joins them.

    Values are kept as read from the files; abbreviated strings such as
    ``'10.7M'`` are not converted to numbers here.

    Returns:
        DataFrame with the columns ``country``, ``year``,
        ``life_expectancy``, ``population`` and ``gni_per_capita``. Only
        (country, year) pairs present in all three files are kept, because
        ``merge`` defaults to an inner join.
    """
    life_expectancy_df = _read_tidy_indicator('lex.csv', "life_expectancy")
    population_df = _read_tidy_indicator('pop.csv', "population")
    gni_df = _read_tidy_indicator('ny_gnp_pcap_pp_cd.csv', "gni_per_capita")

    # Merge the dataframes
    merged_df = life_expectancy_df.merge(population_df, on=["country", "year"])
    merged_df = merged_df.merge(gni_df, on=["country", "year"])

    return merged_df

# Entry point when this code is executed directly: build the merged dataset
# and print its first rows as a quick sanity check of the result.
if __name__ == "__main__":
    df = load_and_preprocess_data()
    print(df.head())


2024-05-18 16:49:21.859 No runtime found, using MemoryCacheStorageManager
2024-05-18 16:49:21.866 No runtime found, using MemoryCacheStorageManager


       country  year  life_expectancy population gni_per_capita
0  Afghanistan  1990             53.8      10.7M          20.5k
1       Angola  1990             49.7      11.8M           3000
2      Albania  1990             72.8       3.3M           2550
3          UAE  1990             68.7       1.9M           2550
4    Argentina  1990             72.5      32.6M           6870


In [3]:
import pandas as pd

def _parse_abbreviated_numbers(raw_values):
    """Convert values such as ``'20.5k'``, ``'3.3M'`` or ``'1.2B'`` to floats.

    A trailing ``k``, ``M`` or ``B`` scales the number by 1e3, 1e6 or 1e9;
    values without a suffix are returned unscaled. A decimal comma is accepted
    in place of a decimal point. Missing cells, and cells that are empty once
    the suffix is removed, become NaN.
    """
    multipliers = {'k': 1e3, 'M': 1e6, 'B': 1e9}
    text = (
        raw_values
        .astype(str)
        .str.strip()
        .str.replace(',', '.', regex=False)
    )
    suffix = text.str[-1:]
    has_suffix = suffix.isin(list(multipliers))
    # Drop the suffix character only where one is present; 'nan' parses to NaN.
    digits = text.mask(has_suffix, text.str[:-1]).replace('', 'nan')
    factor = suffix.map(lambda unit: multipliers.get(unit, 1.0)).astype(float)
    return digits.astype(float) * factor


def load_and_transform_data(file_path, value_name):
    """Load a wide indicator CSV and return it as a tidy numeric DataFrame.

    The CSV must contain a ``country`` column; every other column is treated
    as a year. Abbreviated values (``k``/``M``/``B`` suffixes) are expanded to
    plain floats using the suffix itself, so unsuffixed values such as
    ``53.8`` or ``3000`` are left unscaled.

    Missing values are forward-filled separately for each country, in the
    order the year columns appear in the file. A gap before a country's first
    known value stays NaN.

    Args:
        file_path: Path of the CSV file to read.
        value_name: Name given to the value column of the result.

    Returns:
        DataFrame with the columns ``country``, ``year`` (as the original
        column labels) and ``value_name`` (float).

    Raises:
        ValueError: If a cell is neither missing nor a parsable number.
    """
    df = pd.read_csv(file_path)
    melted_df = pd.melt(df, id_vars=['country'], var_name='year', value_name=value_name)

    melted_df[value_name] = _parse_abbreviated_numbers(melted_df[value_name])

    # Melted rows interleave countries (all countries for the first year, then
    # the next year, ...), so fill within each country rather than across the
    # whole column to avoid borrowing another country's value.
    melted_df[value_name] = (
        melted_df.groupby('country', sort=False)[value_name].ffill()
    )
    return melted_df

# Read each indicator file into its own tidy frame.
# NOTE(review): the recorded run of this cell stopped with FileNotFoundError
# for 'app/gni.csv' -- confirm the working directory / data location.
gni_df = load_and_transform_data("app/gni.csv", "GNI")
lex_df = load_and_transform_data("app/lex.csv", "LEX")
pop_df = load_and_transform_data("app/pop.csv", "POP")

# Join the three indicators on (country, year); the default inner join keeps
# only the pairs that appear in every frame.
merged_df = pd.merge(gni_df, lex_df, on=['country', 'year'])
merged_df = pd.merge(merged_df, pop_df, on=['country', 'year'])

# Year labels come from CSV column headers; labels that are not numeric
# become NaN because of errors='coerce'.
merged_df = merged_df.assign(
    year=pd.to_numeric(merged_df['year'], errors='coerce')
)

# Persist the combined table as a Parquet file.
merged_df.to_parquet("app/gapminder_data.parquet")

FileNotFoundError: [Errno 2] No such file or directory: 'app/gni.csv'