In [None]:
################################################################################
# Author 1:      Bernhard Lugger
# MatNr 1:       01114792
# Author 2:      Paula Nauta
# MatNr 2:       11938311
# File:          assignment2.ipynb
# Description:   Loads OECD indicator CSV exports for selected countries and
#                visualises them as line, bar and scatter plots with Plotly.
# Comments:    ... comments for the tutors ...
#              ... can be multiline ...
################################################################################

In [None]:
# imports
import matplotlib.pyplot as plt # or from matplotlib import pyplot as plt
import matplotlib.dates as mdates # we use this to space our date-xticks
import numpy as np
import pandas as pd
import pycountry as pc

import plotly.express as px
# Plotly Express is especially useful if you want to plot DataFrames (e.g. a pandas DataFrame).
from plotly.subplots import make_subplots
# import plotly.graph_objects as go


In [None]:
# -------- functions ------------

def countries_fullname(country):   # e.g. 'AUS', 'BEL'
    """Return the full country name for an ISO 3166-1 alpha-3 code.

    Parameters
    ----------
    country : str
        Three-letter country code, e.g. ``'AUS'`` or ``'BEL'``.

    Returns
    -------
    str
        The ``name`` attribute of the matching pycountry record.

    Raises
    ------
    LookupError
        If pycountry has no record for ``country``.
    """
    country_full = pc.countries.get(alpha_3=country)
    if country_full is None:
        # pycountry hands back None for codes it does not know; fail with a
        # readable message instead of an AttributeError on ``None.name``.
        raise LookupError(f"unknown ISO 3166-1 alpha-3 country code: {country!r}")
    return country_full.name

def load_chosen_oecd_data(file_path, *chosen_countries):
    """Load one OECD CSV and return the rows of the chosen countries, with full country names.

    Parameters
    ----------
    file_path : str or path-like
        Path of the CSV file. Only the columns ``LOCATION``, ``SUBJECT``,
        ``MEASURE``, ``TIME`` and ``Value`` are read.
    *chosen_countries : str
        Alpha-3 country codes (e.g. ``'AUT'``) to keep. Rows are returned
        grouped by country, in the order the codes are given.

    Returns
    -------
    pandas.DataFrame
        The selected rows; the codes in ``LOCATION`` are replaced by the full
        country names from ``countries_fullname``. If no country is given, an
        empty frame with the five columns is returned.
    """
    oecd_data = pd.read_csv(file_path, header=0, usecols=['LOCATION','SUBJECT', 'MEASURE', 'TIME', 'Value'], delimiter=",")

    if not chosen_countries:
        # pd.concat refuses an empty list; keep the column layout for callers.
        return oecd_data.iloc[0:0]

    # Collect the per-country slices first and concatenate once:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    per_country = [oecd_data[oecd_data['LOCATION'] == country] for country in chosen_countries]
    selected = pd.concat(per_country)

    # Translate codes only in LOCATION so that equal-looking values in other
    # columns (e.g. SUBJECT) are never rewritten by accident.
    full_names = {country: countries_fullname(country) for country in chosen_countries}
    return selected.assign(LOCATION=selected['LOCATION'].map(full_names))


def filter_dataframe(df, column, value):
    """Select the rows of ``df`` whose entry in ``column`` equals ``value``.

    The input frame is not modified; the selection keeps the original index.
    """
    return df[df[column] == value]


def plot_lineplot(df, title, xaxis_text):
    """Show a Plotly line chart of "Value" over "TIME", one coloured line per "LOCATION".

    df: DataFrame providing the columns "TIME", "Value" and "LOCATION".
    title: title of the figure.
    xaxis_text: despite its name, this text is used as the y-axis title.
    """
    line_fig = px.line(df, title=title, x="TIME", y="Value", color="LOCATION")
    line_fig.update_yaxes(title_text=xaxis_text)
    # Range slider below the plot for zooming along the time axis.
    line_fig.update_layout(xaxis={"rangeslider": {"visible": True}})
    line_fig.show()


def plot_barplot(df, title, xaxis_text):
    """Show a Plotly bar chart of "Value" per "LOCATION", one colour per location.

    df: DataFrame providing the columns "LOCATION" and "Value".
    title: title of the figure.
    xaxis_text: despite its name, this text is used as the y-axis title.
    """
    bar_fig = px.bar(df, title=title, x="LOCATION", y="Value", color="LOCATION")
    bar_fig.update_yaxes(title_text=xaxis_text)
    bar_fig.show()


def plot_scatterplot(df):
    """Show a Plotly scatter plot of the column "Values2" against "Values1".

    df: DataFrame providing the columns "Values1" (x-axis) and "Values2" (y-axis).
    """
    px.scatter(df, x="Values1", y="Values2").show()


def load_chosen_oecd_data_pure(file_path, *chosen_countries):
    """Load one OECD CSV and return the rows of the chosen countries, keeping the alpha-3 codes.

    Parameters
    ----------
    file_path : str or path-like
        Path of the CSV file. Only the columns ``LOCATION``, ``SUBJECT``,
        ``MEASURE``, ``TIME`` and ``Value`` are read.
    *chosen_countries : str
        Alpha-3 country codes (e.g. ``'AUT'``) to keep. Rows are returned
        grouped by country, in the order the codes are given.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index and unchanged
        ``LOCATION`` codes. If no country is given, an empty frame with the
        five columns is returned.
    """
    oecd_data = pd.read_csv(file_path, header=0, usecols=['LOCATION','SUBJECT', 'MEASURE', 'TIME', 'Value'], delimiter=",")

    if not chosen_countries:
        # pd.concat refuses an empty list; keep the column layout for callers.
        return oecd_data.iloc[0:0]

    # Collect the per-country slices first and concatenate once:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    per_country = [oecd_data[oecd_data['LOCATION'] == country] for country in chosen_countries]
    return pd.concat(per_country)

def plot_scatterplot_prepare(df, country1, country2):
    """Unfinished stub: filter two frames to one year and scatter-plot the first one.

    NOTE(review): this function cannot run as written in the visible notebook.
    - The parameters ``df``, ``country1`` and ``country2`` are never used.
    - ``df1``, ``df2`` and ``year`` are neither parameters nor locals; they are
      looked up as globals at call time, and no cell shown here defines them.
    - The plotted columns ("sepal_width", "sepal_length", "species",
      "petal_length", "petal_width") look like the iris example dataset,
      presumably copied from a Plotly example; the OECD frames loaded in this
      notebook only have LOCATION, SUBJECT, MEASURE, TIME and Value.
    The function is not called anywhere in the visible notebook.
    """
    # Restrict both (global) frames to the rows of the (global) year.
    df1_year = filter_dataframe(df1, 'TIME', year)
    # df2_year is computed but never used below.
    df2_year = filter_dataframe(df2, 'TIME', year)

    # Only df1_year is plotted; shows the figure and returns nothing.
    fig = px.scatter(df1_year, x="sepal_width", y="sepal_length", color="species",
                     size='petal_length', hover_data=['petal_width'])
    fig.show()

In [None]:
# ----- Scatter-Plots --------
# Both indicators are joined on LOCATION below, so they have to be loaded for
# the same countries and the same year; keep these in one place.
SCATTER_COUNTRIES = ('AUT', 'ITA', 'AUS', 'USA', 'BEL')
SCATTER_YEAR = 2016

# Greenhouse gas (GHG), Tonnes/capita
df_GHG = load_chosen_oecd_data_pure('datasets/used/DP_LIVE_04012022042348960.csv', *SCATTER_COUNTRIES)
df_GHG_filtered = filter_dataframe(df_GHG, 'SUBJECT', 'GHG')
df_GHG_filtered = filter_dataframe(df_GHG_filtered, 'MEASURE', 'TONNE_CAP')
df_GHG_filtered = filter_dataframe(df_GHG_filtered, 'TIME', SCATTER_YEAR)

# One value per country, indexed by the country code, as the x-values.
df_GHG_filtered_indexed = df_GHG_filtered.set_index('LOCATION').rename(columns={'Value': 'Values1'})
df1_final = df_GHG_filtered_indexed['Values1']


# Adult education level - Tertiary, % of 25-64 year-olds, 2020 or latest available
# NOTE(review): unlike the GHG series, no MEASURE filter is applied here; if the
# file holds several measures for 'TRY', LOCATION would not be unique - verify.
df_edu_ter = load_chosen_oecd_data_pure('datasets/used/DP_LIVE_04012022044322787.csv', *SCATTER_COUNTRIES)
df_edu_ter_filtered = filter_dataframe(df_edu_ter, 'SUBJECT', 'TRY')
df_edu_ter_filtered = filter_dataframe(df_edu_ter_filtered, 'TIME', SCATTER_YEAR)

# One value per country, indexed by the country code, as the y-values.
df_edu_ter_filtered_indexed = df_edu_ter_filtered.set_index('LOCATION').rename(columns={'Value': 'Values2'})
df2_final = df_edu_ter_filtered_indexed['Values2']

# Align both series on their LOCATION index: one row per country with the
# columns Values1 (GHG) and Values2 (tertiary education).
df_final = pd.concat([df1_final, df2_final], axis=1)
print(df_final)

plot_scatterplot(df_final)



In [None]:
# ------------------ Test -------------------------
# All test plots compare the same set of countries; define it once instead of
# repeating the list in every call.
TEST_COUNTRIES = ('AUT', 'ITA', 'AUS', 'USA', 'BEL')

# Government researchers
path_to_file = 'datasets/used/DP_LIVE_02012022214642229.csv'
df_researchers = load_chosen_oecd_data(path_to_file, *TEST_COUNTRIES)
df_researchers_filtered = filter_dataframe(df_researchers, 'SUBJECT', 'TOT')
df_researchers_filtered = filter_dataframe(df_researchers_filtered, 'MEASURE', 'PC_NATIONAL')
plot_lineplot(df_researchers_filtered,"Government researchers","% of national total")

# Young population
path_to_file = 'datasets/used/DP_LIVE_02012022233800555.csv'
df_young = load_chosen_oecd_data(path_to_file, *TEST_COUNTRIES)
plot_lineplot(df_young,"Young population", "% of population")

# Discrimination in the family
path_to_file = 'datasets/used/DP_LIVE_03012022021117254.csv'
df_discrimination = load_chosen_oecd_data(path_to_file, *TEST_COUNTRIES)
df_discrimination_filtered = filter_dataframe(df_discrimination, 'SUBJECT', 'ATTWORKMUM')
plot_barplot(df_discrimination_filtered, "Discrimination in the family 2019", "Attitudes towards working mothers [%]")


In [None]:
# program
# NOTE(review): the string literal below is a planning sketch of the intended
# final program (three line plots, three bar plots, three scatter plots), not
# executable code. The call shapes in it do not match the functions defined in
# this notebook (plot_lineplot takes a DataFrame, a title and an axis text; no
# function named bar_lineplot is defined in the cells shown here).
"""
plot_lineplot(land_a, land_b)
plot_lineplot(land_a, land_b)
plot_lineplot(land_a, land_b)

bar_lineplot(country1, country2, country3, country4, country5)
bar_lineplot(country1, country2, country3)
bar_lineplot(country1, country2, country3)

scatter
scatter
scatter
"""