In [6]:
import os
import sys

# Make the sibling ``src`` directory importable (presumably where the
# ``functions`` package imported later in this notebook lives — TODO confirm).
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
SRC_DIR = os.path.abspath("../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [None]:
import pandas as pd
import polars as pl

# Relative path to the CSV file read by the loading cells in this notebook.
SOURCE = "../data/sampled_df.csv"
# Names of the columns to load from SOURCE; passed as the column selection to
# both the pandas and the polars readers.
COLS = [
    "Year",
    "DEDonorcode",
    "DonorName",
    "DERecipientcode",
    "RecipientName",
    "FlowCode",
    "FlowName",
    "Bi_Multi",
    "Category",
    "Finance_t",
    "Aid_t",
    "USD_Commitment",
    "USD_Disbursement",
    "USD_Received",
    "USD_Commitment_Defl",
    "USD_Disbursement_Defl",
    "USD_Received_Defl",
    "CurrencyCode",
    "Commitment_National",
    "Disbursement_National",
    "USD_GrantEquiv",
    "Geography",
    "LDCflag",
    "LDCflagName",
    # The names commented out below are not loaded (presumably further
    # columns available in the file — TODO confirm); uncomment to include them.
    #  'SDGfocus',
    #  'Keywords',
    #  'Gender',
    #  'Environment',
    #  'DIG',
    #  'Trade',
    #  'RMNCH',
    #  'DRR',
    #  'Nutrition',
    #  'Disability',
    #  'FTC',
    #  'PBA',
    #  'InvestmentProject',
    #  'AssocFinance',
    "Biodiversity",
    "ClimateMitigation",
    "ClimateAdaptation",
    "Desertification",
    "climate_relevance",
    "climate_class_number",
    "climate_class",
    "meta_category",
    "labelled_bilateral",
    "DonorType",
]

### Load the data with pandas

In [8]:
# Load only the columns listed in COLS into a pandas DataFrame.
# NOTE(review): the polars cell further down rebinds `df`, so on a
# top-to-bottom run this pandas frame is replaced before it is used.
df = pd.read_csv(SOURCE, usecols=COLS)

### Load the data with polars

In [None]:
# Load the same column selection with polars. This rebinds `df`, so from here
# on `df` refers to the polars DataFrame rather than the pandas one.
df = pl.read_csv(SOURCE, columns=COLS)
# Trailing expression: display the loaded frame as the cell output.
df


In [None]:
# Show the column names of the currently bound `df`.
df.columns

In [None]:
# Convert the polars frame to pandas for display only; the result is not
# assigned, so `df` itself stays a polars DataFrame.
df.to_pandas()

In [None]:
# ASK: how to distinguish between donors and recipients?
from typing import Literal


def read_data(
    selected_type: Literal["donors", "recipients"],
    source: str,
    columns: list,
    donor_type: Literal["bilateral", "multilateral", "all"],
) -> pl.DataFrame:
    """Read the source CSV and return a donor- or recipient-centred view of it.

    The columns identifying the *other* side of the flow are dropped, and the
    selected side's code/name columns are renamed to the neutral
    ``CountryCode`` / ``CountryName``. Optionally the rows are restricted to a
    single donor type via the ``DonorType`` column.

    Args:
        selected_type (Literal["donors", "recipients"]): Whether to return the donors or recipients
        source (str): The path to the source file
        columns (list): The columns to read from the source. Must include the
            donor and recipient code/name columns, and ``DonorType`` when
            ``donor_type`` is not ``"all"``.
        donor_type (Literal["bilateral", "multilateral", "all"]): The type of donor to filter the data.
            ``"bilateral"`` keeps rows whose ``DonorType`` is ``"Donor Country"``,
            ``"multilateral"`` keeps ``"Multilateral Donor"``, and ``"all"``
            applies no filter.

    Returns:
        pl.DataFrame: The data based on the selected type and donor type

    Raises:
        ValueError: If ``selected_type`` or ``donor_type`` is not one of the
            documented values.
    """
    # For each view: (columns to drop, columns to rename).
    reshape_rules = {
        "donors": (
            ["DERecipientcode", "RecipientName"],
            {"DEDonorcode": "CountryCode", "DonorName": "CountryName"},
        ),
        "recipients": (
            ["DEDonorcode", "DonorName"],
            {"DERecipientcode": "CountryCode", "RecipientName": "CountryName"},
        ),
    }
    # Value of the DonorType column that each filterable donor type maps to.
    donor_type_labels = {
        "bilateral": "Donor Country",
        "multilateral": "Multilateral Donor",
    }

    # Validate before touching the file so a typo fails fast with a clear
    # message instead of an UnboundLocalError (selected_type) or a silently
    # unfiltered result (donor_type).
    if selected_type not in reshape_rules:
        raise ValueError(
            f"selected_type must be 'donors' or 'recipients', got {selected_type!r}"
        )
    if donor_type != "all" and donor_type not in donor_type_labels:
        raise ValueError(
            "donor_type must be 'bilateral', 'multilateral' or 'all', "
            f"got {donor_type!r}"
        )

    df = pl.read_csv(source=source, columns=columns)

    # Reshape the table based on the selected type
    dropped_columns, renamed_columns = reshape_rules[selected_type]
    data = df.drop(dropped_columns).rename(renamed_columns)

    # Filter the data based on the donor type. DonorType is neither dropped nor
    # renamed above, so it can be referenced on the reshaped frame directly.
    if donor_type in donor_type_labels:
        data = data.filter(pl.col("DonorType") == donor_type_labels[donor_type])

    return data


# Example call: donor-side view of SOURCE; donor_type="all" means no
# DonorType filtering is applied. The returned frame is the cell output.
read_data("donors", source=SOURCE, columns=COLS, donor_type="all")


In [None]:
def from_dummies(df, separator="_"):
    """Collapse one-hot ("dummy") columns into one column per name prefix.

    Every column name is split at the LAST occurrence of ``separator`` into a
    prefix and a value. For each prefix, the output column holds, per row, the
    value of the first dummy column (in frame order) that equals 1, or null
    when none of them does.

    NOTE(review): a later cell in this notebook defines ``from_dummies`` again
    with an extra ``selected_vars`` parameter; on a top-to-bottom run that
    definition shadows this one.

    Args:
        df: polars DataFrame whose columns are all named
            ``<prefix><separator><value>``.
        separator: string separating the prefix from the value.

    Returns:
        A polars DataFrame with one column per distinct prefix.

    Raises:
        ValueError: If a column name does not contain ``separator``.
    """
    col_exprs = {}

    for col in df.columns:
        # rpartition splits at the last separator, like rsplit(maxsplit=1),
        # but lets us report a missing separator with a useful message.
        name, found, value = col.rpartition(separator)
        if not found:
            raise ValueError(
                f"Column {col!r} does not contain the separator {separator!r}"
            )
        # pl.lit is required: a bare string passed to .then() is parsed by
        # polars as a column name, not as a literal value.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
        col_exprs.setdefault(name, []).append(expr)

    return df.select(
        pl.coalesce(exprs).alias(  # keep the first non-null expression value by row
            name
        )
        for name, exprs in col_exprs.items()
    )


In [None]:
# NOTE(review): `functions.data_operations` is a project module, presumably
# resolved through the ../src entry added to sys.path in the first cell —
# confirm. Its behaviour is not visible from this notebook.
from functions.data_operations import fetch_data

# Call the project helper with "recipients"; the return value is the cell output.
fetch_data("recipients")


In [None]:
from typing import List

import polars as pl


def from_dummies(
    df: pl.DataFrame, selected_vars: List[str], separator="_"
) -> pl.DataFrame:
    """Collapse selected one-hot ("dummy") columns into one column per variable.

    Each column name is split at the LAST occurrence of ``separator`` into a
    base name and a value. Columns whose base name is in ``selected_vars`` are
    merged: per row, the output holds the value of the first such column (in
    frame order) that equals 1, or null when none does. All other columns,
    including those that do not contain ``separator`` at all, are ignored.

    Args:
        df: Frame containing the dummy columns (other columns may be present).
        selected_vars: Base names of the variables to rebuild.
        separator: String separating the base name from the value.

    Returns:
        A frame with one string column per selected base name that had at
        least one matching dummy column, in order of first appearance in
        ``df``.
    """
    col_exprs = {}

    # Loop through each column to build the expressions
    for col in df.columns:
        # rpartition (unlike unpacking rsplit) does not raise when the
        # separator is missing, so unrelated columns such as "Year" are
        # simply skipped instead of crashing the whole call.
        name, found, value = col.rpartition(separator)
        if not found:
            continue

        # Process only the columns whose base name is in selected_vars
        if name in selected_vars:
            # pl.lit keeps the value a literal; a bare string would be read
            # by polars as a column name.
            expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
            col_exprs.setdefault(name, []).append(expr)  # Group by base name

    # Select the coalesced expressions for each selected base name
    return df.select(
        [
            pl.coalesce(exprs).alias(name)  # Combine expressions into a single column
            for name, exprs in col_exprs.items()
        ]
    )


# Example usage: three dummy columns for "climate_class" and two for
# "other_class"; the last row has no "other_class" dummy set.
data = pl.DataFrame(
    {
        "climate_class_500": [1, 0, 0],
        "climate_class_Adaptation": [0, 1, 0],
        "climate_class_Mitigation": [0, 0, 1],
        "other_class_Option1": [1, 0, 0],
        "other_class_Option2": [0, 1, 0],
    }
)

# Specify which variables to merge
selected_vars = ["climate_class", "other_class"]

reshaped_data = from_dummies(data, selected_vars)

# Inline checks so a Restart & Run All fails loudly if the reshaping regresses.
assert reshaped_data.columns == ["climate_class", "other_class"]
assert reshaped_data["climate_class"].to_list() == ["500", "Adaptation", "Mitigation"]
# A row with no dummy set for a variable yields null.
assert reshaped_data["other_class"].to_list() == ["Option1", "Option2", None]

# Trailing expression instead of print(): the notebook renders the frame richly.
reshaped_data
