In [6]:
import os
import sys

# Add parent directory to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.db_connection import DatabaseConnection
import uuid
import pandas as pd

In [8]:
db_connection = DatabaseConnection()
cleaned_data = db_connection.read_dataframe_from_db(
    "SELECT * FROM silver.covid where ingested_at = (SELECT MAX(ingested_at) FROM silver.covid);"
)
cleaned_data.shape

(43065, 9)

In [24]:
def fill_date_dim_table(cleansed_data: pd.DataFrame) -> pd.DataFrame:
    ingested_date = cleaned_data["ingested_at"].max()
    iso_date = ingested_date.isocalendar()
    date_key = int(str(ingested_date).replace("-", ""))
    full_date = ingested_date
    day_of_week = iso_date[2]
    day_of_month = ingested_date.day
    day_name = ingested_date.strftime("%A")
    week_of_year = iso_date[1]
    month = ingested_date.month
    month_name = ingested_date.strftime("%B")
    quarter = (month - 1) // 3 + 1
    year = iso_date[0]
    is_weekend = day_of_week >= 5

    date_record = {
        "date_key": date_key,
        "full_date": full_date,
        "day_of_week": day_of_week,
        "day_of_month": day_of_month,
        "week_of_year": week_of_year,
        "month": month,
        "month_name": month_name,
        "day_name": day_name,
        "quarter": quarter,
        "year": year,
        "is_weekend": is_weekend,
    }

    return pd.DataFrame([date_record])


date_dim = fill_date_dim_table(cleaned_data)


date_dim.head(1)

Unnamed: 0,date_key,full_date,day_of_week,day_of_month,week_of_year,month,month_name,day_name,quarter,year,is_weekend
0,20210102,2021-01-02,6,2,53,1,January,Saturday,1,2020,True


In [25]:
def fill_region_dim(cleansed_data: pd.DataFrame) -> pd.DataFrame:
    stored_regions = db_connection.read_dataframe_from_db(
        "SELECT * FROM gold.region_dim;"
    )
    new_regions = cleansed_data[["province_state", "country_region"]]
    new_regions.drop_duplicates(
        subset=("province_state", "country_region"), inplace=True
    )

    delta_regions = new_regions.merge(
        stored_regions[["country_region", "province_state"]],
        on=["country_region", "province_state"],
        how="left",
        indicator=True,
    )
    delta_regions = delta_regions[delta_regions["_merge"] == "left_only"]
    delta_regions.drop(columns=["_merge"], inplace=True)

    delta_regions["region_key"] = delta_regions.apply(lambda _: uuid.uuid4(), axis=1)
    return delta_regions


region_dim = fill_region_dim(cleaned_data)

region_dim

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_regions.drop_duplicates(subset=('province_state' , 'country_region'),inplace=True)


Unnamed: 0,province_state,country_region,region_key


In [26]:
def load_all_dimension(dims_tables: list):
    for dim in dims_tables:
        db_connection.load_dataframe_into_db(dim["table"], "gold", dim["name"])


dimension_list = [
    {"table": date_dim, "name": "date_dim"},
    {"table": region_dim, "name": "region_dim"},
]

load_all_dimension(dimension_list)

In [None]:
def fill_fact_table(cleaned_data: pd.DataFrame, date_dim: pd.DataFrame) -> pd.DataFrame:
    all_regions = db_connection.read_dataframe_from_db("SELECT * FROM gold.region_dim;")

    cleaned_data = cleaned_data.merge(
        all_regions, how="left", on=["province_state", "country_region"]
    )

    fact_data = cleaned_data[
        [
            "confirmed",
            "deaths",
            "recovered",
            "active",
            "incident_rate",
            "case_fatality_ratio",
            "region_key",
        ]
    ]
    fact_data["date_key"] = date_dim.loc[0, "date_key"]
    return fact_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_data['date_key'] = date_dim.loc[0,'date_key']


Unnamed: 0,confirmed,deaths,recovered,active,incident_rate,case_fatality_ratio,region_key,date_key
0,52586,2211,41727,8648,135.084102,4.204541,bb27ce1e-29ba-45e7-b008-a1be6e2bed91,20210102
1,58991,1190,34353,23448,2049.864480,2.017257,76b9031d-4311-4b5e-b3fe-ceaf4288d41b,20210102
2,100159,2769,67611,29779,228.407338,2.764604,3d4a7061-3e60-4b65-8e08-1963142309b3,20210102
3,8166,84,7463,619,10568.821588,1.028655,289c93e8-8d09-4b97-adeb-ef5c3774e4b1,20210102
4,17608,407,11189,6012,53.574686,2.311449,59347d78-414a-4e3c-be25-ad2b699220a2,20210102
...,...,...,...,...,...,...,...,...
3910,0,0,0,0,0.000000,0.000000,7f96591c-715c-4dc8-a052-1f4b9f9e1baf,20210102
3911,0,0,0,0,0.000000,0.000000,6aebe85d-8edd-42a8-92f7-9e96ba0dbd5c,20210102
3912,0,0,0,0,0.000000,0.000000,816fdf9d-b7f5-4c7e-ae20-2bf07a7424e2,20210102
3913,0,0,0,0,0.000000,0.000000,c276f3b5-90f5-47f9-ba59-b77ebfe81400,20210102


In [29]:
def load_fact_table(fact_data: pd.DataFrame):
    db_connection.load_dataframe_into_db(fact_data, "gold", "fact")


load_fact_table(fact_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fact_data['date_key'] = date_dim.loc[0,'date_key']


{'date_dim_records': 1, 'region_dim_records': 765, 'fact_records': 7830}

In [5]:
df = db_connection.read_dataframe_from_db("SELECT * FROM GOLD.date_dim;")

NameError: name 'db_connection' is not defined

In [5]:
df

Unnamed: 0,date_key,full_date,day_of_week,day_of_month,day_name,week_of_year,month,month_name,quarter,year,is_weekend
0,20210101,2021-01-01,5,1,Friday,53,1,January,1,2021,False


In [4]:
import pandas as pd

df = pd.DataFrame({"ingested_at": [pd.to_datetime("01-01-2025").date()]})

df

Unnamed: 0,ingested_at
0,2025-01-01


In [13]:
df = db_connection.read_dataframe_from_db("SELECT * FROM gold.region_dim;")

In [10]:
df

Unnamed: 0,province_state,country_region,confirmed,deaths,recovered,active,incident_rate,case_fatality_ratio,ingested_at
0,,Afghanistan,52513,2201,41727,8585,134.896578,4.191343,2021-01-01
1,,Albania,58316,1181,33634,23501,2026.409062,2.025173,2021-01-01
2,,Algeria,99897,2762,67395,29740,227.809861,2.764848,2021-01-01
3,,Andorra,8117,84,7463,570,10505.403482,1.034865,2021-01-01
4,,Angola,17568,405,11146,6017,53.452981,2.305328,2021-01-01
...,...,...,...,...,...,...,...,...,...
43060,,Ukraine,0,0,0,0,0.000000,0.000000,2021-01-01
43061,,Nauru,0,0,0,0,0.000000,0.000000,2021-01-01
43062,Niue,New Zealand,0,0,0,0,0.000000,0.000000,2021-01-01
43063,,Tuvalu,0,0,0,0,0.000000,0.000000,2021-01-01


In [11]:
df = pd.DataFrame(
    [
        {
            "province_state": None,
            "country_region": "Afghanistan",
            "confirmed": 52513,
            "deaths": 2201,
            "recovered": 41727,
            "active": 8585,
            "incident_rate": 134.896578,
            "case_fatality_ratio": 4.191343,
            "ingested_at": "2021-01-01",
        }
    ]
)
df

Unnamed: 0,province_state,country_region,confirmed,deaths,recovered,active,incident_rate,case_fatality_ratio,ingested_at
0,,Afghanistan,52513,2201,41727,8585,134.896578,4.191343,2021-01-01


In [None]:



from datetime import date


iso = date.today().isocalendar



<function date.isocalendar>