In [24]:
import os
import psycopg2
from dotenv import load_dotenv
import pandas as pd
import chardet


load_dotenv()

dis_db_password = os.getenv("REMOTE_POSTGRES_DIS_PASSWORD")

conn_params = {
    'host': 'vsisdb.informatik.uni-hamburg.de',
    'dbname': 'dis-2025',
    'user': 'vsisp42',
    'password': dis_db_password
}


In [26]:
def show_existing_tables():
    conn = psycopg2.connect(**conn_params)

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = %s
                ORDER BY table_name;
                """,
                (conn_params["user"],),
            )
            tables = cur.fetchall()

            for table in tables:
                print("-", table[0])
show_existing_tables()

- article
- city
- country
- productcategory
- productfamily
- productgroup
- region
- shop


In [8]:
with open('resources/sales.csv', 'rb') as f:
    result = chardet.detect(f.read(10000))

print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [16]:
bad_lines = []
with open('resources/sales.csv', encoding='ISO-8859-1') as f:
    for i, line in enumerate(f, start=1):
        if line.count(';') != 4:  # 4 Semikolons = 5 Spalten
            bad_lines.append((i, line.strip()))

for i, l in bad_lines:
    print(f"Fehler in Zeile {i}: {l}")

Fehler in Zeile 35906: 06.04.2019;12.03.2019;Superstore Dresden;AEG Öko Lavatherm 59850 Sensidry;3;2997,00


In [36]:
def load_csv_data():
    return pd.read_csv(
        'resources/sales.csv',
        encoding='ISO-8859-1',
        sep=';',
        decimal=',',
        on_bad_lines='skip'
    )

df = load_csv_data()
print(df.head())


         Date               Shop                           Article  Sold  \
0  01.01.2019  Superstore Berlin  AEG Öko Lavatherm 59850 Sensidry    25   
1  01.01.2019  Superstore Berlin     AEG Öko-Lavamat Öko Plus 1400    25   
2  01.01.2019  Superstore Berlin              Bauknecht TK Care 6B    13   
3  01.01.2019  Superstore Berlin      Bauknecht WA Sensitive 36 DI     2   
4  01.01.2019  Superstore Berlin                       BenQ DE350P    31   

    Revenue  
0  24975.00  
1  14975.00  
2   3639.74  
3    699.80  
4   8369.69  


In [29]:
def create_star_schema_tables(conn):
    with conn.cursor() as cur:
        # Produktdimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS dim_product (
                product_id SERIAL PRIMARY KEY,
                article_id INT,
                article_name VARCHAR(255),
                product_group_id INT,
                product_group_name VARCHAR(255),
                product_family_id INT,
                product_family_name VARCHAR(255),
                product_category_id INT,
                product_category_name VARCHAR(255)
            );
        """)



        # Geodimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS dim_geo (
                geo_id SERIAL PRIMARY KEY,
                shop_id INT,
                shop_name VARCHAR(255),
                city_id INT,
                city_name VARCHAR(255),
                region_id INT,
                region_name VARCHAR(255),
                country_id INT,
                country_name VARCHAR(255)
            );
        """)

        # Zeitdimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS dim_time (
                time_id SERIAL PRIMARY KEY,
                date DATE,
                day INT,
                month INT,
                quarter INT,
                year INT
            );
        """)

        # Faktentabelle
        cur.execute("""
            CREATE TABLE IF NOT EXISTS fact_sales (
                sales_id SERIAL PRIMARY KEY,
                time_id INT,
                geo_id INT,
                product_id INT,
                quantity INT,
                revenue NUMERIC,

                CONSTRAINT fk_time FOREIGN KEY (time_id) REFERENCES dim_time(time_id),
                CONSTRAINT fk_geo FOREIGN KEY (geo_id) REFERENCES dim_geo(geo_id),
                CONSTRAINT fk_product FOREIGN KEY (product_id) REFERENCES dim_product(product_id)
            );
        """)

    conn.commit()
    print("Star-Schema-Tabellen erfolgreich erstellt.")

with psycopg2.connect(**conn_params) as conn:
    create_star_schema_tables(conn)

Star-Schema-Tabellen erfolgreich erstellt.


In [32]:
def populate_dim_product(conn):
    with conn.cursor() as cur:
         cur.execute("""
            INSERT INTO dim_product (
                article_id,
                article_name,
                product_group_id,
                product_group_name,
                product_family_id,
                product_family_name,
                product_category_id,
                product_category_name
            )
            SELECT
                a.ArticleID,
                a.Name,
                pg.ProductGroupID,
                pg.Name,
                pf.ProductFamilyID,
                pf.Name,
                pc.ProductCategoryID,
                pc.Name
            FROM Article a
                INNER JOIN ProductGroup pg ON pg.productgroupid = a.productgroupid
                INNER JOIN vsisp42.productfamily pf on pf.productfamilyid = pg.productfamilyid
                INNER JOIN vsisp42.productcategory pc on pc.productcategoryid = pf.productcategoryid

        """)
    conn.commit()
    print("dim_product erfolgreich befüllt.")

with psycopg2.connect(**conn_params) as conn:
    populate_dim_product(conn)

DimProduct erfolgreich befüllt.


In [34]:
def populate_dim_geo(conn):
    with conn.cursor() as cur:
        cur.execute("""
            INSERT INTO dim_geo (
                shop_id,
                shop_name,
                city_id,
                city_name,
                region_id,
                region_name,
                country_id,
                country_name
            )
            SELECT
                s.ShopID,
                s.Name,
                c.CityID,
                c.Name,
                r.RegionID,
                r.Name,
                co.CountryID,
                co.Name
            FROM Shop s
                INNER JOIN City c ON s.CityID = c.CityID
                INNER JOIN Region r ON c.RegionID = r.RegionID
                INNER JOIN Country co ON r.CountryID = co.CountryID
        """)
    conn.commit()
    print("dim_geo erfolgreich befüllt.")

with psycopg2.connect(**conn_params) as conn:
    populate_dim_geo(conn)

dim_geo erfolgreich befüllt.


In [39]:
df = load_csv_data()

def populate_dim_time(conn, df_sales):

    df_sales['date'] = pd.to_datetime(df_sales['date'], errors='coerce')
    print(df_sales.head())

with psycopg2.connect(**conn_params) as conn:
    populate_dim_time(conn, df)

KeyError: 'date'