In [21]:
import os
import psycopg2
from dotenv import load_dotenv
import pandas as pd
import chardet


load_dotenv()

dis_db_password = os.getenv("REMOTE_POSTGRES_DIS_PASSWORD")

conn_params = {
    'host': 'vsisdb.informatik.uni-hamburg.de',
    'dbname': 'dis-2025',
    'user': 'vsisp42',
    'password': dis_db_password
}


In [23]:
def show_existing_tables():
    conn = psycopg2.connect(**conn_params)

    with conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT table_name
                FROM information_schema.tables
                WHERE table_schema = %s
                ORDER BY table_name;
                """,
                (conn_params["user"],),
            )
            tables = cur.fetchall()

            for table in tables:
                print("-", table[0])
show_existing_tables()

KeyboardInterrupt: 

In [8]:
with open('resources/sales.csv', 'rb') as f:
    result = chardet.detect(f.read(10000))

print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [16]:
bad_lines = []
with open('resources/sales.csv', encoding='ISO-8859-1') as f:
    for i, line in enumerate(f, start=1):
        if line.count(';') != 4:  # 4 Semikolons = 5 Spalten
            bad_lines.append((i, line.strip()))

for i, l in bad_lines:
    print(f"Fehler in Zeile {i}: {l}")

Fehler in Zeile 35906: 06.04.2019;12.03.2019;Superstore Dresden;AEG Öko Lavatherm 59850 Sensidry;3;2997,00


In [17]:
def load_csv_data():
    df = pd.read_csv(
        'resources/sales.csv',
        encoding='ISO-8859-1',
        sep=';',
        decimal=',',
        on_bad_lines='skip'
    )
    print(df.head())

load_csv_data()

         Date               Shop                           Article  Sold  \
0  01.01.2019  Superstore Berlin  AEG Öko Lavatherm 59850 Sensidry    25   
1  01.01.2019  Superstore Berlin     AEG Öko-Lavamat Öko Plus 1400    25   
2  01.01.2019  Superstore Berlin              Bauknecht TK Care 6B    13   
3  01.01.2019  Superstore Berlin      Bauknecht WA Sensitive 36 DI     2   
4  01.01.2019  Superstore Berlin                       BenQ DE350P    31   

    Revenue  
0  24975.00  
1  14975.00  
2   3639.74  
3    699.80  
4   8369.69  


In [22]:
def create_star_schema_tables(conn):
    with conn.cursor() as cur:
        # Produktdimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS DimProduct (
                product_id SERIAL PRIMARY KEY,
                article_id INT,
                article_name VARCHAR(255),
                product_group_id INT,
                product_group_name VARCHAR(255),
                product_family_id INT,
                product_family_name VARCHAR(255),
                product_category_id INT,
                product_category_name VARCHAR(255)
            );
        """)

        # Geodimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS DimGeo (
                geo_id SERIAL PRIMARY KEY,
                shop_id INT,
                shop_name VARCHAR(255),
                city_id INT,
                city_name VARCHAR(255),
                region_id INT,
                region_name VARCHAR(255),
                country_id INT,
                country_name VARCHAR(255)
            );
        """)

        # Zeitdimension
        cur.execute("""
            CREATE TABLE IF NOT EXISTS DimTime (
                time_id SERIAL PRIMARY KEY,
                date DATE,
                day INT,
                month INT,
                quarter INT,
                year INT
            );
        """)

        # Faktentabelle
        cur.execute("""
            CREATE TABLE IF NOT EXISTS FactSales (
                sales_id SERIAL PRIMARY KEY,
                time_id INT,
                geo_id INT,
                product_id INT,
                quantity INT,
                revenue NUMERIC,

                CONSTRAINT fk_time FOREIGN KEY (time_id) REFERENCES DimTime(time_id),
                CONSTRAINT fk_geo FOREIGN KEY (geo_id) REFERENCES DimGeo(geo_id),
                CONSTRAINT fk_product FOREIGN KEY (product_id) REFERENCES DimProduct(product_id)
            );
        """)

    conn.commit()
    print("Star-Schema-Tabellen erfolgreich erstellt.")

with psycopg2.connect(**conn_params) as conn:
    create_star_schema_tables(conn)

OperationalError: connection to server at "vsisdb.informatik.uni-hamburg.de" (134.100.14.171), port 5432 failed: Operation timed out
	Is the server running on that host and accepting TCP/IP connections?
