# KONTOUDAKIS NIKOS 283024

## libraries and utilities

In [2]:
# required in order to run on my Linux PC
import sys
sys.path.append('/usr/lib/python3.13/site-packages') 

In [63]:
import pandas as pd
import psycopg2
from io import StringIO

# Data Ingestion and Cleaning

In [41]:
def check_dataframe_issues(df):
    """Report missing values and duplicated rows found in ``df``.

    Returns a dict that may contain:
      * ``'missing_data'``  - maps each column holding nulls to the list of
        index labels where the value is missing;
      * ``'duplicate_rows'`` - index labels of rows that repeat an earlier
        row once the first column (treated as the id) is ignored.
    A key is present only when the corresponding problem was found, so an
    empty dict means the frame passed both checks.
    """
    report = {}

    # collect, column by column, the index labels of null cells
    null_mask = df.isnull()
    missing_by_column = {
        column: null_mask.index[null_mask[column]].tolist()
        for column in df.columns
        if null_mask[column].any()
    }
    if missing_by_column:
        report['missing_data'] = missing_by_column

    # the first column is the id, so leave it out when comparing rows
    repeated = df.iloc[:, 1:].duplicated()
    if repeated.any():
        report['duplicate_rows'] = repeated[repeated].index.tolist()

    return report

### If an input city is found to have a different value in the "StandardCity" column, it is considered an inconsistency

In [38]:
def find_inconsistent_cities(input_df, cities_lookup_df):
    """Locate rows of ``input_df`` that use a non-standard city spelling.

    A spelling is non-standard when its ``RawCity`` entry in
    ``cities_lookup_df`` differs from the matching ``StandardCity`` entry.
    The 'City' and 'Region' columns of ``input_df`` (whichever exist) are
    searched for those raw spellings.

    Prints the list of raw spellings being searched for, and returns a dict
    mapping each checked column name to the list of index labels whose value
    is one of them. Columns without any hit are left out, so an empty dict
    means no inconsistency was found.
    """
    issues = {}

    # keep only the lookup rows whose raw spelling differs from the standard one
    mismatched = cities_lookup_df[cities_lookup_df['RawCity'] != cities_lookup_df['StandardCity']]
    # single quotes inside the f-string: reusing the outer quote type is a
    # SyntaxError on Python versions older than 3.12
    print(f"We are searching for the following errors: {mismatched['RawCity'].to_list()}")
    raw_city_names = mismatched['RawCity'].unique()

    # the same membership test applies to both columns; skip absent ones
    for column in ('City', 'Region'):
        if column not in input_df.columns:
            continue
        hits = input_df.index[input_df[column].isin(raw_city_names)]
        if not hits.empty:
            issues[column] = hits.tolist()

    return issues

In [39]:
# Tables that are validated below and later loaded into the database
stores_df = pd.read_csv("./cityretail_dataset/stores.csv")
sales_df = pd.read_csv("./cityretail_dataset/sales.csv")
products_df = pd.read_csv("./cityretail_dataset/products.csv")
calendar_df = pd.read_csv("./cityretail_dataset/calendar.csv")

# Raw -> standard city spellings; kept out of all_data_dfs
cities_lookup_df = pd.read_csv("./cityretail_dataset/cities_lookup.csv")

all_data_dfs = [
    calendar_df,
    products_df,
    sales_df,
    stores_df,
]

In [40]:
# Print the missing-data / duplicate report of every table; {} means clean
for df in all_data_dfs:
    print(check_dataframe_issues(df=df))

{}
{}
{}
{}


In [42]:
# Look for non-standard city spellings in the stores table
inconsistent_cities = find_inconsistent_cities(
    input_df=stores_df,
    cities_lookup_df=cities_lookup_df,
)
print(f"inconsistent cities found: {inconsistent_cities}")

We are searching for the following errors: ['Athns', 'Thess', 'Patrass', 'Herakleion', 'Larisa']
inconsistent cities found: {}


### Since all data are valid and have no inconsistencies, they will be inserted as shown below

In [52]:
# Show the first five rows of every table as a quick visual sanity check
for df in all_data_dfs:
    display(df.head(n=5))

Unnamed: 0,DateID,Date,Year,Quarter,Month,Day,Weekday
0,1,2024-01-01,2024,1,1,1,Monday
1,2,2024-01-02,2024,1,1,2,Tuesday
2,3,2024-01-03,2024,1,1,3,Wednesday
3,4,2024-01-04,2024,1,1,4,Thursday
4,5,2024-01-05,2024,1,1,5,Friday


Unnamed: 0,ProductID,ProductName,Category,Subcategory,CostPrice,SalePrice
0,1001,Product_1,Dairy,Medium,16.03,25.61
1,1002,Product_2,Household,Low,5.19,7.2
2,1003,Product_3,Beverages,Medium,11.01,16.65
3,1004,Product_4,Dairy,Medium,12.46,19.04
4,1005,Product_5,Dairy,Medium,2.36,3.09


Unnamed: 0,SalesID,DateID,ProductID,StoreID,QtySold,Revenue
0,1,125,1015,208,3,40.35
1,2,71,1014,208,4,69.52
2,3,209,1011,203,2,57.72
3,4,269,1007,207,9,69.66
4,5,85,1008,202,1,4.76


Unnamed: 0,StoreID,StoreName,City,Region
0,201,Store_1,Heraklion,Crete
1,202,Store_2,Larissa,Thessaly
2,203,Store_3,Athens,Attica
3,204,Store_4,Larissa,Thessaly
4,205,Store_5,Larissa,Thessaly


# OLAP Schema

In [54]:
# Connection settings, unpacked as keyword arguments into psycopg2.connect()
db_params = dict(
    host='localhost',
    port=5432,
    dbname='project',
    user='postgres',
    password='',  # empty password - presumably a local server that needs none; confirm
)

### Read the ddl and create tables/constraints

In [62]:
# Load the star-schema DDL script from disk
with open("./cityretail_dataset/star_schema.sql", 'r') as f:
    sql_commands = f.read()

# split statements by semicolon and strip whitespace
# NOTE: this is a plain text split, so it assumes the script has no ';'
# inside string literals or function bodies
statements = [s.strip() for s in sql_commands.split(';') if s.strip()]

# psycopg2's `with connection:` only ends the transaction and leaves the
# connection open, so the connection is closed explicitly here.
conn = psycopg2.connect(**db_params)
try:
    with conn.cursor() as cur:
        for stmt in statements:
            try:
                cur.execute(stmt)
            except psycopg2.Error as e:
                print(f"error executing statement:\n{stmt}\n{e}")
                # PostgreSQL aborts the whole transaction after an error:
                # later statements would all fail and nothing could be
                # committed, so undo the partial work and stop here.
                conn.rollback()
                break
        else:
            # every statement succeeded -> persist the schema
            conn.commit()
finally:
    conn.close()

In [66]:
def insert_df_copy(df, table_name, db_params):
    """Bulk-load ``df`` into the PostgreSQL table ``table_name`` using COPY.

    The frame is serialised to an in-memory CSV (no header, no index) and
    streamed to the server, so the DataFrame columns must be in the same
    order as the table columns. Missing values are written as empty fields,
    which COPY's csv format reads as NULL.

    Parameters:
        df: pandas DataFrame holding the rows to insert.
        table_name: name of the target table (quoted as a single identifier).
        db_params: dict of keyword arguments for ``psycopg2.connect``.

    Returns None. A database error during the copy is printed as
    ``copy failed: ...`` and the transaction is rolled back instead of being
    raised; errors while connecting still propagate.
    """
    from psycopg2 import sql  # local import: only this function needs it

    buffer = StringIO()
    df.to_csv(buffer, index=False, header=False)
    buffer.seek(0)

    # to_csv emits real CSV (fields containing commas or quotes get quoted),
    # so the server must parse the stream as csv rather than as the default
    # text format, which would split such fields on the embedded comma.
    copy_stmt = sql.SQL("COPY {} FROM STDIN WITH (FORMAT csv)").format(
        sql.Identifier(table_name)
    )

    # `with connection:` would not close the connection, so manage it by hand
    conn = psycopg2.connect(**db_params)
    try:
        with conn.cursor() as cur:
            cur.copy_expert(copy_stmt, buffer)
        conn.commit()
    except psycopg2.Error as e:
        conn.rollback()
        print(f"copy failed: {e}")
    finally:
        conn.close()

In [69]:
# Same order as the original cell: the three dim* tables first, factsales last
insert_df_copy(calendar_df, "dimdate", db_params)
insert_df_copy(products_df, "dimproduct", db_params)
insert_df_copy(stores_df, "dimstore", db_params)
insert_df_copy(sales_df, "factsales", db_params)

# Measures and Aggregations