### Setting up the environment 

In [12]:
from sqlalchemy import create_engine
import pandas as pd

# Connection parameters
PGHOST = "rammyserver.postgres.database.azure.com"
PGUSER = "rammysubiate"
PGPASSWORD = "data11212*" 
PGPORT = 5432
PGDATABASE = "postgres"

# Create engine
engine = create_engine(
    f"postgresql+psycopg2://{PGUSER}:{PGPASSWORD}@{PGHOST}:{PGPORT}/{PGDATABASE}"
)


### Reusable SQL Query Function in Python

In [13]:
def run_query(query):
    """
    Executes a SQL query using the provided SQLAlchemy engine and returns a DataFrame.
    
    Parameters:
    - query (str): The SQL query to execute.
    - engine : SQLAlchemy engine connected to DB.
    
    Returns:
    - pd.DataFrame: Result of the query.
    """
    try:
        df = pd.read_sql(query, engine)
        return df
    except Exception as e:
        print(f"Error running query: {e}")
        return None


In [14]:
query = """

SELECT *
FROM fact_transactions
INNER JOIN dim_date
    USING(date_id)
INNER JOIN dim_product
    USING(product_id)
INNER JOIN dim_customer
    USING(customer_id)
LIMIT 5;
"""

df = run_query(query)
df

Unnamed: 0,customer_id,product_id,date_id,transaction_id,transaction_num,quantity,revenue,date,product_num,product_name,price,customer_num,country,customer_frequency,customer_contribution
0,1296,5857,192,1,581482,12,257.64,2019-12-09,22485,set of 2 wooden market crates,21.47,17490,United Kingdom,more_frequent,most_contributor
1,3005,1209,192,2,581475,36,383.4,2019-12-09,22596,christmas star wish list chalkboard,10.65,13069,United Kingdom,most_frequent,most_contributor
2,3005,18046,192,3,581475,12,138.36,2019-12-09,23235,storage tin vintage leaf,11.53,13069,United Kingdom,most_frequent,most_contributor
3,3005,8297,192,4,581475,12,127.8,2019-12-09,23272,tree t-light holder willie winkie,10.65,13069,United Kingdom,most_frequent,most_contributor
4,3005,11255,192,5,581475,6,71.64,2019-12-09,23239,set of 4 knick knack tins poppies,11.94,13069,United Kingdom,most_frequent,most_contributor


### Checking Column Names and Data Types

In [15]:
query = """
SELECT column_name, data_type
FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name IN (
        'fact_transactions',
        'dim_date', 
        'dim_product', 
        'dim_customer'
)
ORDER BY ordinal_position

"""
data_check = run_query(query)
data_check


Unnamed: 0,column_name,data_type
0,date_id,integer
1,transaction_id,integer
2,customer_id,integer
3,product_id,integer
4,date,date
5,customer_num,integer
6,transaction_num,integer
7,product_num,character varying
8,product_name,character varying
9,date_id,integer


### Row Count

In [16]:
query = """

SELECT COUNT(*) AS row_count
FROM fact_transactions
INNER JOIN dim_date
    USING(date_id)
INNER JOIN dim_product
    USING(product_id)
INNER JOIN dim_customer
    USING(customer_id)


"""
row_count = run_query(query)
row_count


Unnamed: 0,row_count
0,522601


### Checking for NULLs

In [17]:
query = """

SELECT COUNT(*) - COUNT(transaction_num) AS transaction_num_nulls,
    COUNT(*) - COUNT(product_num) AS product_num_nulls,
    COUNT(*) - COUNT(product_name) AS product_name_nulls,
    COUNT(*) - COUNT(price) AS price_nulls,
    COUNT(*) - COUNT(quantity) AS quantity_nulls
FROM fact_transactions
INNER JOIN dim_date
    USING(date_id)
INNER JOIN dim_product
    USING(product_id)
INNER JOIN dim_customer
    USING(customer_id)

"""

null_check = run_query(query)
null_check

Unnamed: 0,transaction_num_nulls,product_num_nulls,product_name_nulls,price_nulls,quantity_nulls
0,0,0,0,0,0


### Checking for Duplicates

In [18]:
columns = ["transaction_num","date","product_num","product_name","price",
           "quantity","revenue","customer_num","country", "customer_frequency", "customer_contribution"]

cols = (", ").join(columns)
query = f"""
WITH ranked AS (
    SELECT *,
        ROW_NUMBER() OVER(PARTITION BY {cols} ORDER BY date) AS rank
        FROM fact_transactions
        INNER JOIN dim_date
            USING(date_id)
        INNER JOIN dim_product
            USING(product_id)
        INNER JOIN dim_customer
            USING(customer_id)
)
SELECT COUNT(*) AS duplicate_count
FROM ranked
WHERE rank > 1;
"""

dup_check = run_query(query)
dup_check

Unnamed: 0,duplicate_count
0,0
