# Notebook to create the final fact and dimension tables


In [1]:
! pip install azure-storage-blob



## Prepare user-defined functions for the queries

In [2]:
import requests
from selectolax.parser import HTMLParser
import duckdb

# In-memory DuckDB connection used by all queries and UDF registrations in this
# notebook; ':memory:' means no database file is written.
conn = duckdb.connect(':memory:')

def extract_opening_data(url):
    """Download an opening page and scrape its title and move list.

    Args:
        url: Address of the opening page to fetch.

    Returns:
        A dict with the keys ``"opening_name"`` (stripped text of the first
        ``<h1>``) and ``"moves"`` (stripped text of the first element matching
        ``.openings-view-move-list``). A value is ``None`` when its element is
        missing; both are ``None`` when the response status is not 200 or the
        request raises ``requests.RequestException``.
    """
    # Fallback returned for every failure path.
    no_data = {"opening_name": None, "moves": None}

    # Browser-like headers sent with the request.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
    request_headers = {
        "User-Agent": user_agent,
        "Accept-Language": "en-US,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "https://www.google.com",
    }

    try:
        page = requests.get(url, headers=request_headers, timeout=5)
        if page.status_code != 200:
            return no_data

        document = HTMLParser(page.text)

        # Opening name lives in the first <h1>.
        heading = document.css_first("h1")
        # Move list lives in a dedicated container.
        move_list = document.css_first(".openings-view-move-list")

        return {
            "opening_name": heading.text(strip=True) if heading else None,
            "moves": move_list.text(strip=True) if move_list else None,
        }
    except requests.RequestException:
        # Network-level failure: report "nothing found" instead of raising.
        return no_data


def get_opening_family(opening_name: str) -> str:
    """Return the opening's family: the text before the first ':'.

    Names without a colon are returned unchanged.
    """
    # partition() yields the whole string as the head when ':' is absent.
    family, _, _ = opening_name.partition(":")
    return family
    
# Register the Python function on the connection so SQL queries can call
# get_opening_family(...) by name.
conn.create_function('get_opening_family', get_opening_family)


def extract_opening_name(url: str) -> str:
    """Return only the opening name scraped from ``url``.

    Thin wrapper around ``extract_opening_data``; the result is ``None`` when
    that function could not determine a name.
    """
    return extract_opening_data(url)['opening_name']
# Register the scraper wrapper on the connection so SQL queries can call
# extract_opening_name(...) by name.
# NOTE(review): extract_opening_name returns None when the page cannot be fetched;
# confirm how DuckDB treats a None return under its default null handling.
conn.create_function('extract_opening_name', extract_opening_name) 


def format_time_control(timecontrol: str) -> str:
    """Convert a time control given in seconds into a minutes-based label.

    Args:
        timecontrol: Either ``"<seconds>"`` or ``"<seconds>+<increment>"``.

    Returns:
        ``"<minutes>"`` or ``"<minutes>|<increment>"``, with minutes rounded
        down (``"600"`` -> ``"10"``, ``"900+10"`` -> ``"15|10"``).
        Values whose base is not a plain number of seconds (for example the
        PGN forms ``"1/86400"``, ``"-"`` or ``"?"``) are returned unchanged
        instead of raising, so one unusual game cannot abort a whole query.
    """
    base, *increment = timecontrol.split("+")

    try:
        minutes = int(base) // 60
    except ValueError:
        # Not a seconds-based control; keep the original text.
        return timecontrol

    if increment:
        return f"{minutes}|{increment[0]}"
    return str(minutes)



# Register the formatter on the connection so SQL queries can call
# format_time_control(...) by name.
conn.create_function('format_time_control', format_time_control)
# def get_time_class(time_control: str) -> str:
#     # Get the time control from the time control string

#     if "+" in time_control:
#         # Split the time control and increment
#         time_control = int(time_control.split("+")[0])
#     else:
#         time_control = int(time_control)
    
#     time_class = ""
#     if time_control < 180:
#         time_class = "bullet" 
#     elif time_control < 600:
#         time_class = "blitz"
#     else:
#         time_class = "rapid"       

#     return time_class
# conn.create_function('get_time_class', get_time_class)

<duckdb.duckdb.DuckDBPyConnection at 0x7fb62d635c30>

## Load Fact tables


In [3]:
# Setup the notebook
from azure.storage.blob import BlobServiceClient
import duckdb
from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

conn_string = os.getenv('AZURE_STORAGE_CONN_STRING')
if not conn_string:
    # Fail here with an actionable message rather than inside the Azure SDK.
    raise RuntimeError(
        "AZURE_STORAGE_CONN_STRING is not set; define it in the environment or in the .env file."
    )

# Initialize the BlobServiceClient (used for blob existence checks)
blob_service_client = BlobServiceClient.from_connection_string(conn_string)
container_client = blob_service_client.get_container_client('chess-etl-files')

# Install and load DuckDB's azure extension so az:// paths can be queried
conn.sql(
    """INSTALL azure; 
        LOAD azure"""
)

# Single quotes are doubled so the value cannot terminate the SQL literal early.
escaped_conn_string = conn_string.replace("'", "''")

# OR REPLACE keeps this cell re-runnable: a plain CREATE SECRET fails when the
# secret already exists on the connection.
conn.sql(f"""
    CREATE OR REPLACE SECRET azure_adls_secret (
    TYPE azure,
    CONNECTION_STRING '{escaped_conn_string}' );
 """)

# Set the azure_transport_option_type to curl to avoid read error
conn.sql(
    """SET azure_transport_option_type = 'curl';"""
)

In [21]:

# Locations inside the chess-etl-files container: the silver fact file to
# process and the gold opening dimension it feeds.
container_root = "az://rbchesssa.blob.core.windows.net/chess-etl-files"

file_to_update = "silver/fact-2023-03-games.parquet"
dim_file_name = "gold/dim_opening.parquet"

source_file_path = "/".join([container_root, file_to_update])
destination_file_path = "/".join([container_root, dim_file_name])

In [16]:
# Build the opening dimension: create it from scratch, or extend the existing
# gold file with the openings that are new in the current source file.
# Uses dim_file_name / destination_file_path as currently set in the notebook.
blob_client = container_client.get_blob_client(dim_file_name)

new_url_filter = ""
existing_rows = ""
if blob_client.exists():
    print("File exists! Proceeding with DuckDB query...")
    # Only look up URLs that the dimension does not contain yet ...
    new_url_filter = f"""WHERE pgn_eco_url NOT IN (
                SELECT pgn_eco_url
                FROM '{destination_file_path}'
            )"""
    # ... and append the rows that are already stored.
    existing_rows = f"""UNION
        SELECT * FROM '{destination_file_path}'"""

# extract_opening_name performs an HTTP request per call, so the URLs are
# de-duplicated BEFORE the UDF is applied: one request per distinct opening
# instead of one per game row.
cur_dim_openings = conn.sql(f"""
        WITH new_urls AS (
            SELECT DISTINCT pgn_eco_url
            FROM '{source_file_path}'
            {new_url_filter}
        ),
        named_openings AS (
            SELECT pgn_eco_url,
                   extract_opening_name(pgn_eco_url) AS opening_name
            FROM new_urls
        )
        SELECT pgn_eco_url,
               opening_name,
               get_opening_family(opening_name) AS opening_family
        FROM named_openings
        {existing_rows}
""")

# cur_dim_openings = conn.sql(f"""SELECT *,  
#                                         get_opening_family(opening_name) as opening_family 
#                             FROM cur_dim_openings;""")


[[34m2025-03-24T17:11:42.999+0100[0m] {[34m_universal.py:[0m509} INFO[0m - Request URL: 'https://rbchesssa.blob.core.windows.net/chess-etl-files/gold/dim_opening.parquet'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.24.1 Python/3.10.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': 'ad27f7be-08ca-11f0-8218-c5d5f77a6a15'
    'Authorization': 'REDACTED'
No body was attached to the request[0m
[[34m2025-03-24T17:11:44.228+0100[0m] {[34m_universal.py:[0m545} INFO[0m - Response status: 404
Response headers:
    'Transfer-Encoding': 'chunked'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'dd3ace7a-201e-002c-47d7-9c09d5000000'
    'x-ms-client-request-id': 'ad27f7be-08ca-11f0-8218-c5d5f77a6a15'
    'x-ms-version': 'REDACTED'
    'x-ms-error-code': 'BlobNotFound'
   

In [17]:

cur_dim_openings

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌───────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────────┬─────────────────────────┐
│                                        pgn_eco_url                                        │                    opening_name                     │     opening_family      │
│                                          varchar                                          │                       varchar                       │         varchar         │
├───────────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────────┼─────────────────────────┤
│ https://www.chess.com/openings/Petrovs-Defense-Three-Knights-Game                         │ Petrov's Defense: Three Knights Game                │ Petrov's Defense        │
│ https://www.chess.com/openings/Closed-Sicilian-Defense-Traditional-Line-3.Nf3-e5-4.Bc4-d6 │ Closed Sicilian Defense: Traditional

In [12]:
## Create Dimension table for the results and the Game types. 
# Lists the distinct (white_result, result_category) pairs in the source file.
# result_category is derived from pgn_result: '1-0' -> Win, '0-1' -> Loss,
# '1/2-1/2' -> Draw, anything else -> Other.
# NOTE(review): the labels read as White's point of view - confirm that is intended.
# The relation is only displayed here; it is not assigned to a variable.
conn.sql(f"""SELECT DISTINCT white_result,
                    CASE WHEN pgn_result = '1-0' THEN 'Win'
                        WHEN pgn_result = '0-1' THEN 'Loss'
                        WHEN pgn_result = '1/2-1/2' THEN 'Draw'
                        ELSE 'Other'
                    END AS result_category
FROM '{source_file_path}';
""")

IOException: IO Error: AzureBlobStorageFileSystem could not open file: 'az://rbchesssa.blob.core.windows.net/chess-etl-files/silver/fact-2023-03-games.parquet', unknown error occurred, this could mean the credentials used were wrong. Original error message: 'Fail to get a new connection for: https://rbchesssa.blob.core.windows.net. Problem with the SSL CA cert (path? access rights?)' 

In [15]:
# Preview every column of the source fact file; the relation is only displayed.
conn.sql(f"""SELECT * FROM '{source_file_path}';""")

┌─────────────────────────────────────────────┬──────────────┬─────────┬────────────┬─────────┬──────────────┬──────────────┬──────────────┬──────────────┬────────────┬───────────┬─────────────────────┬────────────────┬────────────────┬────────────┬────────────────────────────────────────────────────────────┬──────────────┬─────────┬───────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [49]:
# Dimension table for dim_date
dim_file_name = "gold/dim_date.parquet"
# destination_file_path is built from the opening dimension's file name, so this
# dimension's own location is resolved here instead of reusing it.
dim_date_path = f"az://rbchesssa.blob.core.windows.net/chess-etl-files/{dim_file_name}"

blob_client = container_client.get_blob_client(dim_file_name)

# When the dimension already exists, keep only dates it does not contain yet.
if blob_client.exists():
    new_dates_filter = f"""WHERE game_date NOT IN (
                            SELECT game_date FROM '{dim_date_path}'
                        )"""
else:
    new_dates_filter = ""

# One projection for both cases so the create and update paths always produce
# the same columns (weekday via strftime('%A') in both).
# The filter comes before ORDER BY, as SQL requires.
dim_date = conn.sql(f"""
                SELECT DISTINCT game_date,
                        EXTRACT(YEAR FROM game_date) AS year,
                        EXTRACT(MONTH FROM game_date) AS month,
                        strftime('%B', game_date) AS month_name,
                        EXTRACT(DAY FROM game_date) AS day,
                        strftime('%A', game_date) AS weekday,
                        CASE WHEN CAST(strftime('%m', game_date) AS INTEGER) BETWEEN 1 AND 3 THEN 1
                             WHEN CAST(strftime('%m', game_date) AS INTEGER) BETWEEN 4 AND 6 THEN 2
                             WHEN CAST(strftime('%m', game_date) AS INTEGER) BETWEEN 7 AND 9 THEN 3
                             ELSE 4
                        END AS quarter
                FROM '{source_file_path}'
                {new_dates_filter}
                ORDER BY game_date
""")

dim_date

┌─────────────────────┬───────┬───────┬────────────┬───────┬───────────┬─────────┐
│      game_date      │ year  │ month │ month_name │  day  │  weekday  │ quarter │
│      timestamp      │ int64 │ int64 │  varchar   │ int64 │  varchar  │  int32  │
├─────────────────────┼───────┼───────┼────────────┼───────┼───────────┼─────────┤
│ 2023-03-03 00:00:00 │  2023 │     3 │ March      │     3 │ Friday    │       1 │
│ 2023-03-09 00:00:00 │  2023 │     3 │ March      │     9 │ Thursday  │       1 │
│ 2023-03-15 00:00:00 │  2023 │     3 │ March      │    15 │ Wednesday │       1 │
│ 2023-03-17 00:00:00 │  2023 │     3 │ March      │    17 │ Friday    │       1 │
│ 2023-03-31 00:00:00 │  2023 │     3 │ March      │    31 │ Friday    │       1 │
└─────────────────────┴───────┴───────┴────────────┴───────┴───────────┴─────────┘

In [22]:
# Dimension table for the time control.

dim_file_name = "gold/dim_time_class.parquet"
# destination_file_path is built from the opening dimension's file name, so this
# dimension's own location is resolved here instead of reusing it.
dim_time_class_path = f"az://rbchesssa.blob.core.windows.net/chess-etl-files/{dim_file_name}"
blob_client = container_client.get_blob_client(dim_file_name)

# When the dimension already exists, keep only time controls it does not contain yet.
if blob_client.exists():
    new_controls_filter = f"""WHERE time_control NOT IN (
                    SELECT time_control FROM '{dim_time_class_path}'
                )"""
else:
    new_controls_filter = ""

# The filter runs on the FORMATTED value (the CTE output), because that is the
# representation produced for the dimension; comparing the raw source value
# would never match. DISTINCT is applied in both cases.
dim_time_control = conn.sql(f"""
                WITH formatted AS (
                    SELECT DISTINCT format_time_control(time_control) AS time_control,
                           time_class
                    FROM '{source_file_path}'
                )
                SELECT time_control, time_class
                FROM formatted
                {new_controls_filter}
""")

dim_time_control

[[34m2025-03-24T17:14:27.599+0100[0m] {[34m_universal.py:[0m509} INFO[0m - Request URL: 'https://rbchesssa.blob.core.windows.net/chess-etl-files/gold/dim_time_class.parquet'
Request method: 'HEAD'
Request headers:
    'x-ms-version': 'REDACTED'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.24.1 Python/3.10.12 (Linux-5.15.167.4-microsoft-standard-WSL2-x86_64-with-glibc2.35)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '0f4405e6-08cb-11f0-8218-c5d5f77a6a15'
    'Authorization': 'REDACTED'
No body was attached to the request[0m
[[34m2025-03-24T17:14:28.970+0100[0m] {[34m_universal.py:[0m545} INFO[0m - Response status: 404
Response headers:
    'Transfer-Encoding': 'chunked'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': 'bf6fe4c6-201e-0085-73d7-9cc8a7000000'
    'x-ms-client-request-id': '0f4405e6-08cb-11f0-8218-c5d5f77a6a15'
    'x-ms-version': 'REDACTED'
    'x-ms-error-code': 'BlobNotFound'


┌──────────────┬────────────┐
│ time_control │ time_class │
│   varchar    │  varchar   │
├──────────────┼────────────┤
│ 10           │ rapid      │
│ 15|10        │ rapid      │
└──────────────┴────────────┘

In [24]:
# Sanity check: fetchdf() materialises the relation as a pandas DataFrame.
print(type(dim_time_control.fetchdf()))

<class 'pandas.core.frame.DataFrame'>


In [4]:
# Preview the gold-layer fact table; the relation is only displayed, not stored.
conn.sql(f"""SELECT *
         FROM 'az://rbchesssa.blob.core.windows.net/chess-etl-files/gold/fact-games.parquet'
         
         ;""")

┌──────────────────────────────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬────────────────────┬──────────────┬──────────┬─────────────┬───────────────────┬───────────┬─────────────────┬────────────┬─────────────────┬────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────