In [1]:
from dotenv import load_dotenv
import os
from typing import List, Dict, Any, Optional, Union
from pathlib import Path

# python-dotenv: populate os.environ from a .env file. This has to run before
# HF_TOKEN is looked up in the environment further down in this cell.
load_dotenv()


# Import from our Classes module
from Classes.model_classes import SQLLineageExtractor, SQLLineageResult, create_sql_lineage_extractor
from Classes.regexp_extractor import RegexSQLExtractor
from Classes.validation_classes import SQLLineageValidator


# --- Extractor configuration -------------------------------------------------
# MODEL and PROVIDER are handed to create_sql_lineage_extractor() as `model=`
# and `provider=` when the extractor is built.
MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
PROVIDER = "scaleway"

# Token taken from the HF_TOKEN environment variable; None when it is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Prompt text passed to the extractor as `human_prompt_template`.
# Spelled out line by line so the leading space and the absence of a trailing
# newline are visible; the resulting string is unchanged.
PROMPT = (
    " Please extract source-to-target lineage from the SQL query with the following requirements:\n"
    "\n"
    "### SQL Lineage Extraction Task\n"
    "Extract source-to-target lineage from the SQL statement below. Return ONLY valid JSON containing:\n"
    '- "target": The main object being created or modified (fully qualified name)\n'
    '- "sources": List of DISTINCT base tables/views (fully qualified names)'
)



In [2]:
# Sample files are resolved against the directory the kernel was started in.
current_dir = Path.cwd()
file_path = current_dir.joinpath("data", "SQL.txt")
file_path_valid = current_dir.joinpath("data", "SQL_valid.txt")

# SQL       -> contents of data/SQL.txt (example statement)
# SQL_valid -> contents of data/SQL_valid.txt (example that is valid for the LLM)
SQL = file_path.read_text(encoding="utf-8")
SQL_valid = file_path_valid.read_text(encoding="utf-8")

In [3]:
# Validator whose run_comprehensive_validation() drives the checks in the
# cells below.
validation = SQLLineageValidator()

# Regex-based extractor; its extract() output is supplied as `expected_result`
# when validating.
expected_result_extractor = RegexSQLExtractor()

# Settings for the model-backed extractor, gathered in one place and then
# forwarded to the factory function as keyword arguments.
extractor_settings = {
    "model": MODEL,
    "provider": PROVIDER,
    "hf_token": HF_TOKEN,
    "max_new_tokens": 2048,
    "do_sample": False,
    "max_retries": 3,
    "use_pydantic_parser": True,
    "human_prompt_template": PROMPT,
}
extractor = create_sql_lineage_extractor(**extractor_settings)

In [4]:
# Expected result for SQL comes from the regex extractor; the validation call
# stays the last expression so the cell displays what it returns.
expected_sql_result = expected_result_extractor.extract(SQL)
validation.run_comprehensive_validation(
    extractor,
    SQL,
    expected_result=expected_sql_result,
)

{'status': 'FAILED',
 'validation_type': 'uniqueness',
 'message': "Duplicate sources found: ['s_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_core_uvdo', 's_grnplm_as_t_didsd_010_vd_dwh.v_coa']",
 'result': {'target': 's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred',
  'sources': ['s_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_tmp',
   's_grnplm_as_t_didsd_010_vd_dwh.v_$eks_agrmnt_to_coa_3',
   's_grnplm_as_t_didsd_010_vd_dwh.v_coa',
   's_grnplm_as_t_didsd_010_vd_dwh.v_gl_main_acct',
   's_grnplm_vd_t_bvd_db_dmslcl.a_agr_cred_coa_period',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_optn',
   's_grnplm_as_t_didsd_010_vd_dwh.v_loan_agrmnt_rate',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_cust',
   's_grnplm_as_t_didsd_029_vd_dwh.v_agr_cred',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_core_uvdo',
   's_grnplm_as_t_didsd_029_vd_dwh.v_crncy',
   's_grnplm_as_t_didsd_010_vd_dwh.v_crncy',
   's_grnplm_as_t_didsd_029_vd_dwh.v_agr_cred_metric_hist$$$',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_core_uvdo',
   's_grnp

In [5]:
# Same validation for SQL_valid: the regex extractor supplies the expected
# result, and the call is left as the last expression so the cell displays it.
expected_sql_valid_result = expected_result_extractor.extract(SQL_valid)
validation.run_comprehensive_validation(
    extractor,
    SQL_valid,
    expected_result=expected_sql_valid_result,
)

{'status': 'SUCCESS',
 'validation_type': 'comprehensive',
 'message': 'All validations passed',
 'result': {'target': 's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_dmcl_attr',
  'sources': ['s_grnplm_vd_t_bvd_db_dmcl.a_agr_cred_qlty_period',
   's_grnplm_vd_t_bvd_db_dmcl.d_agr_cred',
   's_grnplm_vd_t_bvd_db_dmcl.a_agr_cred_clsfctn_fin_pos_period',
   's_grnplm_vd_t_bvd_db_dmcl.a_agr_cred_f303_inf_type_period',
   's_grnplm_vd_t_bvd_db_dmcl.a_agr_cred_prvsn_period',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_tmp',
   's_grnplm_vd_t_bvd_db_dmslcl.d_agr_cred_prnt_tmp']},
 'metrics': {'precision': 1.0, 'recall': 1.0, 'f1_score': 1.0}}