In [1]:
import pandas as pd
from pathlib import Path
from dq_checks.check_completeness_by_records import check_completeness_by_records
from dq_checks.check_state_country_consistency import check_state_country_consistency
from dq_checks.check_cancellation_code_consistency import check_cancellation_code_consistency 
from dq_checks.check_crs_elapsed_time import check_crs_elapsed_time

# Root directory holding one sub-folder of parquet files per source table.
base_path = Path("../raw")
folders = ["airports", "carriers", "flights"]

# Maps each folder name to one DataFrame built from all parquet files in it.
dfs = {}

for folder in folders:
    # Sort the matches: glob returns paths in arbitrary filesystem order, and
    # the concatenated row order (hence the reset index) depends on it.
    file_paths = sorted(base_path.joinpath(folder).glob("*.parquet"))
    if not file_paths:
        # pd.concat on an empty list raises a generic "No objects to
        # concatenate"; fail early and name the folder that has no data.
        raise FileNotFoundError(
            f"No parquet files found in {base_path.joinpath(folder)}"
        )
    df_list = [pd.read_parquet(path) for path in file_paths]
    combined_df = pd.concat(df_list, ignore_index=True)
    dfs[folder] = combined_df


In [2]:
# Registry of data-quality checks, keyed by the table's key in `dfs`.
# Each entry is (display name, [(check function, check category, columns)]).
# Every check function is called with the table's DataFrame and must return a
# (status, bad_data) pair where bad_data supports len() and
# .to_dict(orient='records').
# To add a check, append a tuple here instead of copying a result block.
dq_checks = {
    "carriers": ("Carriers", [
        (check_completeness_by_records, "Completeness", "All"),
    ]),
    "airports": ("Airports", [
        (check_state_country_consistency, "Consistency", "State, Country"),
    ]),
    "flights": ("Flights", [
        (check_cancellation_code_consistency, "Consistency",
         "Cancelled, CancellationCode"),
        (check_crs_elapsed_time, "Consistency",
         "CRSElapsedTime, CRSArrTime, CRSDepTime"),
    ]),
}

# Collect results
dq_results_list = []

# Run checks and store results
for name, df in dfs.items():
    total_records = len(df)

    # Tables without registered checks contribute no rows.
    table_label, table_checks = dq_checks.get(name, (name, []))

    for check_fn, check_type, checked_columns in table_checks:
        status, bad_data = check_fn(df)
        dq_results_list.append({
            "Table": table_label,
            "DQ Check": check_type,
            "Column": checked_columns,
            "Status": status,
            "Count Of Records": total_records,
            "Failed Records": len(bad_data),
            "Bad Data": bad_data.to_dict(orient='records')
        })

# Convert to a results DataFrame (one row per executed check)
dq_results_df = pd.DataFrame(dq_results_list)

# Display or export
dq_results_df.head()


Unnamed: 0,Table,DQ Check,Column,Status,Count Of Records,Failed Records,Bad Data
0,Airports,Consistency,"State, Country",Failed,3388,35,"[{'state': '', 'country': ''}, {'state': '', '..."
1,Carriers,Completeness,All,Passed,1505,0,[]
2,Flights,Consistency,"Cancelled, CancellationCode",Failed,537,4,"[{'Cancelled': '0', 'CancellationCode': 'A'}, ..."
3,Flights,Consistency,"CRSElapsedTime, CRSArrTime, CRSDepTime",Failed,537,242,"[{'CRSDepTime': '915', 'CRSArrTime': '1130', '..."
