In [1]:
import duckdb
import os
import re
from datetime import datetime
import pandas as pd
from customized_profiling import customized_profiling

In [2]:
# Open the formatted-zone database and show which tables the main schema holds.
con = duckdb.connect('../formatted_zone/formatted.db')
list_tables_sql = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'main';"
# Keep the relation as the cell's last expression so the notebook renders it.
con.sql(list_tables_sql)

┌──────────────────────┐
│      table_name      │
│       varchar        │
├──────────────────────┤
│ idealista_01_06_2020 │
│ idealista_01_08_2020 │
│ idealista_01_11_2020 │
│ idealista_02_01_2020 │
│ idealista_02_03_2020 │
│ idealista_02_06_2020 │
│ idealista_02_08_2020 │
│ idealista_03_01_2021 │
│ idealista_03_03_2020 │
│ idealista_03_06_2020 │
│          ·           │
│          ·           │
│          ·           │
│ idealista_29_07_2020 │
│ idealista_29_08_2020 │
│ idealista_30_01_2020 │
│ idealista_30_08_2020 │
│ idealista_30_10_2020 │
│ idealista_31_05_2020 │
│ idealista_31_10_2020 │
│ idealista_31_12_2020 │
│ income_2020          │
│ income_2021          │
├──────────────────────┤
│ 136 rows (20 shown)  │
└──────────────────────┘

In [3]:
# Close the exploratory connection opened above; data_profiling() below opens
# (and closes) its own connection to the same database file.
con.close()

In [4]:
def _latest_dated_table(table_names):
    """Return the name carrying the most recent ``_DD_MM_YYYY`` suffix, or None.

    Names without that suffix are ignored, and so are names whose suffix has
    the right shape but is not a real calendar date (e.g. ``_31_02_2020``).
    When several names share the most recent date, the first one wins.
    """
    date_pattern = re.compile(r'_(\d{2})_(\d{2})_(\d{4})$')
    latest_table = None
    latest_date = None

    for table_name in table_names:
        match = date_pattern.search(table_name)
        if not match:
            continue
        day, month, year = match.groups()
        try:
            table_date = datetime.strptime(f"{day}/{month}/{year}", "%d/%m/%Y")
        except ValueError:
            # Right digit layout but an impossible date: skip this table
            # rather than aborting the whole profiling run.
            continue
        if latest_date is None or table_date > latest_date:
            latest_date = table_date
            latest_table = table_name

    return latest_table


def data_profiling(db_path, output_dir):
    """Profile the most recent date-suffixed table of a DuckDB database.

    The tables of the ``main`` schema are listed, the one whose name ends in
    the latest ``_DD_MM_YYYY`` date is loaded into a DataFrame, a fixed set of
    unwanted columns is dropped, dict cells are stringified, and the result is
    handed to ``customized_profiling(df, table_name, output_dir)``.

    Parameters
    ----------
    db_path : str
        Path of the DuckDB database file to open.
    output_dir : str
        Directory passed on to ``customized_profiling``; created if missing.

    Returns
    -------
    None
        Progress and errors are reported with ``print``. Errors raised while
        reading or profiling the selected table are caught and printed, not
        re-raised.
    """
    # exist_ok avoids the race between an existence check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    # Connect to the DuckDB database
    conn = duckdb.connect(db_path)
    try:
        # Get list of all tables in the database
        tables = conn.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'main';").fetchall()

        # Extract the most recent table based on the date in the table name
        latest_table = _latest_dated_table(row[0] for row in tables)

        if not latest_table:
            print("No tables with a valid date format found in the database.")
            return

        print(f"Reading and profiling table: {latest_table}")
        try:
            # Quote the identifier so unusual table names cannot break (or
            # alter) the statement; embedded double quotes are doubled.
            quoted_table = '"' + latest_table.replace('"', '""') + '"'
            df = conn.execute(f"SELECT * FROM {quoted_table}").fetchdf()

            # Remove unwanted columns if they exist
            keyword_list = [
                'thumbnail','externalReference','numPhotos','showAddress',
                'url','distance','hasVideo','detailedType','suggestedTexts',
                'hasPlan','has3DTour','has360','hasStaging','topNewDevelopment',
                'parkingSpace','json','index','priceInfo','description',
                'topPlus','highlight','newDevelopmentFinished'
            ]
            df = df.drop(columns=[col for col in keyword_list if col in df.columns], errors='ignore')

            # Convert unhashable dict cells to their string form
            df = df.apply(lambda col: col.map(lambda x: str(x) if isinstance(x, dict) else x))

            # Perform customized profiling
            customized_profiling(df, latest_table, output_dir)
        except Exception as e:
            # Deliberate best-effort: report the failure and fall through to
            # the cleanup below instead of crashing the notebook cell.
            print(f"Error while profiling table '{latest_table}': {e}")
    finally:
        # Always release the connection, including when listing tables fails.
        conn.close()

In [5]:
# Profile the newest dated table; the report is written next to the database.
data_profiling(
    db_path='../formatted_zone/formatted.db',
    output_dir='../formatted_zone/',
)

Reading and profiling table: idealista_28_10_2024
