In [4]:
#versione read_sas7bdat

from concurrent.futures import ThreadPoolExecutor, as_completed
import pyreadstat
from datetime import datetime
import sys
import polars
import psutil
import shutil
import time
import os
import gc


sas_file = "45cols.sas7bdat"
run_ref = "_202510_"
extension = ".parquet"




file_prefix = sas_file.replace(".sas7bdat","")
table_folder = file_prefix
output_folder = os.path.join("output_folder_snappy", table_folder)


chunksize = 1000000
offset = 0
chunk_idx = 0
max_workers = 1

#Common Functions
def snapshot(label):
   current, peak = tracemalloc.get_traced_memory()
   print(f"[{label}] Current: {current / 1024**2:.2f} MB | Peak: {peak / 1024**2:.2f} MB | {timenow()}")

def mem():
    mem = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    return mem

def timenow():
    timenow = datetime.now().strftime('%H:%M:%S')
    return timenow



#Operator
def process_chunk(output_folder, run_ref, parquet_file, extension, chunk_idx, chunksize):

    filepath = os.path.join(output_folder, parquet_file + run_ref + str(chunk_idx) + extension)

    chunk_start = time.time()
    #print(f"Chunk {chunk_idx}: Processing üîÑ | {timenow()}")

    #snapshot("Start_Chunk")
    offset = chunk_idx * chunksize
    
    """"
     #Ha senso solo se ti carichi tanti GB in una volta
    chunk, meta = pyreadstat.read_file_multiprocessing(
        pyreadstat.read_sas7bdat,
        sas_file,
        num_processes = 4,
        row_offset=offset,
        row_limit=chunksize,
        output_format = "polars",
        encoding = "windows-1252",
        disable_datetime_conversion = True
    )
    
    """
    try:
        chunk, meta = pyreadstat.read_sas7bdat(
            sas_file,
            row_offset=offset,
            row_limit=chunksize,
            output_format = "polars",
            disable_datetime_conversion = "True",
            encoding = "windows-1252",
            usecols = [
                "libname", "memname", "memtype", "dbms_memtype",
                "memlabel",
                "typemem", "crdate", "modate", "nobs", "obslen",
                "nvar", "protect", "compress", "encrypt", "npage",
                "filesize", "pcompress", "reuse", "bufsize", "delobs",
                "nlobs", "maxvar", "maxlabel", "maxgen", "gen", "attr",
                "indxtype", "datarep", "sortname", "sorttype", "sortchar",
                "datarepname", "encoding", "audit", "audit_before", "audit_admin",
                "audit_error", "audit_data", "num_character", "num_numeric"
                ]
            )
    except Exception as e:
        print(f"{chunk_idx}: Riga{chunk_idx + 1}: Errore critico in lettura: {e}")
    
    #snapshot("Polars_Done")
    #print(f"Dopo lettura - RAM usata: {mem():.2f} MB | {timenow()}")

    #if chunk.shape[0] == 0:
    # print(f"{chunk_idx}: Vuoto, Termino Processo | StartingRow: {offset}")
    # sys.exit(1)

    #table = pa.table(chunk)
    #del chunk
    #snapshot("Arrow_Done")

    #print(f"Dopo arrow - RAM usata: {mem():.2f} MB | {timenow()}")

    chunk.write_parquet(filepath)
    #pq.write_table(table, filepath, compression = 'snappy')
    #snapshot("Parquet_Done")

    del chunk
    #del table
    
    #print(f"Parquet scritto - RAM usata: {mem():.2f} MB | {timenow()}")
    chunk_end = time.time()
    chunk_elab_time = round((chunk_end - chunk_start),0)

    print(f"Chunk {chunk_idx}: Completed ‚úÖ | ElabTime: {chunk_elab_time} s | {timenow()}")
    gc.collect()



#Main with Iteration
def main():

    #tracemalloc.start()
    if os.path.exists(output_folder):
            shutil.rmtree(output_folder)
    os.mkdir(output_folder)

    #Controllo_Iniziale_File

    if os.path.exists(sas_file):
        _,meta = pyreadstat.read_sas7bdat(sas_file, metadataonly = True)
    else:
        print(f"File_Check: File {sas_file} non trovato")
        sys.exit(1)

    rows = meta.number_rows
    num_chunks = (meta.number_rows + chunksize - 1)//chunksize

    print(f"File Trovato: #Rows: {meta.number_rows}, encoding:{meta.file_encoding}, chunksize: {chunksize}, #Chunks:{num_chunks}") 

    with ThreadPoolExecutor(max_workers = max_workers) as executor:
            futures = {
                executor.submit(
                    process_chunk,
                    output_folder,
                    run_ref, 
                    file_prefix,
                    extension, 
                    chunk_idx,
                    chunksize
                    ): chunk_idx for chunk_idx in range(num_chunks)
            }

    for future in as_completed(futures):
        idx = futures[future]
        try:
            future.result()
        except Exception as e:
            print(f"[Chunk {idx}] Producer error: {e}")

main()

#duckdb.sql("""
#    COPY (SELECT * FROM 'output_folder/*.parquet') 
#    TO 'merged.parquet' 
#    (FORMAT PARQUET, CODEC 'snappy');
#""")

File Trovato: #Rows: 8979525, encoding:UTF-8, chunksize: 1000000, #Chunks:9
Chunk 0: Completed ‚úÖ | ElabTime: 25.0 s | 00:47:46
Chunk 1: Completed ‚úÖ | ElabTime: 27.0 s | 00:48:13
Chunk 2: Completed ‚úÖ | ElabTime: 36.0 s | 00:48:49
Chunk 3: Completed ‚úÖ | ElabTime: 44.0 s | 00:49:33
Chunk 4: Completed ‚úÖ | ElabTime: 55.0 s | 00:50:28
Chunk 5: Completed ‚úÖ | ElabTime: 44.0 s | 00:51:12
Chunk 6: Completed ‚úÖ | ElabTime: 40.0 s | 00:51:52
Chunk 7: Completed ‚úÖ | ElabTime: 36.0 s | 00:52:28
Chunk 8: Completed ‚úÖ | ElabTime: 36.0 s | 00:53:05


KeyboardInterrupt: 

In [33]:
import time
import pyarrow.dataset as ds

input_folder = "output_folder_zstd"

start_time = time.time()

# Leggi tutti i file Parquet nella cartella come dataset
dataset = ds.dataset(input_folder, format="parquet")

# Carica l'intero dataset in memoria (solo per test)
table = dataset.to_table()

end_time = time.time()
elapsed = end_time - start_time



# Stampa informazioni
print("üìÑ Schema del dataset:")
print(f"\nüìä Numero di righe: {table.num_rows}")
print(f"üì¶ Numero di colonne: {table.num_columns}")
print("\n‚è±Ô∏è Tempo di lettura: {:.3f} secondi".format(elapsed))

del table


üìÑ Schema del dataset:

üìä Numero di righe: 8560000
üì¶ Numero di colonne: 15

‚è±Ô∏è Tempo di lettura: 0.861 secondi
