In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import sys
import dask.dataframe as dd

root_path = pathlib.Path().resolve().parent
if root_path not in sys.path:
    sys.path.append(str(root_path))

In [None]:
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import multiprocessing
import time
import shutil
from tqdm import tqdm
from data_generators.invoices_data_generation import generate_invoice_dataframe
save_path = pathlib.Path(root_path, 'data_generators', 'raw.parquet')

save_path.mkdir(parents=True, exist_ok=True)
shutil.rmtree(save_path, ignore_errors=True)

total_iterations = 5000
with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
    futures = []
    for i in tqdm(range(total_iterations)):
        X = generate_invoice_dataframe(10000)
        path = pathlib.Path(save_path, f"part_{i}.parquet")
        futures.append(executor.submit(X.to_parquet, path, engine='pyarrow', compression='snappy'))

# Wait for all futures to complete
done = False
pbar = tqdm(total=total_iterations)
while True:
    # find out total futures done
    total_done = sum(1 if future.done() else 0 for future in futures)
    if total_done == total_iterations:
        break
    pbar.update(total_done - pbar.n)
    time.sleep(1)

In [3]:
import dask.dataframe as dd
import pandas as pd
from src.tools.dask_tools import validate_column, compute, repartition_data
from uuid import uuid4
import shutil
from tqdm import tqdm

save_path = pathlib.Path(root_path, 'data_generators', 'processed.parquet')
final_path = pathlib.Path(root_path, 'data_generators', 'final.parquet')

final_path.mkdir(parents=True, exist_ok=True)

X = dd.read_parquet(save_path, engine='pyarrow')

index_col = 'SPT_RowID'
print('Index Column', index_col)
print('Total data length', len(X))

# Drop the existing index column
if index_col in X.columns:
    X = X.drop(columns=[index_col], axis=1)

start_index = 0
end_index = 0
total_partitions = X.npartitions
for i in tqdm(range(total_partitions)):
    part = X.get_partition(i).compute()
    start_index = end_index + 1
    end_index = start_index + len(part) - 1

    part[index_col] = range(start_index, end_index + 1)

    # print(i, len(part), part[index_col].min(), part[index_col].max())
    uid = uuid4()

    # save to final_path
    part.to_parquet(f"{final_path}/{uid}.parquet")

Index Column SPT_RowID
Total data length 18340000


100%|██████████| 7336/7336 [36:15<00:00,  3.37it/s]  


In [4]:
import humanize

X = dd.read_parquet(final_path, engine='pyarrow')
display(X.head())
print(X[index_col].min().compute(), X[index_col].max().compute())
print('Total records', len(X))
print('Total Size', humanize.naturalsize(X.memory_usage().sum().compute()))

Unnamed: 0,VendorNumber,VendorName,VendorCountry,VendorCountryName,VendorCategory,OneTimeVendorFlag,VendorCurrency,CPIScore,CompanyCode,CompanyCodeName,...,P2PFMIN271_Tran_Score,P2PFMIN268_Tran_Score,P2PHRIN302_Tran_Score,P2PHRIN303_Tran_Score,P2PHRIN304_Tran_Score,P2PHRIN305_Tran_Score,P2PICIN611_Tran_Score,P2PICIN604_Tran_Score,P2PSTIN990_Tran_Score,SPT_RowID
0,59638,Tommy Walter,Cayman Islands,Cayman Islands,Technology Vendor,True,CYM,21,VG,VG,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872501
1,49672,Carla Gray,Senegal,Senegal,Raw Material Supplier,True,SEN,93,OH,OH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872502
2,34760,Ethan Adams,Armenia,Armenia,Service Provider,True,ARM,41,OI,OI,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872503
3,49672,Carla Gray,Senegal,Senegal,Raw Material Supplier,True,SEN,75,OH,OH,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872504
4,51173,Patty Perez,Central African Republic,Central African Republic,Logistics Provider,True,CAF,82,RP,RP,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2872505


1 18340000
Total records 18340000
Total Size 25.9 GB


In [None]:
from src.automl.training_pipeline import TrainingPipeline
import numpy as np
import dask.dataframe as dd
from src.utils.submodule import Submodule
from src.tools.dask_tools import validate_column, compute, repartition_data

sub = Submodule('P2P', 'Invoices', 'bfee379f-02d2-478b-a7f0-bd84f08a6b0d')

final_path = pathlib.Path(root_path, 'data_generators', 'final.parquet')
X = dd.read_parquet(final_path, engine='pyarrow')
# X = repartition_data(X)

index_col = 'SPT_RowID'
X = X.set_index(index_col)

# create y with random 0 and 1
X['target'] = dd.from_array(np.random.randint(0, 2, size=len(X)), chunksize=10000)
y = compute(X['target'])
X = X.drop(columns=['target'], axis=1) 

TrainingPipeline(sub).fit(X, y)


[37m2025-09-02 12:31:53 - INFO - Force computing dask[0m
[37m2025-09-02 12:31:53 - INFO - Force computing dask[0m
[37m2025-09-02 12:32:51 - INFO - Inferring data types[0m
[37m2025-09-02 12:32:51 - INFO - No columns need data type update.[0m
[37m2025-09-02 12:35:34 - INFO - Force compute complete[0m
[37m2025-09-02 12:35:34 - INFO - Table Patterns exists. - Server=localhost, Database=GDB_DIT_Testing, Username=dofus[0m
[37m2025-09-02 12:35:34 - INFO - Downloading data from table - Server=localhost, Database=GDB_DIT_Testing, Username=dofus - Table Name: Model.Patterns - Chunksize: 150000[0m
[37m2025-09-02 12:35:35 - INFO - Downloading chunk - Chunk: 1[0m
[37m2025-09-02 12:35:35 - INFO - Data downloaded - Server=localhost, Database=GDB_DIT_Testing, Username=dofus - Rows: 220 - Columns: 102 - Size: 980.2 kB - Path: C:\Users\RoopakPrajapat\Documents\KonaAI_ML\project_data\konaai_temp\7596656b2aaf4b48856c04aad2bc17f6[0m
[37m2025-09-02 12:35:35 - INFO - Starting data repartit

Validating applicable anomaly features: 100%|██████████| 4/4 [00:00<?, ?it/s]

[37m2025-09-02 12:35:36 - INFO - Transforming custom features transformer[0m



Processing anomaly feature builders:   0%|          | 0/1 [00:00<?, ?it/s]