In [695]:
import pandas as pd
import numpy as np
import uuid
from scipy.stats import truncnorm 

# Requisito #1

## Random value generators

In [932]:
def generate_category_random_values(arr, size):
    """Sample `size` entries from `arr` and return them as a pandas Series.

    Sampling goes through `np.random.choice` with its default settings, so it
    uses NumPy's global random state.
    """
    sampled = np.random.choice(arr, size=size)
    return pd.Series(sampled)

def generate_unique_random_values(size):
    """Return a list with `size` random identifiers.

    Each identifier is the first 16 hexadecimal characters of a fresh
    `uuid.uuid4()`. Collisions are not checked for.
    """
    identifiers = []
    for _ in range(size):
        identifiers.append(uuid.uuid4().hex[:16])
    return identifiers

def generate_date_random_values(min, max, size):
    """Pick `size` random days between `min` and `max` (both inclusive).

    The candidate days come from a daily `pd.date_range`; the draw is done by
    `np.random.choice`, whose result is returned as-is.
    """
    candidate_days = pd.date_range(start=min, end=max, freq='D')
    picked = np.random.choice(candidate_days, size)
    return picked

def generate_foreign_random_values(serie, size):
    """Draw `size` values from the distinct entries of `serie`.

    Duplicates are dropped first, so every distinct value is a single
    candidate for `np.random.choice` regardless of how often it appears.
    """
    candidates = serie.drop_duplicates()
    return np.random.choice(candidates, size)

def generate_truncated_normal_data(min_val, max_val, std, mean, size):
    """Sample `size` values from a normal(mean, std) truncated to [min_val, max_val].

    `truncnorm` expects its clipping bounds in standard-deviation units
    relative to the mean, hence the rescaling. Results are rounded to three
    decimals.
    """
    lower = (min_val - mean) / std
    upper = (max_val - mean) / std
    distribution = truncnorm(lower, upper, loc=mean, scale=std)
    return np.round(distribution.rvs(size), 3)

def generate_numeric_random_values(min, max, std, mean, size):
    """Generate `size` random numbers between `min` and `max`.

    With both `std` and `mean` given, values come from
    `generate_truncated_normal_data`; if either is None they are drawn
    uniformly and rounded to three decimals.
    """
    if std is None or mean is None:
        uniform_draw = np.random.uniform(min, max, size)
        return np.round(uniform_draw, 3)

    return generate_truncated_normal_data(min, max, std, mean, size)

## Helpers

In [946]:
def calc_greater(cols_arr):
    """Return the largest number of values any column definition can supply.

    Only 'category' columns (length of their value list) and 'date' columns
    (number of days in their min..max range) contribute; every other column
    type is ignored. Returns 0 when no column contributes.
    """
    sizes = [0]  # zero baseline: assume nothing is available at the start

    for col in cols_arr:
        kind = col['type']

        if kind == 'date':
            bounds = col['values']
            sizes.append(pd.date_range(start=bounds['min'], end=bounds['max']).size)
        elif kind == 'category':
            sizes.append(len(col['values']))

    return max(sizes)

def validate_relationships(all_dfs, reference_df):
    """Check that a '<dataframe>.<column>' reference points at something real.

    Args:
        all_dfs: mapping of dataframe name to dataframe.
        reference_df: reference string with exactly one dot separating the
            dataframe name from the column name.

    Raises:
        ValueError: if the dataframe is not in `all_dfs`, or the column is not
            among that dataframe's columns.
    """
    df_name, df_column = reference_df.split('.')

    # The dataframe itself has to exist before its columns can be inspected
    if df_name not in all_dfs:
        raise ValueError(f'El dataframe {df_name} no existe :(')

    columns = list(all_dfs[df_name].columns)
    if df_column in columns:
        return

    raise ValueError(f'La columna {df_column} no existe :( en el dataframe {df_name}. Las columnas validas son: {columns}')

def extract_dependencies(obj):
    """Return the names of the dataframes that `obj` references.

    Every column of type 'foreign' carries a '<dataframe>.<column>' string in
    its 'values' entry; the dataframe part of each one is collected, in column
    order and without de-duplication.
    """
    dependencies = []

    for col in obj['columns']:
        if col['type'] == 'foreign':
            df_name, _ = col['values'].split('.')
            dependencies.append(df_name)
    return dependencies

def reorder_by_dependencies(arr):
    """Return the configs of `arr` ordered so dependencies come first.

    This is a depth-first topological sort: a config is emitted only after
    every config it references through a 'foreign' column has been emitted.
    Configs with no ordering constraint between them keep their relative
    input order. References to dataframes that are not part of `arr` are
    ignored here, since they cannot be ordered. `arr` itself is not modified.

    Raises:
        ValueError: if the configs reference each other in a cycle (including
            a config that references itself), since no valid build order
            exists.
    """
    # First position at which each dataframe name is defined
    position_by_name = {}
    for position, conf in enumerate(arr):
        position_by_name.setdefault(conf['ds'], position)

    ordered = []
    emitted = set()  # positions already placed in `ordered`
    path = []        # positions currently being resolved (cycle detection)

    def visit(position):
        if position in emitted:
            return

        conf = arr[position]
        if position in path:
            names = [arr[p]['ds'] for p in path[path.index(position):]] + [conf['ds']]
            # Message kept in Spanish like the other user-facing errors in this file
            raise ValueError(f"Dependencia circular entre dataframes: {' -> '.join(names)}")

        path.append(position)
        for dep in extract_dependencies(conf):
            if dep in position_by_name:
                visit(position_by_name[dep])
        path.pop()

        emitted.add(position)
        ordered.append(conf)

    for position in range(len(arr)):
        visit(position)

    return ordered

# Registry of every dataframe created so far, keyed by its 'ds' name
all_dfs = {}
def generate_df(sett):
    """Build one dataframe from a dataset configuration and register it in `all_dfs`.

    Args:
        sett: dict with
            'ds'          -- name of the dataset,
            'columns'     -- list of column definitions, each with 'name',
                             'type' and, depending on the type, 'values',
            'random'      -- when False the row count comes from
                             `calc_greater`; otherwise from 'random_rows',
            'random_rows' -- row count used when 'random' is not False.

    Column types handled: 'category', 'date', 'unique', 'foreign' and
    'numeric'. Columns of any other type are skipped.

    Returns:
        The generated DataFrame. Its dataset name is stored on the `name`
        attribute and the frame is stored in `all_dfs[sett['ds']]`.

    Raises:
        ValueError: from `validate_relationships` when a 'foreign' column
            references a dataframe or column that does not exist yet.
    """
    result_df = pd.DataFrame()

    # Row count: the largest category/date column when not random, else the requested size
    max_values = calc_greater(sett['columns']) if sett['random'] == False else sett['random_rows']

    for col in sett['columns']:
        col_name = col['name']
        col_type = col['type']

        if col_type == 'category':
            values = col['values']

            # Use the listed values verbatim only when they fill the frame
            # exactly; any other size (shorter OR longer) is resampled so the
            # column always has max_values rows.
            if len(values) == max_values:
                result_df[col_name] = values
            else:
                result_df[col_name] = generate_category_random_values(values, max_values)

        elif col_type == 'date':
            start = col['values']['min']
            end = col['values']['max']
            date_range = pd.date_range(start=start, end=end)

            # Same rule as for categories: exact fit keeps the full range in order
            if date_range.size == max_values:
                result_df[col_name] = pd.Series(date_range)
            else:
                result_df[col_name] = generate_date_random_values(start, end, max_values)

        elif col_type == 'unique':
            result_df[col_name] = generate_unique_random_values(max_values)

        elif col_type == 'foreign':
            col_reference_name, col_reference_area = col['values'].split('.')

            # Raises when the referenced dataframe/column is missing; otherwise execution continues
            validate_relationships(all_dfs, col['values'])

            result_df[col_name] = generate_foreign_random_values(all_dfs[col_reference_name][col_reference_area], max_values)

        elif col_type == 'numeric':
            bounds = col['values']
            # Missing 'std'/'mean' become None, which selects the uniform generator
            result_df[col_name] = generate_numeric_random_values(
                bounds['min'], bounds['max'], bounds.get('std'), bounds.get('mean'), max_values
            )

    dataset_name = sett['ds']
    # Plain `result_df.name = ...` would overwrite a column called "name"
    # (pandas routes attribute assignment to an existing column), so the
    # attribute is set directly on the instance instead.
    object.__setattr__(result_df, 'name', dataset_name)

    all_dfs[dataset_name] = result_df
    return result_df

## Main

In [1252]:
def build_dataframes(conf_list):
    """Generate one dataframe per configuration and return them in `conf_list` order.

    The configurations are first passed (as a shallow copy) through
    `reorder_by_dependencies` and generated in that order; the resulting
    frames are then matched back to `conf_list` by comparing each config's
    'ds' with the frame's `name`.
    """
    build_order = reorder_by_dependencies(conf_list.copy())
    generated = [generate_df(conf) for conf in build_order]

    # Hand the frames back in the order the caller listed the configs
    in_caller_order = []
    for conf in conf_list:
        for frame in generated:
            if conf['ds'] == frame.name:
                in_caller_order.append(frame)
    return in_caller_order

## Settings

In [1306]:
# "dataset": a fixed list of areas plus a generated id (random=False)
d1 = {
    "ds": "dataset",
    "columns": [
        {"name": "area", "type": "category", "values": ["TI", "FIN", "HR"]},
        {"name": "id", "type": "unique"},
    ],
    "random": False,
}

# "dataset2": sub-areas, each pointing at an area of "dataset" (random=False)
d2 = {
    "ds": "dataset2",
    "columns": [
        {"name": "id", "type": "unique"},
        {"name": "area", "type": "foreign", "values": "dataset.area"},
        {"name": "subarea", "type": "category", "values": ["SA1", "SA2", "SA3", "SA4"]},
    ],
    "random": False,
}

# "dataset3": 1000 random rows referencing "dataset2" ids, with two numeric
# columns -- "income" has only min/max, "goal" also gives std and mean
d3 = {
    "ds": "dataset3",
    "columns": [
        {"name": "id", "type": "unique"},
        {"name": "subarea", "type": "foreign", "values": "dataset2.id"},
        {"name": "income", "type": "numeric", "values": {"min": 0, "max": 10000}},
        {
            "name": "goal",
            "type": "numeric",
            "values": {"min": 0, "max": 50000, "std": 20000, "mean": 25000},
        },
    ],
    "random": True,
    "random_rows": 1000,
}

# "dataset4": areas and a date range, referencing "dataset3" ids (random=False)
d4 = {
    "ds": "dataset4",
    "columns": [
        {"name": "area", "type": "category", "values": ["TIC", "FIN", "HR", "MKT"]},
        {"name": "Fecha", "type": "date", "values": {"min": "2024-01-01", "max": "2024-02-28"}},
        {"name": "subarea", "type": "foreign", "values": "dataset3.id"},
        {"name": "id", "type": "unique"},
    ],
    "random": False,
}


In [1308]:
# The configs are listed out of dependency order (d3 references dataset2,
# d2 references dataset, d4 references dataset3); build_dataframes returns
# the frames in this same order.
conf_list = [d3, d2, d1, d4]

dataframe_list = build_dataframes(conf_list)

In [1312]:
# Frame generated for conf_list[0], i.e. d3 ("dataset3")
dataframe_list[0]

Unnamed: 0,id,subarea,income,goal
0,0826f36f77404480,a771c3b0faa04bab,7904.892,28584.586
1,131bb188d0ce493f,0643d233967743b1,8157.097,14716.946
2,f41f6b9b0612473d,fce939f946474b87,731.054,32037.965
3,cde7214523054677,85c8d51d75f84ca4,6335.119,1207.205
4,51cbf353c1b74d34,0643d233967743b1,8230.743,15385.754
...,...,...,...,...
995,b1a0d600f3874819,a771c3b0faa04bab,5453.952,26251.460
996,fcab584b86054b18,fce939f946474b87,3613.713,44167.790
997,580c75ccc0e64e19,a771c3b0faa04bab,67.014,30625.019
998,e5e5a7e10468407c,0643d233967743b1,1759.721,49440.695


In [1344]:
# Join every dataset3 row (dataframe_list[0]) with the dataset2 row
# (dataframe_list[1]) whose id it references through "subarea"
result = dataframe_list[0].merge(
    dataframe_list[1],
    how='inner',
    left_on='subarea',
    right_on='id',
)
result

Unnamed: 0,id_x,subarea_x,income,goal,id_y,area,subarea_y
0,0826f36f77404480,a771c3b0faa04bab,7904.892,28584.586,a771c3b0faa04bab,FIN,SA2
1,39bc9e2c69d345be,a771c3b0faa04bab,7538.679,5413.452,a771c3b0faa04bab,FIN,SA2
2,fff5d15527214d68,a771c3b0faa04bab,7353.661,3401.006,a771c3b0faa04bab,FIN,SA2
3,0858ab8f00df4d81,a771c3b0faa04bab,7077.299,23868.160,a771c3b0faa04bab,FIN,SA2
4,f3ab9224ae0c4d6c,a771c3b0faa04bab,3599.829,40848.211,a771c3b0faa04bab,FIN,SA2
...,...,...,...,...,...,...,...
995,d54b3ea108684f5e,85c8d51d75f84ca4,5742.154,27676.142,85c8d51d75f84ca4,FIN,SA1
996,3449aaeceb474dc3,85c8d51d75f84ca4,4775.634,33955.342,85c8d51d75f84ca4,FIN,SA1
997,413a0ea4d5a54036,85c8d51d75f84ca4,4458.889,27322.223,85c8d51d75f84ca4,FIN,SA1
998,4ed5c285c7e24568,85c8d51d75f84ca4,1938.718,11262.442,85c8d51d75f84ca4,FIN,SA1
