# Proyecto 

In [230]:
import pandas as pd
import numpy as np

import random
from scipy.stats import truncnorm
import uuid

### Requesito 1

In [6]:
def generate_unique_id(n):
    return [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(n)]

In [75]:
d1 = {
    "ds": "dataset1",
    "columns": [
        {
            "name": "area",
            "type": "category",
            "values": ["TI", "FIN", "HR"]
        },
        {
            "name": "id",
            "type": "unique",
            "values": generate_unique_id(len((d1['columns'][0]['values'])))
        }
    ],
    "random": False
}

In [77]:
data = {
    d1["columns"][0]["name"]: d1["columns"][0]["values"],
    d1["columns"][1]["name"]: d1["columns"][1]["values"]
}

df = pd.DataFrame(data)
df

Unnamed: 0,area,id
0,TI,b51ae77afa524778
1,FIN,989d76b931d243c5
2,HR,6058aaa61efe4432


## Requisito 2

In [572]:
d2 = {
    "ds": "dataset2",
    "columns": [
        {
            "name": "id",
            "type": "unique",
            "values": generate_unique_id(4)
        },
        {
            "name": "area",
            "type": "foreign",
            "values": "dataset1.area"
        },
        {
            "name": "subarea",
            "type": "category",
            "values": ["SA1", "SA2", "SA3", "SA4"]
        }
    ],
    "random": False
}


In [574]:
# Crear dataset1 basado en el diccionario d1
d1["id"] = d1["columns"][1]["values"]


In [808]:
# Verificacion
referenced_dataset = d1  # Referencia al dataset 1

# Verificar que el dataset de referencia exista
if d2["columns"][1]["values"].split('.')[0] != referenced_dataset['ds']:
    raise ValueError(f"Referencia inválida al dataset. Dataset válido: {referenced_dataset['ds']}")

# Verificar que la columna de referencia exista
if d2["columns"][1]["values"].split('.')[1] not in [col['name'] for col in referenced_dataset['columns']]:
    raise ValueError(f"Columna de referencia inválida. Columnas válidas: {[col['name'] for col in referenced_dataset['columns']]}")

# Generar IDs únicos para d2
def generate_unique_id_d2():
    return [str(uuid.uuid4()).replace('-', '')[:16] for _ in range(4)]  # 4 registros en d2

unique_ids_d2 = generate_unique_id_d2()

# Selección aleatoria de valores para la columna 'area' del dataset 1
random_areas = [random.choice(referenced_dataset["columns"][0]["values"]) for _ in range(4)]

# Crear el dataframe del segundo dataset
data2 = {
    "id": unique_ids_d2,
    "area": random_areas,
    "subarea": d2["columns"][2]["values"]
}

df2 = pd.DataFrame(data2)

# Imprimir el segundo dataset
print("Dataset 2:")
df2 = pd.DataFrame(data2)
df2


Dataset 2:


Unnamed: 0,id,area,subarea
0,babe98de7dc34f62,HR,SA1
1,4780fb57119d402a,FIN,SA2
2,f27ca0cc6e1547e2,FIN,SA3
3,4b4364713c0b495c,TI,SA4


## Requisito 3

In [570]:
d3 = {
    "ds": "dataset3",
    "columns": [
        {
            "name": "id",
            "type": "unique",
            "values": generate_unique_id(1000)
        },
        {
            "name": "subarea",
            "type": "category",
            "values": "dataset2.id"
        },
        {
            "name": "income",
            "type": "numeric",
            "values": {
                "min": 1000,
                "max": 10000
            }
        },
        {
            "name": "goal",
            "type": "numeric",
            "values": {
                "min": 1000,
                "max": 10000,
                "std": 5000,
                "mean": 5000
            }
        }
    ],
    "random": True,
    "random_rows": 1000
}


In [1075]:
# d3 ya tiene los valores definidos
mean = d3["columns"][3]["values"]["mean"]
std = d3["columns"][3]["values"]["std"]
min_val = d3["columns"][3]["values"]["min"]
max_val = d3["columns"][3]["values"]["max"]
size = d3["random_rows"]  # Tamaño de muestra

# Definir la función para generar datos
def generate_truncated_normal_data(mean, std, min_val, max_val, size):
    # Calcular los parámetros de la distribución normal truncada
    a, b = (min_val - mean) / std, (max_val - mean) / std
    data = truncnorm(a, b, loc=mean, scale=std).rvs(size)
    return data

# Generar los datos
data = generate_truncated_normal_data(mean, std, min_val, max_val, size)

referenced_dataset = d2  # Referencia al dataset 1
random_rows = 1000
# Selección aleatoria de valores para la columna 'area' del dataset 1
random_areas = [random.choice(referenced_dataset["columns"][1]["values"]) for _ in range(random_rows)]

def generate_normal_data_in_range(min_val, max_val, size):
    mean = (min_val + max_val) / 2
    std = (max_val - min_val) / 6  # Aproximadamente 99.7% de los datos caerán dentro de [min_val, max_val]
    data = np.random.normal(loc=mean, scale=std, size=size)
    return np.clip(data, min_val, max_val)


In [466]:
# Generar dataset3 con 'subarea' referenciado desde data2.id
dataset3 = pd.DataFrame({
    "id": generate_unique_id(1000),  # Generar 1000 IDs únicos
    "subarea": np.random.choice(data2["id"], n_rows),  # Elegir aleatoriamente de los IDs en data2
    "income": np.random.uniform(1000, 5000, n_rows),  # Generar valores numéricos aleatorios para 'income'
    "goal": np.random.normal(5000, 1000, n_rows)  # Generar valores numéricos aleatorios para 'goal'
})

In [979]:
df3 = pd.DataFrame(dataset3)
df3

Unnamed: 0,id,subarea,income,goal
0,8e3d6c48d1214373,eab5c9829de64e38,4508.172811,3045.772785
1,6b7b16981cc6442a,1d91ab55f9ee4123,2695.826332,5240.664510
2,272cee1edc1540e2,550b0533b8384805,4776.968180,4075.043296
3,430e9c159e464a19,eab5c9829de64e38,1369.981218,5902.906596
4,8393058569304f58,eab5c9829de64e38,2085.333824,6137.915398
...,...,...,...,...
995,6fc25a2d617c474d,ec43fbf119084875,2932.407600,4676.344258
996,92c805d603024536,550b0533b8384805,1661.308662,4725.728022
997,e6d2bfc57cee472b,1d91ab55f9ee4123,4330.750541,5537.790430
998,38359206b356431a,eab5c9829de64e38,2599.695174,4785.789670


## Requisito 4

In [1082]:
conf_list = [d5, d2, d4, d1, d3]
def build_dataframes(conf_list):
    dataframes = {}
    foreign_columns = []

    # Primera pasada: crear dataframes sin columnas 'foreign'
    for conf in conf_list:
        if "ds" not in conf:
            raise KeyError(f"La configuración {conf} no tiene la clave 'ds'.")
        if "columns" not in conf:
            raise KeyError(f"La configuración {conf} no tiene la clave 'columns'.")

        ds_name = conf["ds"]
        columns = conf["columns"]
        df_data = {}

        # Encontrar la longitud correcta para cada dataset
        length = conf.get("random_rows", 5)
        if not conf["random"]:
            length = max(len(col["values"]) for col in columns if col["type"] in ["unique", "category", "text"])

        for col in columns:
            col_name = col["name"]
            col_type = col["type"]
            if col_type == "unique":
                df_data[col_name] = generate_unique_id(length)
            elif col_type == "category":
                if isinstance(col["values"], str) and "." in col["values"]:
                    foreign_ds, foreign_col = col["values"].split(".")
                    foreign_columns.append((ds_name, col_name, foreign_ds, foreign_col, length))
                elif isinstance(col["values"], list) and all(isinstance(i, str) for i in col["values"]):
                    df_data[col_name] = np.random.choice(col["values"], length)
                else:
                    raise ValueError(f"Los valores de la columna '{col_name}' en el dataset '{ds_name}' no son válidos.")
            elif col_type == "numeric":
                if "min" in col["values"] and "max" in col["values"] and "std" in col["values"] and "mean" in col["values"]:
                    # Generar datos utilizando una distribución normal truncada
                    df_data[col_name] = generate_truncated_normal_data(
                        col["values"]["mean"],
                        col["values"]["std"],
                        col["values"]["min"],
                        col["values"]["max"],
                        length
                    )
                elif "min" in col["values"] and "max" in col["values"]:
                    # Generar datos utilizando una distribución normal simple dentro de un rango
                    df_data[col_name] = generate_normal_data_in_range(
                        col["values"]["min"],
                        col["values"]["max"],
                        length
                    )
                else:
                    raise ValueError(f"Los valores de la columna '{col_name}' en el dataset '{ds_name}' no son válidos.")

        dataframes[ds_name] = pd.DataFrame(df_data)

    # Segunda pasada: añadir columnas 'foreign'
    for ds_name, col_name, foreign_ds, foreign_col, length in foreign_columns:
        if foreign_ds not in dataframes:
            raise KeyError(f"El dataset '{foreign_ds}' no ha sido creado.")
        dataframes[ds_name][col_name] = np.random.choice(dataframes[foreign_ds][foreign_col].values, length)

    return dataframes

# Construcción de dataframes
dataframes = build_dataframes(conf_list)

# Convertir el diccionario a una lista de dataframes
dataframe_list = list(dataframes.values())

# Imprimir el dataframe correspondiente a dataset2
print(dataframe_list[4])

                   id       income         goal
0    0ea4fe9d2976403e  7474.931614  2727.301234
1    654c2662d01f4db5  5300.838693  8207.876616
2    c6a2567914ad4856  5718.151704  9053.095844
3    8088dd6654fb4a8f  3937.515925  2091.224246
4    16a2ce7f9e1a45c0  5777.376886  3968.569575
..                ...          ...          ...
995  177fe94d699147c8  4727.989172  6188.928620
996  12651a2661864677  2841.871217  7273.563055
997  1c8c6131ad564c57  4848.235675  9735.217450
998  c858fd20a35342d7  7406.494980  6890.864686
999  fe1d1cf8df5749cb  7142.880590  3424.580122

[1000 rows x 3 columns]


In [1051]:
d4 = {
    "ds": "dataset4",
    "columns": [
        {
            "name": "id",
            "type": "unique",
            "values": generate_unique_id(5)
        },
        {
            "name": "task",
            "type": "category",
            "values": ["Task1", "Task2", "Task3"]
        },
        {
            "name": "subarea",
            "type": "foreign",
            "values": "dataset2.subarea"
        }
    ],
    "random": False
}

d5 = {
    "ds": "dataset5",
    "columns": [
        {
            "name": "id",
            "type": "unique",
            "values": generate_unique_id(5)
        },
        {
            "name": "description",
            "type": "text",
            "values": ["Desc1", "Desc2", "Desc3"]
        },
        {
            "name": "task",
            "type": "foreign",
            "values": "dataset4.task"
        }
    ],
    "random": False
}

conf_list = [d5, d2, d4, d1, d3]

In [991]:
all_dataframes = list(dataframes.values())
all_dataframes[2]

Unnamed: 0,id,income,goal
0,e780577456ea4e4f,6075.817135,2772.833724
1,9190e11be1084ed8,2002.439021,4611.527370
2,fb3917e14504482a,1679.621455,5649.869733
3,1ec4a572ac05455d,2432.003070,4337.212150
4,f5f9a0a0e7c243b6,7358.249484,8495.571611
...,...,...,...
995,e7463a74c08f4254,8683.966751,7005.236803
996,b5fb3c1e35a34309,3825.678562,8049.209115
997,d0f8c186cb4d424e,9651.719398,8327.560273
998,c4c491c09d3d40d8,6222.013789,5547.189122


## Requisito 5

In [1067]:
print(df.columns)

Index(['id', 'area'], dtype='object')


In [1069]:
df5 = all_dataframes[0]  # Este es el dataset generado

# Analiza combinaciones de columnas categóricas
category_cols = ['id', 'area']  # Reemplaza con los nombres reales de las columnas categóricas
combinations = df[category_cols].value_counts(normalize=True).reset_index()
combinations.columns = category_cols + ['probability']

# Imprimir combinaciones y probabilidades
print(combinations)

                 id area  probability
0  457344a08ad44f10  MKT         0.25
1  71081cc415fd46df   HR         0.25
2  84e0eb34e7b64d26   TI         0.25
3  dca52213fea346cd  FIN         0.25


In [1014]:
def simulate_categorical_data(combinations, num_records):
    simulated_data = []
    for _ in range(num_records):
        # Elige una combinación de acuerdo a sus probabilidades
        combination = combinations.sample(weights='probability').iloc[0]
        simulated_data.append(combination[:-1].values)  # Excluye la probabilidad
    return pd.DataFrame(simulated_data, columns=combinations.columns[:-1])

final_simulation = simulate_categorical_data(combinations, 100000)

In [1035]:
def simulate_numeric_column(mean, std, size):
    return np.random.normal(loc=mean, scale=std, size=size)

numeric_cols = ['mean', 'std']  # Reemplaza con columnas numéricas
for col in numeric_cols:
    mean = 50000  
    std = 10000  
    final_simulation[col] = simulate_numeric_column(mean, std, 100000)


In [1037]:
print( final_simulation )

                     id area        income          goal          mean  \
0      84e0eb34e7b64d26   TI  50589.497621  61139.686727  34118.429288   
1      457344a08ad44f10  MKT  40222.205800  53692.853245  56520.203791   
2      71081cc415fd46df   HR  53930.130797  34348.166962  39302.859241   
3      84e0eb34e7b64d26   TI  53399.645213  55998.056172  49768.951007   
4      457344a08ad44f10  MKT  53606.116993  38042.667660  50205.680563   
...                 ...  ...           ...           ...           ...   
99995  457344a08ad44f10  MKT  55740.214859  46691.582962  39545.436803   
99996  457344a08ad44f10  MKT  60155.615132  50387.153798  68268.620077   
99997  84e0eb34e7b64d26   TI  47787.761805  31578.694576  42800.998539   
99998  457344a08ad44f10  MKT  51084.244325  51121.022667  48271.846607   
99999  457344a08ad44f10  MKT  41545.301962  45630.884638  34120.349431   

                std  
0      61329.966987  
1      45937.325242  
2      33718.209098  
3      32327.424517  
4

In [1039]:
print("Categorical Simulation Result:")
print(final_simulation.head())

Categorical Simulation Result:
                 id area        income          goal          mean  \
0  84e0eb34e7b64d26   TI  50589.497621  61139.686727  34118.429288   
1  457344a08ad44f10  MKT  40222.205800  53692.853245  56520.203791   
2  71081cc415fd46df   HR  53930.130797  34348.166962  39302.859241   
3  84e0eb34e7b64d26   TI  53399.645213  55998.056172  49768.951007   
4  457344a08ad44f10  MKT  53606.116993  38042.667660  50205.680563   

            std  
0  61329.966987  
1  45937.325242  
2  33718.209098  
3  32327.424517  
4  51530.583700  
