# Criador de dataset tabular

## Criar dataset complexo (regressor e classificador)
Cria um dataset com 8 entradas (features), sendo 6 delas utilizadas para criar a saída (target) de regressão por meio de um cálculo complexo. Com o resultado da regressão é criado outro dataset com os mesmos valores de entrada, mas com a saída (target) em formato categórico.

A saída é previsível (não contém ruído).

In [7]:
import pandas as pd
import numpy as np

# Sample size shared by every generated column
n_rows = 2000

# Multiplier attached to each category label; its keys are also the label
# pool that feature_2 is drawn from.
category_factors = {'A': 1.1, 'B': 0.9, 'C': 1.05, 'D': 0.95, 'E': 1.2, 'F': 0.85}

# feature_1: uniform draw in [1, 100) perturbed by Gaussian noise (mean 0, std 5)
feature_1 = (
    np.random.uniform(low=1, high=100, size=n_rows)
    + np.random.normal(loc=0, scale=5, size=n_rows)
)

# feature_2: categorical label picked uniformly among the six categories
feature_2 = np.random.choice(list(category_factors), size=n_rows)

# feature_3: uniform base + sine of a random angle + log1p of a uniform draw
feature_3 = (
    np.random.uniform(low=10, high=200, size=n_rows)
    + np.sin(np.random.uniform(low=0, high=2 * np.pi, size=n_rows))
    + np.log1p(np.random.uniform(low=1, high=50, size=n_rows))
)

# feature_4: uniform draw raised to the power 1.5 + square root of another draw
feature_4 = (
    np.random.uniform(low=5, high=50, size=n_rows) ** 1.5
    + np.sqrt(np.random.uniform(low=1, high=20, size=n_rows))
)

# feature_5: log1p of a uniform draw + a decaying exponential of another draw
feature_5 = (
    np.log1p(np.random.uniform(low=10, high=1000, size=n_rows))
    + np.exp(-np.random.uniform(low=0, high=10, size=n_rows))
)

# feature_6: sine and cosine of two independent random angles
feature_6 = (
    np.sin(np.random.uniform(low=0, high=2 * np.pi, size=n_rows))
    + np.cos(np.random.uniform(low=0, high=2 * np.pi, size=n_rows))
)

# feature_7: numeric factor of each row's category (no randomness involved)
feature_7 = np.array([category_factors[label] for label in feature_2])

# feature_8: decaying exponential + sine of an angle in [0, pi/2)
feature_8 = (
    np.exp(-np.random.uniform(low=0, high=50, size=n_rows))
    + np.sin(np.random.uniform(low=0, high=np.pi / 2, size=n_rows))
)


def _standardize(column):
    """Return the z-score of ``column``; the 1e-8 term guards a zero std."""
    return (column - column.mean()) / (column.std() + 1e-8)


# Bring the numeric features to a comparable scale (zero mean, unit std)
features = [feature_1, feature_3, feature_4, feature_5, feature_6, feature_8]
features_normalized = [_standardize(column) for column in features]

(
    feature_1_norm,
    feature_3_norm,
    feature_4_norm,
    feature_5_norm,
    feature_6_norm,
    feature_8_norm,
) = features_normalized

# ### Regression
# Regression target computed from the six standardized numeric features and
# the category factor of feature_2. The sum is made of, in order:
#   1. a weighted linear combination of the standardized features
#   2. sin(f1 * f3) * log1p(|f4 - f6|)
#   3. exp(-|f5 - f8|) * f3^2
#   4. cos(f4 * f6) * sqrt(|f1 - f8|)
#   5. (f3^3 - f5^2) * (sin(f6) + cos(f8)) * category factor
# NOTE(review): `*` binds tighter than `+`, so the category factor on the last
# line scales only term 5, not the whole sum - confirm this is intended.
target_reg = (
    # Balanced initial linear combination
    (1.55 * feature_1_norm + 
     0.95 * feature_3_norm + 
     1.80 * feature_4_norm + 
     0.4 * feature_5_norm + 
     0.15 * feature_6_norm + 
     2.10 * feature_8_norm)
    
    # Term 2: bounded oscillation scaled by a log of a feature gap
    + (np.sin(feature_1_norm * feature_3_norm) * 
       np.log1p(np.abs(feature_4_norm - feature_6_norm)))
    
    # Term 3: squared feature_3 damped when feature_5 and feature_8 differ
    + (np.exp(-np.abs(feature_5_norm - feature_8_norm)) *
       (feature_3_norm ** 2))
    
    # Term 4: cosine interaction scaled by the root of a feature gap
    + (np.cos(feature_4_norm * feature_6_norm) *
       np.sqrt(np.abs(feature_1_norm - feature_8_norm)))
    
    # Term 5: polynomial difference times a trigonometric mix ...
    + ((feature_3_norm ** 3) - (feature_5_norm ** 2)) *
      (np.sin(feature_6_norm) + np.cos(feature_8_norm))
    
    # ... times the per-row category factor (same mapping used for feature_7)
    * np.vectorize(category_factors.get)(feature_2)
)

# ### Classification
# Class boundaries: the 20/40/60/80/100th percentiles of the regression target.
# The last boundary (100th percentile) is the maximum of target_reg, so the
# top-scoring row is never strictly below any boundary.
thresholds = np.percentile(target_reg, [20, 40, 60, 80, 100])
clss = ['type_1', 'type_2', 'type_3', 'type_4', 'type_5']


def assign_category(score):
    """Return the class label whose percentile band contains ``score``.

    Bands are checked in ascending order and the first threshold strictly
    greater than ``score`` selects the label.

    Args:
        score: A single regression target value.

    Returns:
        One of the labels in ``clss``. A score that is not below any
        threshold (the maximum of ``target_reg``) gets the last label.
    """
    for cl, threshold in zip(clss, thresholds):
        if score < threshold:
            return cl
    # Previously this fell through and returned None for the row holding the
    # maximum score, leaving a missing label in the classification dataset.
    return clss[-1]


target_cat = np.array([assign_category(score) for score in target_reg])


# Input columns common to both datasets, listed in output column order
feature_columns = {
    'feature_1': feature_1,
    'feature_2': feature_2,
    'feature_3': feature_3,
    'feature_4': feature_4,
    'feature_5': feature_5,
    'feature_6': feature_6,
    'feature_7': feature_7,
    'feature_8': feature_8,
}

# Regression dataset: shared inputs plus the continuous target
df_reg = pd.DataFrame({**feature_columns, 'target': target_reg})

# Classification dataset: same inputs, target is the class label
df_cat = pd.DataFrame({**feature_columns, 'target': target_cat})

# Show both datasets, then the class boundaries and the class balance
print('##### Regression #####', df_reg, sep='\n')
print('\n\n##### Classification #####', df_cat, sep='\n')
print(f'\nThresholds: {thresholds}\n')
print(df_cat['target'].value_counts())

# Save each dataset as CSV, leaving out the index column
for frame, path in (
    (df_reg, 'dataset/dados_ficticios_reg.csv'),
    (df_cat, 'dataset/dados_ficticios_cat.csv'),
):
    frame.to_csv(path, index=False)


##### Regression #####
      feature_1 feature_2   feature_3   feature_4  feature_5  feature_6  \
0     16.647436         E  183.980936   24.617366   5.717618  -0.631023   
1     10.604436         A   85.459194  307.449148   6.655153   0.767629   
2     77.388714         E   40.558116  155.780617   6.921389   0.015533   
3     90.406079         A   15.880388   33.010214   6.250944  -1.473239   
4     87.356717         E  170.070489  178.530697   6.204453   1.044204   
...         ...       ...         ...         ...        ...        ...   
1995  52.551128         B   12.801325  246.448995   5.907150   0.017774   
1996  59.145417         D  156.294131  216.923304   5.872186   0.194822   
1997  22.532483         B  115.050490  217.424978   6.890049   0.120550   
1998  18.649490         F   38.755850  314.397756   6.590435  -0.790071   
1999  84.994711         E   82.194049  242.364786   6.448571   1.690049   

      feature_7  feature_8    target  
0          1.20   0.866846  0.084420 

## Criar dataset E=mc²
Cria um dataset contendo a massa como entrada (features) e a energia como saída (target).

A saída é previsível (não contém ruído).

In [6]:
import pandas as pd
import numpy as np

# Sample size of the generated dataset
n_rows = 1000

# Squared speed of light, using the rounded value c = 3e8: (3e8)^2 = 9e16
c_squared = 9e16

# Mass (m): uniform draw in [0.1, 100)
mass = np.random.uniform(low=0.1, high=100, size=n_rows)

# Energy (E) in joules from E = m * c^2
energy_joules = c_squared * mass

# Same energy expressed in terajoules (1 TJ = 10^12 J)
energy_terajoules = energy_joules / 1e12

# Input column first, then the target column
df = pd.DataFrame({'massa': mass})
df['target'] = energy_terajoules

# Preview the first rows of the dataset
preview = df.head()
print(preview)

# Save as CSV, leaving out the index column
df.to_csv('dataset/dataset_e_mc2_in_terajoules.csv', index=False)


       massa        target
0  59.290703  5.336163e+06
1  74.847717  6.736295e+06
2  74.895715  6.740614e+06
3  11.491273  1.034215e+06
4  41.090341  3.698131e+06
