# Agregando os dados

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit



## 1 - Familiarizando com os dados

In [2]:
# Load realization m1 of target c1: whitespace-delimited text with no header row,
# so pandas labels the columns 0 and 1.
df = pd.read_csv(
    '../data/01_raw/wider/cl_cmb_c1_m1.dat',
    sep=r'\s+',
    header=None,
)

print(df)

                0             1
0    0.000000e+00  0.000000e+00
1    0.000000e+00  0.000000e+00
2    4.201063e-02  1.718837e-04
3    1.150391e-01  6.596925e-05
4    4.443855e-02  7.797863e-05
..            ...           ...
508  2.220550e-08  4.873860e-10
509  2.136289e-08  4.858634e-10
510  1.952949e-08  3.759005e-10
511  1.726959e-08  4.054609e-10
512  0.000000e+00  0.000000e+00

[513 rows x 2 columns]


In [3]:
# Count missing values per column of the first file.
df.isnull().sum()

0    0
1    0
dtype: int64

In [4]:
# Replace the default 0/1 labels with descriptive names
# (presumably the E-mode and B-mode spectra, per the notebook text).
df.columns = ['cl_e', 'cl_b']

# Header line followed by the frame, each on its own line.
print("\nDataFrame with headers:", df, sep="\n")


DataFrame with headers:
             cl_e          cl_b
0    0.000000e+00  0.000000e+00
1    0.000000e+00  0.000000e+00
2    4.201063e-02  1.718837e-04
3    1.150391e-01  6.596925e-05
4    4.443855e-02  7.797863e-05
..            ...           ...
508  2.220550e-08  4.873860e-10
509  2.136289e-08  4.858634e-10
510  1.952949e-08  3.759005e-10
511  1.726959e-08  4.054609e-10
512  0.000000e+00  0.000000e+00

[513 rows x 2 columns]


In [5]:
# Load realization m2 of target c1: whitespace-delimited text with no header row,
# so pandas labels the columns 0 and 1.
df2 = pd.read_csv(
    '../data/01_raw/wider/cl_cmb_c1_m2.dat',
    sep=r'\s+',
    header=None,
)

print(df2)

                0             1
0    0.000000e+00  0.000000e+00
1    0.000000e+00  0.000000e+00
2    3.674006e-02  5.771725e-05
3    1.053735e-01  6.833896e-05
4    4.177252e-02  1.176864e-04
..            ...           ...
508  2.286990e-08  4.638441e-10
509  2.265625e-08  4.146040e-10
510  1.977268e-08  3.850935e-10
511  1.716156e-08  3.658829e-10
512  0.000000e+00  0.000000e+00

[513 rows x 2 columns]


In [6]:
# Count missing values per column of the second file.
df2.isnull().sum()

0    0
1    0
dtype: int64

### Observações

- Não há dados nulos nos dois arquivos analisados.

- Observa-se que as duas primeiras linhas são iguais a zero. 

- Em conversa com a pessoa que gerou os dados por meio de simulação, foi recomendado que ambas as linhas sejam excluídas do dataset.

- Checa-se mais adiante se o dataframe final, contendo a união de todos os arquivos disponíveis, não terá dados nulos.

In [7]:
# Drop the first two ROWS (index labels 0 and 1), which are all zeros in this file.
# Reassigning with errors="ignore" instead of dropping in place keeps the cell
# re-runnable: on a second run the labels are already gone and nothing happens,
# whereas the in-place version raises KeyError.
df2 = df2.drop(index=[0, 1], errors="ignore")

print(df2)

                0             1
2    3.674006e-02  5.771725e-05
3    1.053735e-01  6.833896e-05
4    4.177252e-02  1.176864e-04
5    5.558406e-02  4.529440e-05
6    2.683714e-02  4.520182e-05
..            ...           ...
508  2.286990e-08  4.638441e-10
509  2.265625e-08  4.146040e-10
510  1.977268e-08  3.850935e-10
511  1.716156e-08  3.658829e-10
512  0.000000e+00  0.000000e+00

[511 rows x 2 columns]


In [8]:
# Replace the default 0/1 labels with descriptive names
# (presumably the E-mode and B-mode spectra, per the notebook text).
df2.columns = ['cl_e', 'cl_b']

# Header line followed by the frame, each on its own line.
print("\nDataFrame with headers:", df2, sep="\n")


DataFrame with headers:
             cl_e          cl_b
2    3.674006e-02  5.771725e-05
3    1.053735e-01  6.833896e-05
4    4.177252e-02  1.176864e-04
5    5.558406e-02  4.529440e-05
6    2.683714e-02  4.520182e-05
..            ...           ...
508  2.286990e-08  4.638441e-10
509  2.265625e-08  4.146040e-10
510  1.977268e-08  3.850935e-10
511  1.716156e-08  3.658829e-10
512  0.000000e+00  0.000000e+00

[511 rows x 2 columns]


# 2 - Juntando-se os dados

Foram disponibilizados vários arquivos de simulação para os valores de multipolos do espectro de potência da radiação cósmica de fundo.

Para cada valor de um observável que queremos prever, foram feitas 10 simulações.

Abaixo unem-se os 10 arquivos referentes a um único valor de observável cosmológico (que será o target).

Excluem-se as linhas com valores iguais a 0: linhas 0, 1 e 512.



In [9]:
# Load the 10 realization files of target c501 and stage them for a
# side-by-side (column-wise) join in the next cell.
directory = '../data/01_raw/wider/'
dataframes = []
base_columns = ['cl_e', 'cl_b']

for i in range(1, 11):
    file_name = f'cl_cmb_c501_m{i}.dat'
    file_path = os.path.join(directory, file_name)

    try:
        # Raw string: in a plain literal '\s' is an invalid escape sequence and
        # makes Python emit a warning every time the cell is compiled.
        df = pd.read_csv(file_path, sep=r'\s+', header=None)

        # Suffix the column names with the realization number so they stay
        # unique once the frames are concatenated along axis=1.
        df.columns = [f'cl_e_r{i}', f'cl_b_r{i}']

        # Remove the rows labelled 0, 1 and 512 (described above as all-zero).
        # A non-in-place drop avoids mutating the freshly read frame.
        df = df.drop(index=[0, 1, 512])

        dataframes.append(df)
        print(f"Prepared {file_name}")
    except FileNotFoundError:
        print(f"Error: The file {file_name} was not found.")
    except Exception as e:
        # Best-effort load: report the problem and move on to the next file.
        print(f"An error occurred while reading {file_name}: {e}")

Prepared cl_cmb_c501_m1.dat
Prepared cl_cmb_c501_m2.dat
Prepared cl_cmb_c501_m3.dat
Prepared cl_cmb_c501_m4.dat
Prepared cl_cmb_c501_m5.dat
Prepared cl_cmb_c501_m6.dat
Prepared cl_cmb_c501_m7.dat
Prepared cl_cmb_c501_m8.dat
Prepared cl_cmb_c501_m9.dat
Prepared cl_cmb_c501_m10.dat


  df = pd.read_csv(file_path, sep='\s+', header=None)


In [10]:
# Put the per-realization frames next to each other; axis=1 aligns them on the row index.
combined_df = pd.concat(objs=dataframes, axis=1)

# Report the resulting size (rows x columns).
print(f"\nCombined DataFrame has {combined_df.shape[0]} rows and {combined_df.shape[1]} columns.")

# Preview the first five rows of the joined frame.
print(combined_df.head(n=5))


Combined DataFrame has 510 rows and 20 columns.
    cl_e_r1   cl_b_r1   cl_e_r2   cl_b_r2   cl_e_r3   cl_b_r3   cl_e_r4  \
2  0.063304  0.000177  0.083743  0.000131  0.024790  0.000145  0.008915   
3  0.054251  0.000366  0.057342  0.000159  0.057861  0.000078  0.086991   
4  0.019729  0.000385  0.052765  0.000155  0.112431  0.000249  0.045072   
5  0.073937  0.000189  0.081302  0.000246  0.039662  0.000150  0.052894   
6  0.037034  0.000048  0.052730  0.000305  0.045731  0.000220  0.046467   

    cl_b_r4   cl_e_r5   cl_b_r5   cl_e_r6   cl_b_r6   cl_e_r7   cl_b_r7  \
2  0.000213  0.057359  0.000209  0.080824  0.000535  0.012191  0.000715   
3  0.000188  0.025027  0.000374  0.110379  0.000418  0.085401  0.000044   
4  0.000092  0.051360  0.000206  0.100464  0.000133  0.050084  0.000349   
5  0.000115  0.057518  0.000293  0.084041  0.000382  0.025231  0.000292   
6  0.000178  0.042778  0.000216  0.037155  0.000191  0.045960  0.000097   

    cl_e_r8   cl_b_r8   cl_e_r9   cl_b_r9  cl_e_r

# 3 - Transpondo os dados

- Para utilizar todos os dados e simular a presença de ruído nos dados, abaixo transpõe-se os dados lidos:

In [11]:
# Transpose so that every former column (one spectrum of one realization) becomes
# a row; reset_index then moves the old column names into an 'index' column.
df_transposed = (
    combined_df
    .T
    .reset_index()
)

df_transposed

Unnamed: 0,index,2,3,4,5,6,7,8,9,10,...,502,503,504,505,506,507,508,509,510,511
0,cl_e_r1,0.063304,0.054251,0.019729,0.073937,0.037034,0.020939,0.021466,0.010626,0.006817,...,3.936323e-08,3.462228e-08,3.029699e-08,3.003691e-08,2.76806e-08,2.580067e-08,2.353348e-08,2.226904e-08,1.928735e-08,1.839746e-08
1,cl_b_r1,0.000177,0.000366,0.000385,0.000189,4.8e-05,0.000157,0.000147,7.3e-05,0.000103,...,6.403156e-10,5.961137e-10,5.322256e-10,5.335975e-10,4.977203e-10,4.774566e-10,4.56031e-10,4.012602e-10,4.110528e-10,3.73989e-10
2,cl_e_r2,0.083743,0.057342,0.052765,0.081302,0.05273,0.027859,0.024706,0.009215,0.00807,...,4.080233e-08,3.612473e-08,3.511255e-08,2.829649e-08,2.756989e-08,2.765929e-08,2.203981e-08,2.081548e-08,2.05052e-08,1.786005e-08
3,cl_b_r2,0.000131,0.000159,0.000155,0.000246,0.000305,0.000177,7.9e-05,0.000113,8.9e-05,...,6.325501e-10,6.352722e-10,6.03934e-10,5.393067e-10,5.505967e-10,4.876478e-10,4.636709e-10,4.14326e-10,4.388595e-10,3.621788e-10
4,cl_e_r3,0.02479,0.057861,0.112431,0.039662,0.045731,0.024857,0.021979,0.012327,0.005341,...,3.575355e-08,3.468693e-08,3.483946e-08,3.18601e-08,2.828778e-08,2.468184e-08,2.415569e-08,2.14284e-08,1.976144e-08,1.682415e-08
5,cl_b_r3,0.000145,7.8e-05,0.000249,0.00015,0.00022,0.000228,6.5e-05,0.000112,8.9e-05,...,6.581546e-10,5.821009e-10,5.628135e-10,5.011548e-10,5.436323e-10,4.437804e-10,4.408738e-10,4.097907e-10,3.875119e-10,3.430054e-10
6,cl_e_r4,0.008915,0.086991,0.045072,0.052894,0.046467,0.054269,0.021458,0.011428,0.007383,...,3.80771e-08,4.059193e-08,3.320592e-08,2.953456e-08,2.770811e-08,2.5038e-08,2.596169e-08,2.222588e-08,2.162488e-08,1.756562e-08
7,cl_b_r4,0.000213,0.000188,9.2e-05,0.000115,0.000178,0.000191,0.000102,0.000114,5.1e-05,...,6.405241e-10,5.609236e-10,5.618016e-10,5.327607e-10,5.344538e-10,4.532758e-10,4.514293e-10,3.952968e-10,3.662061e-10,3.464608e-10
8,cl_e_r5,0.057359,0.025027,0.05136,0.057518,0.042778,0.028163,0.017292,0.010801,0.008316,...,3.91366e-08,3.360974e-08,3.292017e-08,3.091609e-08,3.008851e-08,2.553506e-08,2.36448e-08,2.099798e-08,1.912052e-08,1.729915e-08
9,cl_b_r5,0.000209,0.000374,0.000206,0.000293,0.000216,9.9e-05,0.000134,0.000122,8.2e-05,...,6.557831e-10,6.403748e-10,6.702092e-10,5.777754e-10,5.39984e-10,4.75411e-10,4.605536e-10,4.126458e-10,4.131803e-10,3.808255e-10


In [12]:
# 'index' keeps the original column name; the remaining 510 labels are l_2 .. l_511,
# matching the row labels that survived the drop of rows 0, 1 and 512.
columns = ['index', *(f'l_{i}' for i in range(2, 512))]

df_transposed.columns = columns

print("\nDataFrame with headers:")
print(df_transposed)


DataFrame with headers:
       index       l_2       l_3       l_4       l_5       l_6       l_7  \
0    cl_e_r1  0.063304  0.054251  0.019729  0.073937  0.037034  0.020939   
1    cl_b_r1  0.000177  0.000366  0.000385  0.000189  0.000048  0.000157   
2    cl_e_r2  0.083743  0.057342  0.052765  0.081302  0.052730  0.027859   
3    cl_b_r2  0.000131  0.000159  0.000155  0.000246  0.000305  0.000177   
4    cl_e_r3  0.024790  0.057861  0.112431  0.039662  0.045731  0.024857   
5    cl_b_r3  0.000145  0.000078  0.000249  0.000150  0.000220  0.000228   
6    cl_e_r4  0.008915  0.086991  0.045072  0.052894  0.046467  0.054269   
7    cl_b_r4  0.000213  0.000188  0.000092  0.000115  0.000178  0.000191   
8    cl_e_r5  0.057359  0.025027  0.051360  0.057518  0.042778  0.028163   
9    cl_b_r5  0.000209  0.000374  0.000206  0.000293  0.000216  0.000099   
10   cl_e_r6  0.080824  0.110379  0.100464  0.084041  0.037155  0.028571   
11   cl_b_r6  0.000535  0.000418  0.000133  0.000382  0.000191 

# 4 - Separando o dataframe acima em dois

- As 510 colunas do dataframe acima correspondem aos momentos de multipolo do modo $E$ e do modo $B$ no espectro de potências da CMB.

- Os 510 momentos podem ser utilizados para predizer o valor do observável $r$ no caso dos modos $B$ e para predizer o observável $\tau$ no caso do modo $E$.

- Por isso separa-se o dataframe acima em dois e toma-se a transposta dos dataframes resultantes

In [13]:
# Keep only the rows whose original column name starts with 'cl_e'.
is_e_mode = df_transposed['index'].str.startswith('cl_e')
df_e = df_transposed.loc[is_e_mode, :]

In [14]:
# Display the rows selected by the 'cl_e' prefix filter.
df_e

Unnamed: 0,index,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,...,l_502,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511
0,cl_e_r1,0.063304,0.054251,0.019729,0.073937,0.037034,0.020939,0.021466,0.010626,0.006817,...,3.936323e-08,3.462228e-08,3.029699e-08,3.003691e-08,2.76806e-08,2.580067e-08,2.353348e-08,2.226904e-08,1.928735e-08,1.839746e-08
2,cl_e_r2,0.083743,0.057342,0.052765,0.081302,0.05273,0.027859,0.024706,0.009215,0.00807,...,4.080233e-08,3.612473e-08,3.511255e-08,2.829649e-08,2.756989e-08,2.765929e-08,2.203981e-08,2.081548e-08,2.05052e-08,1.786005e-08
4,cl_e_r3,0.02479,0.057861,0.112431,0.039662,0.045731,0.024857,0.021979,0.012327,0.005341,...,3.575355e-08,3.468693e-08,3.483946e-08,3.18601e-08,2.828778e-08,2.468184e-08,2.415569e-08,2.14284e-08,1.976144e-08,1.682415e-08
6,cl_e_r4,0.008915,0.086991,0.045072,0.052894,0.046467,0.054269,0.021458,0.011428,0.007383,...,3.80771e-08,4.059193e-08,3.320592e-08,2.953456e-08,2.770811e-08,2.5038e-08,2.596169e-08,2.222588e-08,2.162488e-08,1.756562e-08
8,cl_e_r5,0.057359,0.025027,0.05136,0.057518,0.042778,0.028163,0.017292,0.010801,0.008316,...,3.91366e-08,3.360974e-08,3.292017e-08,3.091609e-08,3.008851e-08,2.553506e-08,2.36448e-08,2.099798e-08,1.912052e-08,1.729915e-08
10,cl_e_r6,0.080824,0.110379,0.100464,0.084041,0.037155,0.028571,0.018855,0.006867,0.007544,...,4.009004e-08,3.936094e-08,3.353502e-08,2.975173e-08,2.843316e-08,2.619798e-08,2.143305e-08,2.192261e-08,1.908531e-08,1.869502e-08
12,cl_e_r7,0.012191,0.085401,0.050084,0.025231,0.04596,0.022734,0.018932,0.012979,0.01053,...,3.908589e-08,3.578986e-08,3.586337e-08,3.081127e-08,2.884916e-08,2.536001e-08,2.331691e-08,2.199524e-08,1.961958e-08,1.911233e-08
14,cl_e_r8,0.015359,0.091228,0.091121,0.073031,0.034199,0.045205,0.029189,0.009541,0.006674,...,3.964365e-08,3.414672e-08,3.412e-08,3.231159e-08,2.801912e-08,2.612711e-08,2.390663e-08,2.089267e-08,1.915868e-08,1.802469e-08
16,cl_e_r9,0.032583,0.075934,0.083447,0.084348,0.054955,0.035447,0.016324,0.00876,0.008999,...,4.242591e-08,3.698008e-08,3.441897e-08,3.198797e-08,2.86087e-08,2.54293e-08,2.383455e-08,2.044749e-08,2.01285e-08,1.824988e-08
18,cl_e_r10,0.038724,0.036841,0.151774,0.106292,0.062812,0.047174,0.02821,0.010144,0.008941,...,4.247676e-08,3.603386e-08,3.319917e-08,2.99391e-08,2.89475e-08,2.727258e-08,2.329841e-08,2.146732e-08,1.875899e-08,1.708316e-08


In [15]:
# Keep only the rows whose original column name starts with 'cl_b'.
is_b_mode = df_transposed['index'].str.startswith('cl_b')
df_b = df_transposed.loc[is_b_mode, :]
df_b

Unnamed: 0,index,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,...,l_502,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511
1,cl_b_r1,0.000177,0.000366,0.000385,0.000189,4.8e-05,0.000157,0.000147,7.3e-05,0.000103,...,6.403156e-10,5.961137e-10,5.322256e-10,5.335975e-10,4.977203e-10,4.774566e-10,4.56031e-10,4.012602e-10,4.110528e-10,3.73989e-10
3,cl_b_r2,0.000131,0.000159,0.000155,0.000246,0.000305,0.000177,7.9e-05,0.000113,8.9e-05,...,6.325501e-10,6.352722e-10,6.03934e-10,5.393067e-10,5.505967e-10,4.876478e-10,4.636709e-10,4.14326e-10,4.388595e-10,3.621788e-10
5,cl_b_r3,0.000145,7.8e-05,0.000249,0.00015,0.00022,0.000228,6.5e-05,0.000112,8.9e-05,...,6.581546e-10,5.821009e-10,5.628135e-10,5.011548e-10,5.436323e-10,4.437804e-10,4.408738e-10,4.097907e-10,3.875119e-10,3.430054e-10
7,cl_b_r4,0.000213,0.000188,9.2e-05,0.000115,0.000178,0.000191,0.000102,0.000114,5.1e-05,...,6.405241e-10,5.609236e-10,5.618016e-10,5.327607e-10,5.344538e-10,4.532758e-10,4.514293e-10,3.952968e-10,3.662061e-10,3.464608e-10
9,cl_b_r5,0.000209,0.000374,0.000206,0.000293,0.000216,9.9e-05,0.000134,0.000122,8.2e-05,...,6.557831e-10,6.403748e-10,6.702092e-10,5.777754e-10,5.39984e-10,4.75411e-10,4.605536e-10,4.126458e-10,4.131803e-10,3.808255e-10
11,cl_b_r6,0.000535,0.000418,0.000133,0.000382,0.000191,0.000228,0.00016,7e-05,5.6e-05,...,6.470075e-10,6.19268e-10,5.461034e-10,5.652164e-10,5.125295e-10,4.747061e-10,4.651558e-10,4.249107e-10,4.112673e-10,3.795339e-10
13,cl_b_r7,0.000715,4.4e-05,0.000349,0.000292,9.7e-05,9e-05,6.1e-05,0.0001,7.2e-05,...,7.099361e-10,6.022377e-10,6.133663e-10,5.686178e-10,5.406788e-10,4.747081e-10,4.886554e-10,4.16561e-10,4.201233e-10,3.6668e-10
15,cl_b_r8,0.000182,0.000306,0.000111,0.00018,8.9e-05,0.000172,0.000136,8.2e-05,9.9e-05,...,6.376528e-10,6.753697e-10,5.939455e-10,5.412465e-10,5.26113e-10,4.711842e-10,4.670323e-10,4.276227e-10,4.129978e-10,3.800595e-10
17,cl_b_r9,0.000175,0.000126,0.000326,0.000303,0.000265,0.000185,0.00016,5.4e-05,3.7e-05,...,6.606976e-10,6.297157e-10,5.882333e-10,5.182556e-10,5.218351e-10,4.675197e-10,4.449337e-10,4.373022e-10,4.185134e-10,3.907489e-10
19,cl_b_r10,0.000226,0.000184,0.000268,0.000156,0.000181,0.000163,0.000139,5.8e-05,4e-05,...,6.757486e-10,6.597193e-10,5.775008e-10,5.855063e-10,5.088324e-10,5.038816e-10,4.428262e-10,4.636432e-10,3.975155e-10,4.105091e-10


# 5 - Incluindo mais dados

- O processo acima foi para exemplificar o pré-processamento referente a uma linha das features do dataframe que será usado para a modelagem

- Abaixo inclui-se os demais dados seguindo-se a mesma linha de raciocínio

In [16]:
directory = '../data/01_raw/wider/'

dataframes = []

# Outer loop: target index j runs over 1..1000.
for j in range(1, 1001):

    # Inner loop: realization i runs over 1..10 for each target j.
    for i in range(1, 11):
        file_name = f'cl_cmb_c{j}_m{i}.dat'
        file_path = os.path.join(directory, file_name)

        try:
            # Raw string: in a plain literal '\s' is an invalid escape sequence
            # and makes Python emit a warning when the cell is compiled.
            df = pd.read_csv(file_path, sep=r'\s+', header=None)
            # Encode realization and target in the column names so each stays unique.
            df.columns = [f'cl_e_realization{i}_target{j}', f'cl_b_realization{i}_target{j}']
            # Remove the rows labelled 0, 1 and 512 (described above as all-zero).
            df = df.drop(index=[0, 1, 512])
            dataframes.append(df)
            print(f"Prepared {file_name}")
        except FileNotFoundError:
            print(f"Error: The file {file_name} was not found.")
        except Exception as e:
            # Best-effort load: report the problem and continue with the next file.
            print(f"An error occurred while reading {file_name}: {e}")

# Stop here if nothing was loaded. Printing a message and carrying on would make
# the lines below operate on a `combined_df` left over from an earlier cell
# (or fail with a NameError on a fresh kernel).
if not dataframes:
    raise RuntimeError(f"No dataframes were created from the files in {directory}.")

combined_df = pd.concat(dataframes, axis=1)

# Transpose so each former column becomes a row; reset_index moves the old
# column names into an 'index' column.
df_transposed = combined_df.T.reset_index()

columns = ['index'] + [f'l_{i}' for i in range(2,512)]

# Assign new headers: 'index' plus l_2 .. l_511
df_transposed.columns = columns

# Separate the rows into E-mode and B-mode frames using the name prefix.
df_e = df_transposed.loc[df_transposed['index'].str.startswith('cl_e'),:]
df_b = df_transposed.loc[df_transposed['index'].str.startswith('cl_b'),:]

print("\n" + "="*40)
print("Final Concatenated DataFrame for cl_e_avg:")
print(df_e)

print("\n" + "="*40)
print("Final Concatenated DataFrame for cl_b_avg:")
print(df_b)
        

Prepared cl_cmb_c1_m1.dat
Prepared cl_cmb_c1_m2.dat
Prepared cl_cmb_c1_m3.dat
Prepared cl_cmb_c1_m4.dat
Prepared cl_cmb_c1_m5.dat
Prepared cl_cmb_c1_m6.dat
Prepared cl_cmb_c1_m7.dat
Prepared cl_cmb_c1_m8.dat
Prepared cl_cmb_c1_m9.dat
Prepared cl_cmb_c1_m10.dat
Prepared cl_cmb_c2_m1.dat
Prepared cl_cmb_c2_m2.dat
Prepared cl_cmb_c2_m3.dat
Prepared cl_cmb_c2_m4.dat
Prepared cl_cmb_c2_m5.dat
Prepared cl_cmb_c2_m6.dat
Prepared cl_cmb_c2_m7.dat
Prepared cl_cmb_c2_m8.dat
Prepared cl_cmb_c2_m9.dat
Prepared cl_cmb_c2_m10.dat
Prepared cl_cmb_c3_m1.dat
Prepared cl_cmb_c3_m2.dat
Prepared cl_cmb_c3_m3.dat
Prepared cl_cmb_c3_m4.dat
Prepared cl_cmb_c3_m5.dat
Prepared cl_cmb_c3_m6.dat
Prepared cl_cmb_c3_m7.dat
Prepared cl_cmb_c3_m8.dat
Prepared cl_cmb_c3_m9.dat
Prepared cl_cmb_c3_m10.dat
Prepared cl_cmb_c4_m1.dat
Prepared cl_cmb_c4_m2.dat
Prepared cl_cmb_c4_m3.dat
Prepared cl_cmb_c4_m4.dat
Prepared cl_cmb_c4_m5.dat
Prepared cl_cmb_c4_m6.dat
Prepared cl_cmb_c4_m7.dat
Prepared cl_cmb_c4_m8.dat
Prepared 

  df = pd.read_csv(file_path, sep='\s+', header=None)


Prepared cl_cmb_c10_m9.dat
Prepared cl_cmb_c10_m10.dat
Prepared cl_cmb_c11_m1.dat
Prepared cl_cmb_c11_m2.dat
Prepared cl_cmb_c11_m3.dat
Prepared cl_cmb_c11_m4.dat
Prepared cl_cmb_c11_m5.dat
Prepared cl_cmb_c11_m6.dat
Prepared cl_cmb_c11_m7.dat
Prepared cl_cmb_c11_m8.dat
Prepared cl_cmb_c11_m9.dat
Prepared cl_cmb_c11_m10.dat
Prepared cl_cmb_c12_m1.dat
Prepared cl_cmb_c12_m2.dat
Prepared cl_cmb_c12_m3.dat
Prepared cl_cmb_c12_m4.dat
Prepared cl_cmb_c12_m5.dat
Prepared cl_cmb_c12_m6.dat
Prepared cl_cmb_c12_m7.dat
Prepared cl_cmb_c12_m8.dat
Prepared cl_cmb_c12_m9.dat
Prepared cl_cmb_c12_m10.dat
Prepared cl_cmb_c13_m1.dat
Prepared cl_cmb_c13_m2.dat
Prepared cl_cmb_c13_m3.dat
Prepared cl_cmb_c13_m4.dat
Prepared cl_cmb_c13_m5.dat
Prepared cl_cmb_c13_m6.dat
Prepared cl_cmb_c13_m7.dat
Prepared cl_cmb_c13_m8.dat
Prepared cl_cmb_c13_m9.dat
Prepared cl_cmb_c13_m10.dat
Prepared cl_cmb_c14_m1.dat
Prepared cl_cmb_c14_m2.dat
Prepared cl_cmb_c14_m3.dat
Prepared cl_cmb_c14_m4.dat
Prepared cl_cmb_c14_m5.d

# 6 - Juntando os targets

## 6.1 - Estendendo o número de linhas dos targets

No dataset das features, as 10 realizações associadas ao mesmo índice de target estarão associadas às linhas do target abaixo.

Portanto, será necessário estender as linhas dos targets antes de juntá-los ao dataframe das features.

In [17]:
# Load the targets table: whitespace-delimited, no header row.
df = pd.read_csv(
    '../data/01_raw/targets/CosmoID_r_tau_As_1to1000_concat_2dLHsampling_wider.txt',
    sep=r'\s+',
    header=None,
)
df


Unnamed: 0,0,1,2,3
0,1,0.002872,0.073520,2.182426e-09
1,2,0.006368,0.077384,2.199359e-09
2,3,0.026655,0.109919,2.347228e-09
3,4,0.012663,0.121044,2.400039e-09
4,5,0.047298,0.013508,1.935590e-09
...,...,...,...,...
995,996,0.036522,0.104435,2.321625e-09
996,997,0.010416,0.021746,1.967746e-09
997,998,0.046954,0.075001,2.188900e-09
998,999,0.011467,0.026458,1.986378e-09


Adicionando nome às colunas

In [18]:
# Drop the first column (label 0; it appears to be a running target id) and name
# the remaining three. Reassigning with errors="ignore" instead of dropping in
# place keeps the cell re-runnable: on a second run label 0 no longer exists and
# the in-place version would raise KeyError.
df = df.drop(columns=[0], errors="ignore")
df.columns = ['r', 'tau', 'As']

print("\nDataFrame with headers:")

print(df)


DataFrame with headers:
            r       tau            As
0    0.002872  0.073520  2.182426e-09
1    0.006368  0.077384  2.199359e-09
2    0.026655  0.109919  2.347228e-09
3    0.012663  0.121044  2.400039e-09
4    0.047298  0.013508  1.935590e-09
..        ...       ...           ...
995  0.036522  0.104435  2.321625e-09
996  0.010416  0.021746  1.967746e-09
997  0.046954  0.075001  2.188900e-09
998  0.011467  0.026458  1.986378e-09
999  0.014936  0.051165  2.086998e-09

[1000 rows x 3 columns]


Estendendo o número de linhas dos targets:

In [None]:
# Number of consecutive copies to make of every target row.
n = 10

# index.repeat(n) yields each label n times in a row; .loc materialises those
# rows, and reset_index(drop=True) replaces the duplicated labels with a fresh
# 0..len-1 index.
df_extended = df.loc[df.index.repeat(n)].reset_index(drop=True)

In [20]:
# Display the targets after each row has been repeated n times.
df_extended

Unnamed: 0,r,tau,As
0,0.002872,0.073520,2.182426e-09
1,0.002872,0.073520,2.182426e-09
2,0.002872,0.073520,2.182426e-09
3,0.002872,0.073520,2.182426e-09
4,0.002872,0.073520,2.182426e-09
...,...,...,...
9995,0.014936,0.051165,2.086998e-09
9996,0.014936,0.051165,2.086998e-09
9997,0.014936,0.051165,2.086998e-09
9998,0.014936,0.051165,2.086998e-09


In [21]:
# Give all three frames the same fresh 0..len-1 index so that the column-wise
# concat below pairs rows purely by position.
final_e_reset = df_e.reset_index(drop=True)
final_b_reset = df_b.reset_index(drop=True)

df_reset = df_extended.reset_index(drop=True)

# The pairing is positional, so the spectra and the repeated targets must have
# exactly the same number of rows. If any spectrum file was skipped while
# loading, concat would pad with NaN and shift targets onto the wrong spectra
# without complaining -- fail loudly instead.
if not (len(final_e_reset) == len(final_b_reset) == len(df_reset)):
    raise ValueError(
        "Row count mismatch between features and targets: "
        f"E={len(final_e_reset)}, B={len(final_b_reset)}, targets={len(df_reset)}"
    )

# B-mode features keep only the 'r' target.
final_b = pd.concat([final_b_reset, df_reset], axis=1)
final_b = final_b.drop(columns=['tau', 'As'])

# E-mode features keep only the 'tau' target.
final_e = pd.concat([final_e_reset, df_reset], axis=1)
final_e = final_e.drop(columns=['r', 'As'])


final_b


Unnamed: 0,index,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,r
0,cl_b_realization1_target1,0.000172,0.000066,0.000078,0.000069,0.000063,0.000021,0.000053,0.000031,0.000043,...,6.058483e-10,6.123373e-10,5.492230e-10,4.987756e-10,4.996063e-10,4.873860e-10,4.858634e-10,3.759005e-10,4.054609e-10,0.002872
1,cl_b_realization2_target1,0.000058,0.000068,0.000118,0.000045,0.000045,0.000043,0.000063,0.000035,0.000042,...,5.957533e-10,5.382490e-10,5.511306e-10,4.965663e-10,4.848698e-10,4.638441e-10,4.146040e-10,3.850935e-10,3.658829e-10,0.002872
2,cl_b_realization3_target1,0.000066,0.000086,0.000129,0.000044,0.000056,0.000038,0.000041,0.000059,0.000060,...,6.044535e-10,5.756162e-10,5.306156e-10,4.744517e-10,4.625054e-10,4.354919e-10,3.960931e-10,3.977156e-10,3.608611e-10,0.002872
3,cl_b_realization4_target1,0.000140,0.000167,0.000060,0.000070,0.000064,0.000030,0.000018,0.000049,0.000025,...,6.067260e-10,5.923192e-10,5.206132e-10,5.064276e-10,4.546328e-10,4.771503e-10,4.041781e-10,3.925857e-10,3.493796e-10,0.002872
4,cl_b_realization5_target1,0.000025,0.000066,0.000041,0.000060,0.000052,0.000034,0.000046,0.000031,0.000035,...,6.144698e-10,5.878574e-10,5.581525e-10,5.349131e-10,4.615936e-10,4.710612e-10,4.277015e-10,3.812178e-10,3.678812e-10,0.002872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,cl_b_realization6_target1000,0.000017,0.000209,0.000385,0.000079,0.000089,0.000040,0.000053,0.000055,0.000056,...,5.516829e-10,5.934444e-10,5.169918e-10,5.210142e-10,4.470891e-10,4.619118e-10,3.933742e-10,3.561920e-10,3.397018e-10,0.014936
9996,cl_b_realization7_target1000,0.000021,0.000200,0.000087,0.000099,0.000158,0.000047,0.000015,0.000058,0.000137,...,5.594487e-10,5.265735e-10,4.667441e-10,4.792801e-10,4.221269e-10,3.972213e-10,3.605560e-10,3.874225e-10,3.352761e-10,0.014936
9997,cl_b_realization8_target1000,0.000244,0.000103,0.000202,0.000152,0.000073,0.000035,0.000057,0.000036,0.000052,...,5.235341e-10,6.281461e-10,5.356119e-10,5.341638e-10,4.507673e-10,4.504032e-10,4.218950e-10,4.454599e-10,3.516340e-10,0.014936
9998,cl_b_realization9_target1000,0.000029,0.000242,0.000166,0.000151,0.000074,0.000058,0.000049,0.000089,0.000043,...,5.865421e-10,5.865462e-10,5.099055e-10,5.082683e-10,4.383359e-10,4.465953e-10,3.948185e-10,3.890945e-10,3.387430e-10,0.014936


In [22]:
# Display the E-mode features with the 'tau' target column attached.
final_e

Unnamed: 0,index,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,tau
0,cl_e_realization1_target1,0.042011,0.115039,0.044439,0.040249,0.046558,0.014648,0.024440,0.005089,0.006585,...,3.556248e-08,3.467475e-08,2.974423e-08,2.819476e-08,2.582774e-08,2.220550e-08,2.136289e-08,1.952949e-08,1.726959e-08,0.073520
1,cl_e_realization2_target1,0.036740,0.105374,0.041773,0.055584,0.026837,0.009200,0.005779,0.009623,0.006778,...,3.582069e-08,3.439128e-08,2.912229e-08,2.720134e-08,2.639238e-08,2.286990e-08,2.265625e-08,1.977268e-08,1.716156e-08,0.073520
2,cl_e_realization3_target1,0.017331,0.139739,0.132292,0.039769,0.030675,0.020003,0.012577,0.008986,0.007984,...,3.522029e-08,3.397636e-08,3.001171e-08,2.845725e-08,2.579380e-08,2.486779e-08,2.105911e-08,1.956422e-08,1.896769e-08,0.073520
3,cl_e_realization4_target1,0.030328,0.025411,0.065339,0.059547,0.080523,0.030582,0.015537,0.008128,0.008160,...,3.480585e-08,3.342992e-08,3.012188e-08,3.000465e-08,2.608773e-08,2.484761e-08,2.104791e-08,1.873560e-08,1.744245e-08,0.073520
4,cl_e_realization5_target1,0.028113,0.089417,0.076477,0.047844,0.029997,0.019519,0.010933,0.007549,0.008458,...,3.768956e-08,3.355341e-08,3.207865e-08,2.924375e-08,2.745029e-08,2.411026e-08,1.989331e-08,1.838740e-08,1.854919e-08,0.073520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,cl_e_realization6_target1000,0.007681,0.039022,0.023729,0.014266,0.009819,0.003871,0.002152,0.002171,0.002723,...,3.501260e-08,3.339130e-08,3.083130e-08,2.931677e-08,2.513462e-08,2.491568e-08,2.201023e-08,1.835751e-08,1.712793e-08,0.051165
9996,cl_e_realization7_target1000,0.073231,0.052051,0.033256,0.021235,0.007832,0.009031,0.004504,0.003259,0.004047,...,3.663146e-08,3.277586e-08,3.203553e-08,2.847473e-08,2.393431e-08,2.307673e-08,2.346293e-08,2.130891e-08,1.909358e-08,0.051165
9997,cl_e_realization8_target1000,0.003440,0.063770,0.046129,0.018907,0.013944,0.004275,0.002605,0.005096,0.002657,...,3.865888e-08,3.264289e-08,2.893302e-08,2.490603e-08,2.524395e-08,2.367276e-08,2.108207e-08,1.896507e-08,1.855942e-08,0.051165
9998,cl_e_realization9_target1000,0.026832,0.052187,0.031538,0.018963,0.009716,0.005181,0.002878,0.004366,0.002280,...,3.693244e-08,3.430184e-08,3.264026e-08,2.777419e-08,2.462420e-08,2.309103e-08,2.261246e-08,2.058053e-08,1.793291e-08,0.051165


# Separando entre treino e teste

Como o conjunto de dados utilizado acima tem a peculiaridade de ter 10 observações associadas ao mesmo target, abaixo utiliza-se a seguinte estratégia:

- Divide-se os datasets entre X para as features e y para os targets.

- Utiliza-se a classe GroupShuffleSplit para encontrar-se índices agrupados utilizando-se os valores únicos do target para o agrupamento do dataframe.

- Filtra-se as features e targets utilizando-se os índices obtidos acima.

- Concatena-se as features com os targets para obter um dataframe final.

- Aplica-se a divisão entre treino e teste utilizando-se as funções criadas para executar as etapas descritas acima.

In [23]:
# E mode: the target is 'tau'; the features are every column except 'index' and 'tau'.
y_target_e = final_e['tau']
X_features_e = final_e.drop(columns=['index', 'tau'])

# B mode: the target is 'r'; the features are every column except 'index' and 'r'.
y_target_b = final_b['r']
X_features_b = final_b.drop(columns=['index', 'r'])

# Group labels for a grouped split: the target value of each row, so rows that
# share a target value land in the same group. (The splitter used in this
# notebook is GroupShuffleSplit.)
groups_e = y_target_e.to_numpy().ravel()
groups_b = y_target_b.to_numpy().ravel()


In [None]:
def gerar_indices_gss(X_features: "pd.DataFrame | np.ndarray",
                      y_target: "pd.Series | np.ndarray",
                      train_ratio: float = 0.8,
                      random_state: int = 42) -> tuple:
    """
    Compute one grouped train/test split with GroupShuffleSplit.

    Rows that share the same target value form one group, and a group is never
    divided between the train and the test side.

    Args:
        X_features: Feature table; only its number of rows matters for the split.
        y_target: Target values, one per row of ``X_features``. May be a pandas
            Series or any array-like; it is flattened to 1-D and used as the
            group labels.
        train_ratio: Passed to GroupShuffleSplit as ``train_size``
            (0.8 by default).
        random_state: Seed handed to GroupShuffleSplit so the split is
            reproducible (42 by default, the value previously hard-coded).

    Returns:
        ``(train_index, test_index)``: positional row indices for each side.
    """
    # np.asarray accepts both pandas objects and plain arrays, unlike .to_numpy().
    groups = np.asarray(y_target).ravel()

    gss_splitter = GroupShuffleSplit(
        n_splits=1,
        train_size=train_ratio,
        random_state=random_state,
    )

    # With n_splits=1 the iterator yields exactly one (train, test) pair.
    train_index, test_index = next(gss_splitter.split(X_features, groups=groups))
    return train_index, test_index


def aplicar_split(X: "pd.DataFrame | np.ndarray", y: "pd.Series | np.ndarray",
                  train_index: np.ndarray, test_index: np.ndarray) -> tuple:
    """
    Slice X and y into train and test parts using positional row indices.

    pandas inputs are sliced with ``.iloc`` and keep their type, labels and
    names; any other array-like is sliced with NumPy integer indexing, so the
    function also works for the plain arrays its original annotations promised.

    Args:
        X: Features, one row per sample.
        y: Targets, aligned row by row with ``X``.
        train_index: Positions of the training rows.
        test_index: Positions of the test rows.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    def _take(data, positions):
        # pandas objects expose .iloc; everything else goes through NumPy.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    return (
        _take(X, train_index),
        _take(X, test_index),
        _take(y, train_index),
        _take(y, test_index),
    )


def montar_dataframe(X_array: np.ndarray, y_array: np.ndarray) -> pd.DataFrame:
    """
    Attach the target to the features as additional right-most column(s).

    The two inputs are joined column-wise with ``pd.concat(axis=1)``, which
    aligns them on their index labels.

    NOTE(review): the annotations say ``np.ndarray``, but ``pd.concat`` needs
    pandas objects; the callers in this notebook pass a DataFrame and a Series.
    """
    pieces = [X_array, y_array]
    combined = pd.concat(pieces, axis=1)
    return combined

def criar_datasets_treino_teste_final(X_features: pd.DataFrame,
                                      y_target: pd.Series,
                                      train_ratio: float = 0.8) -> tuple:
    """
    Orchestrate the grouped train/test split and assemble the two DataFrames.

    Parameters
    ----------
    X_features : pd.DataFrame
        Feature columns.
    y_target : pd.Series
        Target column, row-aligned with ``X_features``. Its values are also
        used as the group labels of the split.
    train_ratio : float, default 0.8
        Train proportion forwarded to ``gerar_indices_gss``.

    Returns
    -------
    tuple
        ``(df_train, df_test)``, each holding the feature columns followed by
        the target column.
    """
    # 1. Row positions of each subset (GroupShuffleSplit, grouped by target value).
    train_index, test_index = gerar_indices_gss(X_features, y_target, train_ratio)

    # 2. Slice features and target with those positions.
    X_train, X_test, y_train, y_test = aplicar_split(
        X_features, y_target, train_index, test_index
    )

    # 3. Re-join features and target into one DataFrame per subset.
    df_train = montar_dataframe(X_train, y_train)
    df_test = montar_dataframe(X_test, y_test)

    return df_train, df_test


In [25]:
# 1. Processar os Dados do MODO E
# Mode E: grouped train/test split of the features with target 'tau'.
df_train_E, df_test_E = criar_datasets_treino_teste_final(X_features_e, y_target_e)

# Mode B: grouped train/test split of the features with target 'r'.
df_train_B, df_test_B = criar_datasets_treino_teste_final(X_features_b, y_target_b)

# Report the resulting shapes as a quick sanity check of the split sizes.
print(f"Modo E - Treino/Teste Shapes: {df_train_E.shape} / {df_test_E.shape}")
print(f"Modo B - Treino/Teste Shapes: {df_train_B.shape} / {df_test_B.shape}")

Modo E - Treino/Teste Shapes: (8000, 511) / (2000, 511)
Modo B - Treino/Teste Shapes: (8000, 511) / (2000, 511)


## Salvando dados de treino e teste em um arquivo csv

In [32]:
# Write each train/test table to the intermediate data folder as CSV,
# leaving the DataFrame index out of the file.
for _frame, _destino in (
    (df_train_E, '../data/02_intermediate/training_e_df.csv'),
    (df_test_E, '../data/02_intermediate/teste_e_df.csv'),
    (df_train_B, '../data/02_intermediate/training_b_df.csv'),
    (df_test_B, '../data/02_intermediate/teste_b_df.csv'),
):
    _frame.to_csv(_destino, index=False)

## Checando o salvamento dos dados

In [34]:
# Read the four intermediate CSV files back, in the order E-train, E-test,
# B-train, B-test, so their contents can be inspected.
df_e, df_e_teste, df_b, df_b_teste = (
    pd.read_csv(caminho)
    for caminho in (
        '../data/02_intermediate/training_e_df.csv',
        '../data/02_intermediate/teste_e_df.csv',
        '../data/02_intermediate/training_b_df.csv',
        '../data/02_intermediate/teste_b_df.csv',
    )
)

In [28]:
# Display the E-mode training table read back from training_e_df.csv.
df_e

Unnamed: 0,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,l_11,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,tau
0,0.065003,0.064016,0.036842,0.048620,0.078288,0.018107,0.012528,0.010383,0.006262,0.004466,...,3.390130e-08,3.367275e-08,3.072641e-08,3.320320e-08,2.511141e-08,2.398705e-08,2.361673e-08,1.965174e-08,1.875916e-08,0.077384
1,0.015162,0.123912,0.110420,0.029549,0.060428,0.011627,0.030145,0.012140,0.006943,0.009513,...,3.666414e-08,3.402342e-08,2.992691e-08,2.742300e-08,2.584623e-08,2.485943e-08,2.148072e-08,2.057205e-08,1.916832e-08,0.077384
2,0.012025,0.085358,0.071173,0.145057,0.040771,0.026864,0.014395,0.009686,0.005667,0.006341,...,3.587941e-08,3.319525e-08,3.332796e-08,2.752022e-08,2.700340e-08,2.462848e-08,1.999068e-08,2.094212e-08,1.901356e-08,0.077384
3,0.039885,0.058472,0.071832,0.052434,0.027600,0.023469,0.015682,0.006613,0.003761,0.006582,...,3.653302e-08,3.244743e-08,3.226249e-08,2.844149e-08,2.546132e-08,2.456684e-08,2.184239e-08,2.036447e-08,1.763171e-08,0.077384
4,0.065013,0.041966,0.034875,0.085098,0.041539,0.034242,0.018618,0.009849,0.012013,0.014574,...,3.947543e-08,3.230519e-08,2.890657e-08,2.632855e-08,2.666674e-08,2.445078e-08,2.244877e-08,2.052424e-08,1.997935e-08,0.077384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.007681,0.039022,0.023729,0.014266,0.009819,0.003871,0.002152,0.002171,0.002723,0.001720,...,3.501260e-08,3.339130e-08,3.083130e-08,2.931677e-08,2.513462e-08,2.491568e-08,2.201023e-08,1.835751e-08,1.712793e-08,0.051165
7996,0.073231,0.052051,0.033256,0.021235,0.007832,0.009031,0.004504,0.003259,0.004047,0.002325,...,3.663146e-08,3.277586e-08,3.203553e-08,2.847473e-08,2.393431e-08,2.307673e-08,2.346293e-08,2.130891e-08,1.909358e-08,0.051165
7997,0.003440,0.063770,0.046129,0.018907,0.013944,0.004275,0.002605,0.005096,0.002657,0.003597,...,3.865888e-08,3.264289e-08,2.893302e-08,2.490603e-08,2.524395e-08,2.367276e-08,2.108207e-08,1.896507e-08,1.855942e-08,0.051165
7998,0.026832,0.052187,0.031538,0.018963,0.009716,0.005181,0.002878,0.004366,0.002280,0.001562,...,3.693244e-08,3.430184e-08,3.264026e-08,2.777419e-08,2.462420e-08,2.309103e-08,2.261246e-08,2.058053e-08,1.793291e-08,0.051165


In [29]:
# Display the E-mode test table read back from teste_e_df.csv.
df_e_teste

Unnamed: 0,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,l_11,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,tau
0,0.042011,0.115039,0.044439,0.040249,0.046558,0.014648,0.024440,0.005089,0.006585,0.004986,...,3.556248e-08,3.467475e-08,2.974423e-08,2.819476e-08,2.582774e-08,2.220550e-08,2.136289e-08,1.952949e-08,1.726959e-08,0.073520
1,0.036740,0.105374,0.041773,0.055584,0.026837,0.009200,0.005779,0.009623,0.006778,0.004694,...,3.582069e-08,3.439128e-08,2.912229e-08,2.720134e-08,2.639238e-08,2.286990e-08,2.265625e-08,1.977268e-08,1.716156e-08,0.073520
2,0.017331,0.139739,0.132292,0.039769,0.030675,0.020003,0.012577,0.008986,0.007984,0.003046,...,3.522029e-08,3.397636e-08,3.001171e-08,2.845725e-08,2.579380e-08,2.486779e-08,2.105911e-08,1.956422e-08,1.896769e-08,0.073520
3,0.030328,0.025411,0.065339,0.059547,0.080523,0.030582,0.015537,0.008128,0.008160,0.005745,...,3.480585e-08,3.342992e-08,3.012188e-08,3.000465e-08,2.608773e-08,2.484761e-08,2.104791e-08,1.873560e-08,1.744245e-08,0.073520
4,0.028113,0.089417,0.076477,0.047844,0.029997,0.019519,0.010933,0.007549,0.008458,0.004227,...,3.768956e-08,3.355341e-08,3.207865e-08,2.924375e-08,2.745029e-08,2.411026e-08,1.989331e-08,1.838740e-08,1.854919e-08,0.073520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.004507,0.007408,0.003857,0.001845,0.000955,0.000764,0.000579,0.000592,0.000779,0.000493,...,3.290691e-08,3.410572e-08,3.192991e-08,2.622592e-08,2.645551e-08,2.397919e-08,2.277135e-08,2.097459e-08,1.848268e-08,0.026458
1996,0.005351,0.009917,0.000735,0.002133,0.000538,0.001243,0.000716,0.000791,0.000656,0.000517,...,3.664329e-08,3.459954e-08,3.142231e-08,2.586035e-08,2.583688e-08,2.438355e-08,2.123083e-08,2.105014e-08,1.856515e-08,0.026458
1997,0.005785,0.013439,0.009985,0.002152,0.001468,0.001179,0.000930,0.000654,0.000681,0.001292,...,3.830068e-08,3.429362e-08,3.137908e-08,2.864006e-08,2.614058e-08,2.451234e-08,2.213164e-08,1.981139e-08,1.829760e-08,0.026458
1998,0.020743,0.010360,0.003220,0.001211,0.001242,0.000955,0.000448,0.000363,0.000864,0.000617,...,3.734023e-08,3.537605e-08,2.897074e-08,2.883298e-08,2.589562e-08,2.359606e-08,2.158875e-08,1.948890e-08,1.844936e-08,0.026458


In [30]:
# Display the B-mode training table read back from training_b_df.csv.
df_b

Unnamed: 0,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,l_11,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,r
0,0.000172,0.000066,0.000078,0.000069,0.000063,0.000021,0.000053,0.000031,0.000043,0.000060,...,6.058483e-10,6.123373e-10,5.492230e-10,4.987756e-10,4.996063e-10,4.873860e-10,4.858634e-10,3.759005e-10,4.054609e-10,0.002872
1,0.000058,0.000068,0.000118,0.000045,0.000045,0.000043,0.000063,0.000035,0.000042,0.000061,...,5.957533e-10,5.382490e-10,5.511306e-10,4.965663e-10,4.848698e-10,4.638441e-10,4.146040e-10,3.850935e-10,3.658829e-10,0.002872
2,0.000066,0.000086,0.000129,0.000044,0.000056,0.000038,0.000041,0.000059,0.000060,0.000048,...,6.044535e-10,5.756162e-10,5.306156e-10,4.744517e-10,4.625054e-10,4.354919e-10,3.960931e-10,3.977156e-10,3.608611e-10,0.002872
3,0.000140,0.000167,0.000060,0.000070,0.000064,0.000030,0.000018,0.000049,0.000025,0.000042,...,6.067260e-10,5.923192e-10,5.206132e-10,5.064276e-10,4.546328e-10,4.771503e-10,4.041781e-10,3.925857e-10,3.493796e-10,0.002872
4,0.000025,0.000066,0.000041,0.000060,0.000052,0.000034,0.000046,0.000031,0.000035,0.000084,...,6.144698e-10,5.878574e-10,5.581525e-10,5.349131e-10,4.615936e-10,4.710612e-10,4.277015e-10,3.812178e-10,3.678812e-10,0.002872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.000138,0.000049,0.000021,0.000019,0.000019,0.000039,0.000031,0.000039,0.000046,0.000054,...,5.715559e-10,5.597664e-10,5.280729e-10,5.106220e-10,4.678772e-10,4.277167e-10,4.167436e-10,3.841709e-10,3.595660e-10,0.011467
7996,0.000049,0.000049,0.000023,0.000023,0.000052,0.000026,0.000055,0.000036,0.000066,0.000094,...,5.945880e-10,5.588835e-10,5.163429e-10,4.827139e-10,4.791565e-10,4.352070e-10,4.263153e-10,3.777479e-10,3.572270e-10,0.011467
7997,0.000084,0.000028,0.000024,0.000038,0.000044,0.000050,0.000049,0.000026,0.000089,0.000078,...,6.202956e-10,5.871600e-10,5.503659e-10,5.109527e-10,4.832764e-10,4.629816e-10,4.468772e-10,4.371018e-10,3.820512e-10,0.011467
7998,0.000007,0.000125,0.000024,0.000012,0.000050,0.000034,0.000055,0.000085,0.000079,0.000103,...,5.591386e-10,5.611786e-10,5.009329e-10,5.056176e-10,4.515758e-10,4.394383e-10,4.093147e-10,3.904380e-10,3.585950e-10,0.011467


In [31]:
# Display the B-mode test table read back from teste_b_df.csv.
df_b_teste

Unnamed: 0,l_2,l_3,l_4,l_5,l_6,l_7,l_8,l_9,l_10,l_11,...,l_503,l_504,l_505,l_506,l_507,l_508,l_509,l_510,l_511,r
0,0.000324,0.000231,0.000522,0.000442,0.000362,0.000169,0.000340,0.000088,0.000165,0.000080,...,6.354351e-10,6.187699e-10,5.472912e-10,5.294808e-10,4.588926e-10,4.831911e-10,4.436403e-10,4.468961e-10,3.870140e-10,0.012381
1,0.000151,0.000311,0.001109,0.000375,0.000294,0.000303,0.000097,0.000167,0.000160,0.000236,...,6.694943e-10,6.136765e-10,5.431460e-10,5.383948e-10,4.906521e-10,4.529458e-10,4.044472e-10,4.218503e-10,3.899587e-10,0.012381
2,0.000202,0.000149,0.000227,0.000125,0.000369,0.000214,0.000223,0.000248,0.000140,0.000113,...,6.146857e-10,5.446755e-10,5.413667e-10,4.347151e-10,4.661081e-10,4.313945e-10,4.091606e-10,3.493361e-10,4.023474e-10,0.012381
3,0.000220,0.000457,0.000355,0.000421,0.000318,0.000099,0.000261,0.000180,0.000138,0.000109,...,6.131173e-10,6.130531e-10,5.876743e-10,5.359379e-10,5.261903e-10,4.781340e-10,4.263396e-10,3.872770e-10,3.959668e-10,0.012381
4,0.000344,0.000149,0.000228,0.000236,0.000213,0.000378,0.000264,0.000188,0.000133,0.000094,...,6.410077e-10,5.819736e-10,5.628697e-10,5.259830e-10,4.852131e-10,4.648140e-10,3.990974e-10,4.037040e-10,3.888955e-10,0.012381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000017,0.000209,0.000385,0.000079,0.000089,0.000040,0.000053,0.000055,0.000056,0.000124,...,5.516829e-10,5.934444e-10,5.169918e-10,5.210142e-10,4.470891e-10,4.619118e-10,3.933742e-10,3.561920e-10,3.397018e-10,0.014936
1996,0.000021,0.000200,0.000087,0.000099,0.000158,0.000047,0.000015,0.000058,0.000137,0.000072,...,5.594487e-10,5.265735e-10,4.667441e-10,4.792801e-10,4.221269e-10,3.972213e-10,3.605560e-10,3.874225e-10,3.352761e-10,0.014936
1997,0.000244,0.000103,0.000202,0.000152,0.000073,0.000035,0.000057,0.000036,0.000052,0.000085,...,5.235341e-10,6.281461e-10,5.356119e-10,5.341638e-10,4.507673e-10,4.504032e-10,4.218950e-10,4.454599e-10,3.516340e-10,0.014936
1998,0.000029,0.000242,0.000166,0.000151,0.000074,0.000058,0.000049,0.000089,0.000043,0.000053,...,5.865421e-10,5.865462e-10,5.099055e-10,5.082683e-10,4.383359e-10,4.465953e-10,3.948185e-10,3.890945e-10,3.387430e-10,0.014936
