# Pré-processamento de dados

## Primeira ideia

- Foram feitas 10 simulações para $cl_b$ e $cl_e$ para se associar a um valor de $r$ e $\tau$.

- Para a modelagem, precisamos agrupar esses valores para termos apenas um valor de $cl_b$ e $cl_e$.

- Vamos fazer a média das 10 simulações e associar a média, para cada harmonico esférico, aos observáveis $r$ e $\tau$.


# Parâmetros

In [61]:
directory = 'narrow'
dataframes = []
base_columns = ['cl_e', 'cl_b']
base_avg_columns = ['cl_e_avg', 'cl_b_avg']
file_range = 11

# Funções

In [None]:
import pandas as pd
import os
from typing import List

def read_and_prepare_files(directory: str, file_range: int, base_columns: List[str]) -> List[pd.DataFrame]:
    """
    Reads a series of files, renames their columns with a unique ID,
    and returns a list of the prepared DataFrames.

    Args:
        directory: The path to the directory containing the files.
        file_ids: A list of integers representing the file IDs to process.
        base_columns: A list of base column names to be used for renaming.

    Returns:
        A list of pandas DataFrames, each with uniquely named columns.
    """
    prepared_dfs = []
    
    for i in range(1, file_range):
        file_name = f'cl_cmb_c501_m{i}.dat'
        file_path = os.path.join(directory, file_name)
        
        try:
            df = pd.read_csv(file_path, sep='\s+', header=None)

            df.drop(index=[0, 1], inplace=True)
            
            # Create a unique list of column names for this file
            unique_columns = [f'{col}_{i}' for col in base_columns]
            df.columns = unique_columns
            
            prepared_dfs.append(df)
        except FileNotFoundError:
            print(f"Error: The file {file_name} was not found.")
        except Exception as e:
            print(f"An error occurred while reading {file_name}: {e}")
            
    return prepared_dfs


def combine_dataframes(dfs: List[pd.DataFrame], axis = int) -> pd.DataFrame:
    """
    Combines a list of DataFrames side-by-side using pd.concat with axis=1.

    Args:
        dfs: A list of pandas DataFrames to be joined.

    Returns:
        A single DataFrame resulting from the horizontal concatenation.
    """
    # Join all DataFrames side-by-side using axis=1
    combined_df = pd.concat(dfs, axis= axis)
    
    return combined_df

def calculate_average_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the row-wise average of 'cl_e' and 'cl_b' columns
    and returns a new DataFrame with the average columns.

    Args:
        df: The input DataFrame containing columns with 'cl_e_' and 'cl_b_' prefixes.

    Returns:
        A new DataFrame with two columns: 'cl_e_avg' and 'cl_b_avg'.
    """
    # Select all columns that start with 'cl_e_'
    cl_e_cols = [col for col in df.columns if col.startswith('cl_e_')]

    # Select all columns that start with 'cl_b_'
    cl_b_cols = [col for col in df.columns if col.startswith('cl_b_')]

    # Calculate the average for each group across rows (axis=1)
    cl_e_avg = df[cl_e_cols].mean(axis=1)
    cl_b_avg = df[cl_b_cols].mean(axis=1)

    # Create the new DataFrame with the two average columns
    final_df = pd.DataFrame({
        'cl_e_avg': cl_e_avg,
        'cl_b_avg': cl_b_avg
    })

    return final_df


def create_transposed_dataframe(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """
    Transposes a single column from a DataFrame.

    Args:
        df: The input DataFrame.
        column_name: The name of the column to transpose.

    Returns:
        The transposed DataFrame.
    """
    transposed_df = df[[column_name]].T
    return transposed_df


In [63]:
dfs_list = read_and_prepare_files(directory=directory, file_range= file_range ,base_columns= base_columns)

df = combine_dataframes_horizontally(dfs_list)

df_avg = calculate_average_columns(df)


# Create the DataFrame for cl_e_avg by transposing the column
df_e_avg_transposed = create_transposed_dataframe(df_avg, base_avg_columns[0])

# Create the DataFrame for cl_e_avg by transposing the column
df_b_avg_transposed = create_transposed_dataframe(df_avg, base_avg_columns[1])

# Display the shapes to confirm the transformation
print("\n" + "="*40)
print(f"DataFrame for cl_e_avg shape: {df_e_avg_transposed.shape}")
print("DataFrame for cl_e_avg (first 5 columns):")
print(df_e_avg_transposed.iloc[:, :5])

print("\n" + "="*40)
print(f"DataFrame for cl_b_avg shape: {df_b_avg_transposed.shape}")
print("DataFrame for cl_b_avg (first 5 columns):")
print(df_b_avg_transposed.iloc[:, :5])



DataFrame for cl_e_avg shape: (1, 511)
DataFrame for cl_e_avg (first 5 columns):
                 2         3         4         5         6
cl_e_avg  0.030175  0.027245  0.024617  0.019967  0.010551

DataFrame for cl_b_avg shape: (1, 511)
DataFrame for cl_b_avg (first 5 columns):
                 2         3         4         5         6
cl_b_avg  0.000179  0.000156  0.000133  0.000102  0.000076


In [64]:
dfs_e = []

dfs_b = []

for i in range(501, 701):

    dfs_list = read_and_prepare_files(directory=directory, file_range= file_range ,base_columns= base_columns)

    df = combine_dataframes_horizontally(dfs_list)

    df_avg = calculate_average_columns(df)

    # Create the DataFrame for cl_e_avg by transposing the column
    df_e_avg_transposed = create_transposed_dataframe(df_avg, base_avg_columns[0])

    dfs_e.append(df_e_avg_transposed)

    # Create the DataFrame for cl_e_avg by transposing the column
    df_b_avg_transposed = create_transposed_dataframe(df_avg, base_avg_columns[1])
    
    dfs_b.append(df_b_avg_transposed)
    

In [57]:
df_avg

Unnamed: 0,cl_e_avg,cl_b_avg
0,0.000000e+00,0.000000e+00
1,0.000000e+00,0.000000e+00
2,3.017487e-02,1.791218e-04
3,2.724500e-02,1.557736e-04
4,2.461747e-02,1.329248e-04
...,...,...
508,2.377047e-08,4.515863e-10
509,2.151342e-08,4.041255e-10
510,2.028507e-08,3.969090e-10
511,1.846510e-08,3.614847e-10


# VERSÃO KEDRO


In [None]:
# In conf/base/parameters.yml
data_directory: "narrow"
file_ids: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
base_columns: ['cl_e', 'cl_b']

# In conf/base/catalog.yml
prepared_dfs_list:
  type: kedro.extras.datasets.pickle.PickleDataSet
  filepath: data/01_raw/prepared_dfs_list.pkl

combined_df:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/combined_df.csv

combined_df:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/combined_data.csv # or wherever the combined file is saved

final_df:
  type: pandas.CSVDataSet
  filepath: data/03_primary/averaged_data.csv

In [None]:
import pandas as pd
import os
from typing import List

def read_and_prepare_files(directory: str, file_ids: List[int], base_columns: List[str]) -> List[pd.DataFrame]:
    """
    Reads a series of files, renames their columns with a unique ID,
    and returns a list of the prepared DataFrames.

    Args:
        directory: The path to the directory containing the files.
        file_ids: A list of integers representing the file IDs to process.
        base_columns: A list of base column names to be used for renaming.

    Returns:
        A list of pandas DataFrames, each with uniquely named columns.
    """
    prepared_dfs = []
    
    for i in file_ids:
        file_name = f'cl_cmb_c501_m{i}.dat'
        file_path = os.path.join(directory, file_name)
        
        try:
            df = pd.read_csv(file_path, sep='\s+', header=None)
            
            # Create a unique list of column names for this file
            unique_columns = [f'{col}_{i}' for col in base_columns]
            df.columns = unique_columns
            
            prepared_dfs.append(df)
        except FileNotFoundError:
            print(f"Error: The file {file_name} was not found.")
        except Exception as e:
            print(f"An error occurred while reading {file_name}: {e}")
            
    return prepared_dfs

def combine_dataframes_horizontally(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Combines a list of DataFrames side-by-side using pd.concat with axis=1.

    Args:
        dfs: A list of pandas DataFrames to be joined.

    Returns:
        A single DataFrame resulting from the horizontal concatenation.
    """
    # Join all DataFrames side-by-side using axis=1
    combined_df = pd.concat(dfs, axis=1)
    
    return combined_df


def calculate_average_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculates the row-wise average of 'cl_e' and 'cl_b' columns
    and returns a new DataFrame with the average columns.

    Args:
        df: The input DataFrame containing columns with 'cl_e_' and 'cl_b_' prefixes.

    Returns:
        A new DataFrame with two columns: 'cl_e_avg' and 'cl_b_avg'.
    """
    # Select all columns that start with 'cl_e_'
    cl_e_cols = [col for col in df.columns if col.startswith('cl_e_')]

    # Select all columns that start with 'cl_b_'
    cl_b_cols = [col for col in df.columns if col.startswith('cl_b_')]

    # Calculate the average for each group across rows (axis=1)
    cl_e_avg = df[cl_e_cols].mean(axis=1)
    cl_b_avg = df[cl_b_cols].mean(axis=1)

    # Create the new DataFrame with the two average columns
    final_df = pd.DataFrame({
        'cl_e_avg': cl_e_avg,
        'cl_b_avg': cl_b_avg
    })

    return final_df

In [None]:
from kedro.pipeline import Node, Pipeline

from .nodes import read_and_prepare_files, combine_dataframes_horizontally, calculate_average_columns, create_model_input_table, preprocess_companies, preprocess_shuttles


def create_pipeline(**kwargs) -> Pipeline:
    return Pipeline(
        [
            Node(
                func=read_and_prepare_files,
                inputs={
                    "directory": "params:data_directory",
                    "file_ids": "params:file_ids",
                    "base_columns": "params:base_columns"
                },
                outputs="prepared_dfs_list",
                name="read_and_prepare_files_node",
            ),
            Node(
                func=combine_dataframes_horizontally,
                inputs="prepared_dfs_list",
                outputs="combined_df",
                name="combine_prepared_dataframes_node",
            ),
            Node(
                func=calculate_average_columns,
                inputs="combined_df",
                outputs="final_df",
                name="calculate_average_columns_node",
            )
        ]
    )


In [24]:
# Read a space-separated text file
df_wider = pd.read_csv('CosmoID_r_tau_As_1to1000_concat_2dLHsampling_wider.txt', sep='\s+', header = None)

print(df_wider)

        0         1         2             3
0       1  0.002872  0.073520  2.182426e-09
1       2  0.006368  0.077384  2.199359e-09
2       3  0.026655  0.109919  2.347228e-09
3       4  0.012663  0.121044  2.400039e-09
4       5  0.047298  0.013508  1.935590e-09
..    ...       ...       ...           ...
995   996  0.036522  0.104435  2.321625e-09
996   997  0.010416  0.021746  1.967746e-09
997   998  0.046954  0.075001  2.188900e-09
998   999  0.011467  0.026458  1.986378e-09
999  1000  0.014936  0.051165  2.086998e-09

[1000 rows x 4 columns]


In [25]:
# Assign new headers
df_wider.columns = ['id', 'r', 'tau','incognito']

print("\nDataFrame with headers:")

print(df_wider)


DataFrame with headers:
       id         r       tau     incognito
0       1  0.002872  0.073520  2.182426e-09
1       2  0.006368  0.077384  2.199359e-09
2       3  0.026655  0.109919  2.347228e-09
3       4  0.012663  0.121044  2.400039e-09
4       5  0.047298  0.013508  1.935590e-09
..    ...       ...       ...           ...
995   996  0.036522  0.104435  2.321625e-09
996   997  0.010416  0.021746  1.967746e-09
997   998  0.046954  0.075001  2.188900e-09
998   999  0.011467  0.026458  1.986378e-09
999  1000  0.014936  0.051165  2.086998e-09

[1000 rows x 4 columns]


ValueError: Length mismatch: Expected axis has 1 elements, new values have 3 elements

In [3]:
# Read a space-separated text file
df = pd.read_csv('CosmoID_r_tau_As_1to500wider_501to700narrow_2dLHsampling.txt', sep=',')

print(df)

       1  0.002872  0.073520  2.182426e-09
0      2  0.006368  0.077384  2.199359e-09
1      3  0.026655  0.109919  2.347228e-09
2      4  0.012663  0.121044  2.400039e-09
3      5  0.047298  0.013508  1.935590e-09
4      6  0.036662  0.034989  2.020563e-09
..                                     ...
694  696  0.006017  0.059273  2.121118e-09
695  697  0.027259  0.047538  2.071914e-09
696  698  0.016554  0.041973  2.048984e-09
697  699  0.012717  0.042370  2.050611e-09
698  700  0.025609  0.067076  2.154481e-09

[699 rows x 1 columns]


In [20]:
# Read a space-separated text file
df = pd.read_csv('narrow/cl_cmb_c501_m1.dat',  sep='\s+', header = None)

print(df)

                0             1
0    0.000000e+00  0.000000e+00
1    0.000000e+00  0.000000e+00
2    4.251932e-03  2.276817e-04
3    2.081712e-02  7.996054e-05
4    7.046895e-03  9.265473e-05
..            ...           ...
508  2.293751e-08  4.881470e-10
509  2.057065e-08  3.901631e-10
510  2.099721e-08  4.011278e-10
511  1.916048e-08  3.570308e-10
512  0.000000e+00  0.000000e+00

[513 rows x 2 columns]



DataFrame with headers:
             Cl_E          Cl_B
0    0.000000e+00  0.000000e+00
1    0.000000e+00  0.000000e+00
2    4.251932e-03  2.276817e-04
3    2.081712e-02  7.996054e-05
4    7.046895e-03  9.265473e-05
..            ...           ...
508  2.293751e-08  4.881470e-10
509  2.057065e-08  3.901631e-10
510  2.099721e-08  4.011278e-10
511  1.916048e-08  3.570308e-10
512  0.000000e+00  0.000000e+00

[513 rows x 2 columns]
