In [11]:
import pandas as pd

def preprocess_excel_data(
    file_path: str,
    sheet_index: int,
    skip_rows: int,
    categorical_columns: list,
    output_csv: str,
    feature_rows: tuple = (0, 1086),
    additional_rows: tuple = (1087, 1099),
    invalid_marker: str = 'x',
) -> pd.DataFrame:
    """
    Load a sheet, transpose two row blocks into columns, one-hot encode and save as CSV.

    The sheet is read without a header. Two positional row blocks are selected,
    transposed (so every sheet column becomes an output row) and placed side by
    side. The transposed row labelled 0 (the sheet's first column) is discarded,
    the next row supplies the column names, and the remaining rows are the data.

    Args:
        file_path (str): Path of the Excel workbook.
        sheet_index (int): Sheet passed to ``pd.read_excel`` as ``sheet_name``.
        skip_rows (int): Number of leading sheet rows to skip while reading.
        categorical_columns (list): Column names to one-hot encode.
        output_csv (str): Destination path of the CSV file (written without index).
        feature_rows (tuple): ``(start, stop)`` positions, counted after
            ``skip_rows``, of the first row block. Defaults to ``(0, 1086)``.
        additional_rows (tuple): ``(start, stop)`` positions of the second row
            block. Defaults to ``(1087, 1099)``.
        invalid_marker (str): Any data row containing this value in any column
            is removed. Defaults to ``'x'``.

    Returns:
        pd.DataFrame: The encoded frame with every column cast to ``int``. Its
        index keeps the gaps left by removed rows.

    Side effects:
        Writes ``output_csv`` and sets the pandas option ``display.max_columns``
        to ``None`` for the whole session.
    """
    raw = pd.read_excel(file_path, sheet_name=sheet_index, header=None, skiprows=skip_rows)

    # NOTE(review): the default bounds leave out row position 1086 between the
    # two blocks - presumably a separator row; confirm against the workbook.
    feature_start, feature_stop = feature_rows
    additional_start, additional_stop = additional_rows
    feature_vector = raw.iloc[feature_start:feature_stop, :]
    additional_features = raw.iloc[additional_start:additional_stop, :]

    # After transposition the sheet columns are rows; both blocks share them.
    full_data = pd.concat([feature_vector.T, additional_features.T], axis=1)

    # Drop the row that came from the sheet's first column, then promote the
    # next row to the header.
    df = full_data.drop(0, axis=0)
    df.columns = df.iloc[0]
    df = df[1:].reset_index(drop=True)

    # Remove every row that contains the invalid marker in any column.
    df = df[~df.isin([invalid_marker]).any(axis=1)]

    # Values that cannot be converted to int make this step fail.
    df = pd.get_dummies(df, columns=categorical_columns).astype(int)

    df.to_csv(output_csv, index=False)
    pd.set_option('display.max_columns', None)

    return df

# Gather the arguments of the preprocessing run in one mapping, then unpack
# them into the call; the result is the frame used by the following cells.
preprocess_settings = {
    'file_path': 'Data/Data.xlsx',
    'sheet_index': 0,
    'skip_rows': 1,
    'categorical_columns': ['Digester', 'Source', 'Type', 'Waste', 'Biomass'],
    'output_csv': 'output.csv',
}
df = preprocess_excel_data(**preprocess_settings)

In [12]:
import pandas as pd
import os

def get_unique_digester_samples(df: pd.DataFrame, digester_prefix: str, sample_count: int = 20, output_dir: str = "Output_Files") -> pd.DataFrame:
    """
    Pick one row per one-hot encoded digester column and save the picks as CSV.

    Columns whose name starts with ``digester_prefix`` are visited in frame
    order. For each column that has at least one row equal to 1, a single row is
    drawn with a fixed seed. Collection stops once ``sample_count`` rows have
    been gathered.

    Args:
        df (pd.DataFrame): Frame containing the one-hot encoded columns.
        digester_prefix (str): Prefix identifying the digester columns.
        sample_count (int): Number of collected rows at which to stop.
        output_dir (str): Directory for ``unique_digester_samples.csv``;
            created if it does not exist.

    Returns:
        pd.DataFrame: The collected rows with a fresh 0..n-1 index.

    Raises:
        ValueError: If no column matches the prefix, or if none of the matching
            columns has a row equal to 1.
    """
    # Column labels can be non-strings (e.g. numeric headers), so guard the
    # prefix test instead of letting ``startswith`` fail on them.
    digester_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith(digester_prefix)
    ]

    if not digester_columns:
        raise ValueError("No one-hot encoded 'Digester' columns found. Check your categorical column names.")

    selected_samples = []

    for digester in digester_columns:
        digester_group = df[df[digester] == 1]

        if not digester_group.empty:
            # Fixed seed keeps the drawn row reproducible between runs.
            selected_samples.append(digester_group.sample(n=1, random_state=42))

        if len(selected_samples) == sample_count:
            break

    # pd.concat([]) would fail with an unhelpful message; report the real cause.
    if not selected_samples:
        raise ValueError(
            f"No rows with value 1 found in any column starting with {digester_prefix!r}."
        )

    unique_digester_samples_df = pd.concat(selected_samples).reset_index(drop=True)

    # Create the output directory when missing; no error if it already exists.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, "unique_digester_samples.csv")
    unique_digester_samples_df.to_csv(output_file, index=False)

    print(f"Saved output to {output_file}")

    return unique_digester_samples_df


# Draw one seeded sample row per digester column and store the result
# in the directory named below.
output_directory = "Output_Files"

unique_samples_df = get_unique_digester_samples(
    df,
    digester_prefix='Digester_',
    sample_count=20,
    output_dir=output_directory,
)


Saved output to Output_Files/unique_digester_samples.csv


In [13]:
import os

import pandas as pd

def split_and_save_by_biomass(df: pd.DataFrame, biomass_prefix: str, output_dir: str):
    """
    Splits the dataset into separate DataFrames based on Biomass type and saves them as CSV files.

    Every column whose name starts with ``biomass_prefix`` yields one subset:
    the rows where that column equals 1, re-indexed from 0 and written to
    ``<output_dir>/<column name>.csv``.

    Args:
        df (pd.DataFrame): The processed DataFrame with one-hot encoded Biomass categories.
        biomass_prefix (str): The prefix used for one-hot encoded Biomass columns.
        output_dir (str): Directory where the CSV files should be saved; created
            if it does not exist.

    Returns:
        dict: A dictionary where keys are the one-hot Biomass column names, and
        values are the corresponding DataFrames.

    Raises:
        ValueError: If no column name starts with ``biomass_prefix``.
    """
    # Column labels can be non-strings (e.g. numeric headers), so guard the
    # prefix test instead of letting ``startswith`` fail on them.
    biomass_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith(biomass_prefix)
    ]

    if not biomass_columns:
        raise ValueError("No one-hot encoded 'Biomass' columns found. Check your categorical column names.")

    # Make sure the target directory exists so the function works without
    # the caller preparing it first.
    os.makedirs(output_dir, exist_ok=True)

    biomass_dfs = {}

    for biomass in biomass_columns:
        biomass_df = df[df[biomass] == 1].reset_index(drop=True)

        file_path = f"{output_dir}/{biomass}.csv"
        biomass_df.to_csv(file_path, index=False)

        biomass_dfs[biomass] = biomass_df

        print(f"Saved {file_path}")

    return biomass_dfs


import os

# Directory that receives one CSV per one-hot encoded Biomass column.
output_directory = "Biomass_Files"

# Create the directory when missing; exist_ok avoids the separate existence
# check and does not fail if the directory is already there.
os.makedirs(output_directory, exist_ok=True)

biomass_dataframes = split_and_save_by_biomass(df, biomass_prefix='Biomass_', output_dir=output_directory)


Saved Biomass_Files/Biomass_F.csv
Saved Biomass_Files/Biomass_G.csv
