In [12]:
import os
import pandas as pd

In [13]:
def process_raw_data(data_dir, file_prefix, output_dir, ID, metadata_rows=14, drop_row=2):
    """
    Combine all matching ``.asc`` files of one deployment into a single table and save it.

    Every file in ``data_dir`` whose name starts with ``file_prefix`` and ends
    with ``.asc`` is read (in sorted name order), tagged with the year, Julian
    day and dive cycle parsed from its file name, and stacked into one
    DataFrame. The combined table is written to ``output_dir`` as
    ``combined_<ID>.csv`` and ``combined_<ID>.parquet``.

    A file that cannot be parsed is reported with a printed message and
    skipped; it does not abort the run.

    Args:
        data_dir (str): Path to the directory containing the files.
        file_prefix (str): Prefix of the files to process (e.g., 'unit_436').
        output_dir (str): Directory the combined CSV/parquet files are written
            to. It is created if it does not exist.
        ID (str): Identifier used in the output file names (``combined_<ID>``).
        metadata_rows (int): Number of leading lines to skip before the header
            line. Default is 14.
        drop_row (int): Number of leading data rows (the rows right after the
            header line) to discard from each file. Default is 2.

    Returns:
        pd.DataFrame: A combined DataFrame containing processed data from all
        files (empty if no file matched or none could be read).
    """
    # os.listdir gives no ordering guarantee; sort so the row order of the
    # combined output is reproducible across machines and runs.
    files_to_process = sorted(
        f for f in os.listdir(data_dir)
        if f.startswith(file_prefix) and f.endswith(".asc")
    )

    # Collect per-file frames and concatenate once at the end: calling
    # pd.concat inside the loop re-copies all accumulated rows on every file.
    frames = []

    for file in files_to_process:
        file_path = os.path.join(data_dir, file)
        try:
            # File names look like '<prefix>-<year>-<julian day>-<dive cycle>-...'.
            # NOTE(review): any further '-'-separated fields (e.g. the trailing
            # '0' in 'unit_345-2019-070-4-0.sbd.asc') are ignored, so files that
            # differ only there get the same dive_cycle value - confirm intended.
            parts = file.split("-")
            year = int(parts[1])
            julian_day = int(parts[2])
            dive_cycle = int(parts[3])

            # Whitespace-delimited table; the first line after the skipped
            # metadata is used as the header. Raw string avoids the invalid
            # '\s' escape-sequence warning.
            df = pd.read_csv(file_path, sep=r"\s+", skiprows=metadata_rows)

            # Discard the first `drop_row` data rows, then add the file-name
            # metadata. .assign() returns a new frame, so nothing is written
            # into a slice of the frame returned by read_csv.
            df = df.iloc[drop_row:].assign(
                year=year,
                julian_day=julian_day,
                dive_cycle=dive_cycle,
            )

            frames.append(df)
            print(f"Successfully processed: {file}")
        except Exception as e:
            # Deliberate best-effort: report the bad file and carry on.
            print(f"Error processing {file}: {e}")

    # pd.concat raises on an empty list, so fall back to an empty frame.
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Make sure the destination exists before writing ('' means the current directory).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the combined table as CSV
    combined_output_file = os.path.join(output_dir, f"combined_{ID}.csv")
    combined_data.to_csv(combined_output_file, index=False)

    # Save the combined table as parquet
    combined_output_file = os.path.join(output_dir, f"combined_{ID}.parquet")
    combined_data.to_parquet(combined_output_file, index=False)

    return combined_data

In [14]:
# DIRECTORIES AND PREFIX OF DATA TO PROCESS
# Both roots are defined once so the notebook can be pointed at another
# machine/folder by editing two lines. Each deployment keeps its raw .asc files
# in <RAW_DATA_ROOT>\<ID>\asc and writes results to <PREPROCESSED_ROOT>\<ID>.
RAW_DATA_ROOT = r"C:\Users\Silvia\Desktop\Year 3 Project\UG data\Raw Data"
PREPROCESSED_ROOT = r"C:\Users\Silvia\Desktop\Year 3 Project\UG data\PREPROCESSED"

# 1. Cabot_517
Cabot_517_data_dir = RAW_DATA_ROOT + r"\517\asc"
Cabot_517_file_prefix = "unit_345"
Cabot_517_output_dir = PREPROCESSED_ROOT + r"\517"
Cabot_517_ID = '517'

# 2. Dolomite_499
Dolomite_499_data_dir = RAW_DATA_ROOT + r"\499\asc"
Dolomite_499_file_prefix = "unit_305"
Dolomite_499_output_dir = PREPROCESSED_ROOT + r"\499"
Dolomite_499_ID = '499'

# 3. Stella_494
Stella_494_data_dir = RAW_DATA_ROOT + r"\494\asc"
Stella_494_file_prefix = "unit_436"
Stella_494_output_dir = PREPROCESSED_ROOT + r"\494"
Stella_494_ID = '494'

# 4. Kelvin_481
Kelvin_481_data_dir = RAW_DATA_ROOT + r"\481\asc"
Kelvin_481_file_prefix = "unit_444"
Kelvin_481_output_dir = PREPROCESSED_ROOT + r"\481"
Kelvin_481_ID = '481'

# 5. Cabot_454 (same file prefix as Cabot_517)
Cabot_454_data_dir = RAW_DATA_ROOT + r"\454\asc"
Cabot_454_file_prefix = "unit_345"
Cabot_454_output_dir = PREPROCESSED_ROOT + r"\454"
Cabot_454_ID = '454'

In [16]:
# Run the processing for each of the five deployments in turn. Every call
# returns the combined DataFrame and also writes combined_<ID>.csv and
# combined_<ID>.parquet into that deployment's output directory.
combined_517_raw_data = process_raw_data(
    data_dir=Cabot_517_data_dir,
    file_prefix=Cabot_517_file_prefix,
    output_dir=Cabot_517_output_dir,
    ID=Cabot_517_ID,
)
combined_499_raw_data = process_raw_data(
    data_dir=Dolomite_499_data_dir,
    file_prefix=Dolomite_499_file_prefix,
    output_dir=Dolomite_499_output_dir,
    ID=Dolomite_499_ID,
)
combined_494_raw_data = process_raw_data(
    data_dir=Stella_494_data_dir,
    file_prefix=Stella_494_file_prefix,
    output_dir=Stella_494_output_dir,
    ID=Stella_494_ID,
)
combined_481_raw_data = process_raw_data(
    data_dir=Kelvin_481_data_dir,
    file_prefix=Kelvin_481_file_prefix,
    output_dir=Kelvin_481_output_dir,
    ID=Kelvin_481_ID,
)
combined_454_raw_data = process_raw_data(
    data_dir=Cabot_454_data_dir,
    file_prefix=Cabot_454_file_prefix,
    output_dir=Cabot_454_output_dir,
    ID=Cabot_454_ID,
)

Successfully processed: unit_345-2019-070-4-0.sbd.asc
Successfully processed: unit_345-2019-070-4-0.tbd.asc
Successfully processed: unit_345-2019-070-5-0.sbd.asc
Successfully processed: unit_345-2019-070-5-0.tbd.asc
Successfully processed: unit_345-2019-070-6-0.sbd.asc
Successfully processed: unit_345-2019-070-6-0.tbd.asc
Successfully processed: unit_345-2019-070-6-1.sbd.asc
Successfully processed: unit_345-2019-070-6-1.tbd.asc
Successfully processed: unit_345-2019-070-6-10.sbd.asc
Successfully processed: unit_345-2019-070-6-10.tbd.asc
Successfully processed: unit_345-2019-070-6-100.sbd.asc
Successfully processed: unit_345-2019-070-6-100.tbd.asc
Successfully processed: unit_345-2019-070-6-101.sbd.asc
Successfully processed: unit_345-2019-070-6-101.tbd.asc
Successfully processed: unit_345-2019-070-6-102.sbd.asc
Successfully processed: unit_345-2019-070-6-102.tbd.asc
Successfully processed: unit_345-2019-070-6-103.sbd.asc
Successfully processed: unit_345-2019-070-6-103.tbd.asc
Successful

In [19]:
# Process the Stella_494 deployment on its own; this rebinds
# combined_494_raw_data and rewrites that deployment's combined CSV/parquet.
combined_494_raw_data = process_raw_data(
    data_dir=Stella_494_data_dir,
    file_prefix=Stella_494_file_prefix,
    output_dir=Stella_494_output_dir,
    ID=Stella_494_ID,
)

Successfully processed: unit_436-2018-037-0-0.sbd.asc
Successfully processed: unit_436-2018-037-0-0.tbd.asc
Successfully processed: unit_436-2018-037-1-0.sbd.asc
Successfully processed: unit_436-2018-037-1-0.tbd.asc
Successfully processed: unit_436-2018-037-2-0.sbd.asc
Successfully processed: unit_436-2018-037-2-0.tbd.asc
Successfully processed: unit_436-2018-037-3-0.sbd.asc
Successfully processed: unit_436-2018-037-3-0.tbd.asc
Successfully processed: unit_436-2018-037-3-1.sbd.asc
Successfully processed: unit_436-2018-037-3-1.tbd.asc
Successfully processed: unit_436-2018-037-3-10.sbd.asc
Successfully processed: unit_436-2018-037-3-10.tbd.asc
Successfully processed: unit_436-2018-037-3-11.sbd.asc
Successfully processed: unit_436-2018-037-3-11.tbd.asc
Successfully processed: unit_436-2018-037-3-12.sbd.asc
Successfully processed: unit_436-2018-037-3-12.tbd.asc
Successfully processed: unit_436-2018-037-3-13.sbd.asc
Successfully processed: unit_436-2018-037-3-13.tbd.asc
Successfully process