In [None]:
import os
import shutil
import tarfile
import tempfile
from multiprocessing import Pool

import pandas as pd
import pygrib

# Directory that is scanned for input archives: every entry whose name ends
# in '.tar' is processed. Replace this with the actual path to your tar files
# directory.
tar_files_directory = '/content/'
# Scratch directory into which archives are extracted while their GRIB files
# are being read.
temp_dir = '/content/temp_dir/'

# Function to process a single .grib file
def process_grib_file(grib_file):
    """Flatten every message of one GRIB file into column-oriented lists.

    The file is opened with ``pygrib.open`` and each message contributes one
    row per grid point: the flattened ``latitudes``, ``longitudes`` and
    ``values`` arrays, with the message's ``level``, ``name``, ``step``,
    ``validityDate`` and ``validityTime`` repeated for every point.

    Args:
        grib_file: Path of the GRIB file to read.

    Returns:
        A dict with the keys ``lat``, ``lon``, ``level``, ``param_name``,
        ``param_value``, ``forecast_step``, ``Date`` and ``time``, each
        mapping to a list. All lists always have the same length. If reading
        fails, the error is printed and the rows gathered from the messages
        completed before the failure are returned (possibly none).
    """
    data = {
        'lat': [],
        'lon': [],
        'level': [],
        'param_name': [],
        'param_value': [],
        'forecast_step': [],
        'Date': [],
        'time': [],
    }

    try:
        with pygrib.open(grib_file) as grbs:
            for grb in grbs:
                # Read every field before touching ``data``: if any lookup
                # raises, no column has been extended yet, so the columns
                # can never end up with different lengths (which would make
                # the later DataFrame construction fail).
                latitudes = grb.latitudes.ravel()
                longitudes = grb.longitudes.ravel()
                values = grb.values.ravel()
                level = grb['level']
                name = grb['name']
                step = grb['step']
                validity_date = grb.validityDate
                validity_time = grb.validityTime

                n_points = latitudes.size
                if longitudes.size != n_points or values.size != n_points:
                    raise ValueError(
                        f"coordinate/value size mismatch in message '{name}': "
                        f"{n_points} latitudes, {longitudes.size} longitudes, "
                        f"{values.size} values"
                    )

                data['lat'].extend(latitudes)
                data['lon'].extend(longitudes)
                data['level'].extend([level] * n_points)
                data['param_name'].extend([name] * n_points)
                data['param_value'].extend(values)
                data['forecast_step'].extend([step] * n_points)
                data['Date'].extend([validity_date] * n_points)
                data['time'].extend([validity_time] * n_points)
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole batch, so report the problem and return what was collected.
        print(f"Error processing {grib_file}: {e}")  # Debugging: Catch and print any error

    return data

# Function to extract .tar files and process the .grib files
def process_tar_file(tar_file_path):
    """Extract one tar archive and merge the data of all its .grib2 files.

    The archive is unpacked into a fresh sub-directory of the module-level
    ``temp_dir`` so that files left behind by earlier archives or runs are
    never mistaken for members of this one. Every extracted file whose name
    ends in ``.grib2`` (at any depth) is passed to ``process_grib_file`` in
    a ``multiprocessing.Pool``; the per-file results are concatenated in
    sorted path order. The scratch sub-directory is always removed, even if
    extraction or processing raises.

    Args:
        tar_file_path: Path of the tar archive to process.

    Returns:
        A dict mapping each column name produced by ``process_grib_file`` to
        the concatenated list of values, or an empty dict when the archive
        contains no ``.grib2`` files.

    Raises:
        tarfile.TarError, OSError: If the archive cannot be opened or
            extracted.
    """
    print(f"Processing TAR file: {tar_file_path}")  # Debugging: Check which tar file is being processed

    # The scratch root may not exist yet (e.g. on a fresh runtime).
    os.makedirs(temp_dir, exist_ok=True)
    extract_dir = tempfile.mkdtemp(dir=temp_dir)

    try:
        # Extract files from tar archive
        with tarfile.open(tar_file_path, 'r') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape extract_dir (absolute
                # paths, '..', unsafe links) on Pythons that support filters.
                tar.extractall(path=extract_dir, filter='data')
            else:
                # NOTE: no extraction filter available on this Python; only
                # use archives from trusted sources.
                tar.extractall(path=extract_dir)

        # Collect GRIB files recursively: archives may contain sub-folders.
        grib_files = sorted(
            os.path.join(root, name)
            for root, _dirs, names in os.walk(extract_dir)
            for name in names
            if name.endswith('.grib2')
        )
        print(f"Found {len(grib_files)} .grib files in {tar_file_path}")  # Debugging: Check if grib files were found

        if not grib_files:
            print(f"No .grib files found in {tar_file_path}")  # Debugging: Alert if no grib files
            print(f"No data processed for {tar_file_path}")  # Debugging: Alert if no data is processed
            # Nothing to do: avoid starting worker processes for no work.
            return {}

        # Process each grib file in parallel
        with Pool() as pool:
            result_data = pool.map(process_grib_file, grib_files)
    finally:
        # Clear the scratch directory (files and sub-directories alike).
        shutil.rmtree(extract_dir, ignore_errors=True)

    # Combine all per-file dictionaries into a single dictionary.
    combined_data = {key: [] for key in result_data[0]}
    for data_chunk in result_data:
        for key in combined_data:
            combined_data[key].extend(data_chunk[key])

    return combined_data

# Main execution loop
def process_all_tar_files(tar_files_directory):
    """Process every '.tar' entry of a directory and return one DataFrame.

    Each entry of ``tar_files_directory`` whose name ends in ``.tar`` is
    handed to ``process_tar_file``; the column lists it returns are appended
    to a running set of columns, which is finally converted to a pandas
    DataFrame.

    Args:
        tar_files_directory: Directory to scan for tar archives.

    Returns:
        A DataFrame with the columns ``lat``, ``lon``, ``level``,
        ``param_name``, ``param_value``, ``forecast_step``, ``Date`` and
        ``time``; it has no rows when no archive yields data.
    """
    column_names = (
        'lat',
        'lon',
        'level',
        'param_name',
        'param_value',
        'forecast_step',
        'Date',
        'time',
    )
    collected = {column: [] for column in column_names}

    # Snapshot the archive list up front, before any extraction happens.
    archive_paths = []
    for entry in os.listdir(tar_files_directory):
        if entry.endswith('.tar'):
            archive_paths.append(os.path.join(tar_files_directory, entry))

    for archive_path in archive_paths:
        archive_columns = process_tar_file(archive_path)
        for column, bucket in collected.items():
            # An archive without data returns an empty dict, hence .get.
            bucket.extend(archive_columns.get(column, []))

    # Convert the accumulated columns to a pandas DataFrame
    return pd.DataFrame(collected)


# Run the process. The guard keeps the pipeline from being re-executed when
# this module is imported, which is what multiprocessing worker processes do
# under the "spawn" start method; in a notebook or when run as a script
# __name__ is "__main__", so the pipeline still runs and `df` is defined.
if __name__ == "__main__":
    df = process_all_tar_files(tar_files_directory)
