In [2]:
import zipfile
import tempfile
from pathlib import Path
import xarray as xr
import pandas as pd

# -------------------------------
# CONFIG
# -------------------------------
# Convert each monthly ERA5 download (2024-01 through 2025-10) into one flat CSV.
for j in [4, 5]:  # last digit of the year -> 2024, 2025
    for i in range(1, 13):
        # 2025 is only processed up to and including October.
        if j == 5 and i > 10:
            continue

        # Month is zero-padded so names follow era5_202Y_MM.
        file_path = Path(f"era5_data/era5_202{j}_{i:02d}.nc")  # your downloaded ERA5 file
        output_csv = f"era5_data_csv/era5_full_dataset_202{j}_{i:02d}.csv"

        # -------------------------------
        # LOAD DATASET (handle zip or nc)
        # -------------------------------
        if not file_path.exists():
            raise FileNotFoundError(file_path)

        # A download named .nc may really be a zip archive; zip files start
        # with the magic bytes "PK".
        with open(file_path, 'rb') as f:
            sig = f.read(4)

        # Scratch directory for an extracted archive member. It is removed when
        # this block exits, so extracted copies no longer pile up in the system
        # temp directory (the previous delete=False temp file was never removed).
        with tempfile.TemporaryDirectory() as tmp_dir:
            if sig[:2] == b'PK':
                print('File is a zip archive. Extracting .nc...')
                with zipfile.ZipFile(file_path, 'r') as z:
                    nc_files = [n for n in z.namelist() if n.lower().endswith('.nc')]
                    if not nc_files:
                        raise RuntimeError('Zip archive does not contain any .nc files')
                    # Only the first .nc member of the archive is used.
                    member = nc_files[0]
                    # extract() streams the member to disk instead of reading
                    # the whole file into memory first.
                    tmp_path = Path(z.extract(member, path=tmp_dir))
                nc_path = tmp_path
            else:
                print('File is not a zip; opening directly')
                nc_path = file_path

            # Open as a context manager so the file handle is released before
            # the scratch directory is deleted. NOTE: `ds` is therefore closed
            # once this block exits; only `df` should be used afterwards.
            with xr.open_dataset(nc_path) as ds:
                print('Dataset loaded successfully!')

                # -------------------------------
                # CONVERT TO DataFrame
                # -------------------------------
                print('Converting dataset to DataFrame...')

                # One row per combination of the dataset's dimensions;
                # reset_index() turns the dimension index into ordinary columns.
                df = ds.to_dataframe().reset_index()

        # Convert temperature from K to °C for common variables
        for var in ['t2m']:
            if var in df.columns:
                df[var] = df[var] - 273.15

        # -------------------------------
        # SAVE TO CSV
        # -------------------------------
        print(f'Saving full dataset to CSV: {output_csv} ...')
        # Create the output folder on first use so to_csv cannot fail on a
        # missing directory.
        Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_csv, index=False)
        print('Done!')


File is a zip archive. Extracting .nc...
Dataset loaded successfully!
Converting dataset to DataFrame...
Saving full dataset to CSV: era5_data_csv/era5_full_dataset_2024_01.csv ...
Done!
File is a zip archive. Extracting .nc...
Dataset loaded successfully!
Converting dataset to DataFrame...
Saving full dataset to CSV: era5_data_csv/era5_full_dataset_2024_02.csv ...
Done!
File is a zip archive. Extracting .nc...
Dataset loaded successfully!
Converting dataset to DataFrame...
Saving full dataset to CSV: era5_data_csv/era5_full_dataset_2024_03.csv ...
Done!
File is a zip archive. Extracting .nc...
Dataset loaded successfully!
Converting dataset to DataFrame...
Saving full dataset to CSV: era5_data_csv/era5_full_dataset_2024_04.csv ...
Done!
File is a zip archive. Extracting .nc...
Dataset loaded successfully!
Converting dataset to DataFrame...
Saving full dataset to CSV: era5_data_csv/era5_full_dataset_2024_05.csv ...
Done!
File is a zip archive. Extracting .nc...
Dataset loaded successfu

In [3]:
import os
import pandas as pd

def _format_csv_description(filename, df):
    """Build the complete report section for one CSV file as a single string.

    The section is assembled fully before anything is written, so a failure
    while summarizing a file cannot leave a half-written section in the report.

    Args:
        filename (str): Name of the CSV file, used in the section header.
        df (pandas.DataFrame): The parsed contents of that file.

    Returns:
        str: The formatted section (header, shape, dtypes, missing values,
             summary statistics and a five-row preview).
    """
    rule = "-" * 80 + "\n"
    parts = [rule, f"File Name: {filename}\n", rule, "\n"]

    # Basic Information
    parts.append("1. Basic Information:\n")
    parts.append(f"   - Rows: {df.shape[0]}\n")
    parts.append(f"   - Columns: {df.shape[1]}\n\n")

    # Column Details
    parts.append("2. Column Details (Name and Data Type):\n")
    for col, dtype in df.dtypes.items():
        parts.append(f"   - '{col}': {dtype}\n")
    parts.append("\n")

    # Missing Values
    parts.append("3. Missing Values per Column:\n")
    missing_values = df.isnull().sum()
    if missing_values.sum() == 0:
        parts.append("   - No missing values found.\n")
    else:
        for col, count in missing_values.items():
            if count > 0:
                parts.append(f"   - '{col}': {count} missing values\n")
    parts.append("\n")

    # Summary Statistics
    parts.append("4. Summary Statistics (for numeric columns):\n")
    # to_string() renders the whole table instead of pandas' truncated repr.
    parts.append(df.describe().to_string())
    parts.append("\n\n")

    # Preview of the first 5 rows
    parts.append("5. Data Preview (First 5 Rows):\n")
    parts.append(df.head().to_string())
    parts.append("\n\n\n")

    return "".join(parts)


def describe_csv_in_folder(folder_path='.', output_file='csv_descriptions.txt'):
    """
    Analyzes all CSV files in a given folder and writes a description of each
    to a single output file.

    Files are processed in sorted name order so the report is reproducible
    between runs. Directory entries whose name merely ends in '.csv' are
    ignored. A file that cannot be read or summarized is recorded in the
    report as an error line and processing continues with the next file.

    Args:
        folder_path (str): The path to the folder containing CSV files.
                           Defaults to the current directory.
        output_file (str): The name of the text file to save the descriptions to.
                           Defaults to 'csv_descriptions.txt'. An existing file
                           is overwritten.

    Returns:
        None. Progress and errors are printed; the report is written to
        `output_file`.
    """
    # Check if the folder path exists
    if not os.path.isdir(folder_path):
        print(f"Error: The folder '{folder_path}' does not exist.")
        return

    # Open the output file in write mode, which will overwrite any existing file
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(f"CSV File Analysis Report for folder: '{os.path.abspath(folder_path)}'\n")
            f.write("=" * 80 + "\n\n")

            print(f"Starting analysis of CSV files in '{folder_path}'...")

            # os.listdir() returns entries in arbitrary order, so sort them.
            # Only regular files are kept: a sub-directory called "x.csv"
            # is not a CSV file.
            csv_files = sorted(
                name for name in os.listdir(folder_path)
                if name.lower().endswith('.csv')
                and os.path.isfile(os.path.join(folder_path, name))
            )

            if not csv_files:
                message = "No CSV files found in the specified directory.\n"
                f.write(message)
                print(message)
                return

            for filename in csv_files:
                file_path = os.path.join(folder_path, filename)

                try:
                    # Read the CSV file and build its whole section up front.
                    df = pd.read_csv(file_path)
                    section = _format_csv_description(filename, df)
                except Exception as e:
                    # Best effort: note the failure and move on to the next file.
                    error_message = f"Could not process {filename}. Error: {e}\n\n"
                    f.write(error_message)
                    print(error_message)
                    continue

                f.write(section)
                print(f" - Successfully described '{filename}'")

        print(f"\nAnalysis complete. Report saved to '{output_file}'")

    except OSError as e:
        # Raised when the report file cannot be opened or written.
        print(f"Error: Could not write to the file '{output_file}'. Reason: {e}")


if __name__ == '__main__':
    # Folder holding the CSV files to summarize. The value is hard-coded (no
    # user prompt), so the former "empty input -> current directory" fallback
    # could never trigger and has been removed.
    target_folder = "era5_data_csv"

    # Writes the report to the function's default output file,
    # 'csv_descriptions.txt'.
    describe_csv_in_folder(target_folder)


Starting analysis of CSV files in 'era5_data_csv'...
 - Successfully described 'era5_full_dataset_2024_01.csv'
 - Successfully described 'era5_full_dataset_2024_02.csv'
 - Successfully described 'era5_full_dataset_2024_03.csv'
 - Successfully described 'era5_full_dataset_2024_04.csv'
 - Successfully described 'era5_full_dataset_2024_05.csv'
 - Successfully described 'era5_full_dataset_2024_06.csv'
 - Successfully described 'era5_full_dataset_2024_07.csv'
 - Successfully described 'era5_full_dataset_2024_08.csv'
 - Successfully described 'era5_full_dataset_2024_09.csv'
 - Successfully described 'era5_full_dataset_2024_10.csv'
 - Successfully described 'era5_full_dataset_2024_11.csv'
 - Successfully described 'era5_full_dataset_2024_12.csv'
 - Successfully described 'era5_full_dataset_2025_01.csv'
 - Successfully described 'era5_full_dataset_2025_02.csv'
 - Successfully described 'era5_full_dataset_2025_03.csv'
 - Successfully described 'era5_full_dataset_2025_04.csv'
 - Successfully des