# Import Required Libraries
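Code to import the libraries used in this notebook. The first cell imports only seaborn; the other libraries used in the code shown here (os, json, yaml, pandas, numpy) are imported in the cells where they are first needed.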

In [None]:
# Import necessary libraries
import seaborn as sns  # For enhanced data visualization

# Apply seaborn's "whitegrid" style so that plots drawn after this cell share one look
sns.set(style="whitegrid")

In [None]:
import sys
import os
# NOTE: nothing is appended to sys.path here, so the `dataprocessing` package must already be importable (installed, or on PYTHONPATH)
import dataprocessing.creationOfDataframe as dp  # Custom module for data processing

In [None]:
import yaml
import os
from pathlib import Path

# 1. Get the path of the script.
# Jupyter cells do not define __file__, so fall back to the directory the
# kernel started in. That directory is remembered in the namespace because the
# os.chdir() further down would otherwise move the fallback on a re-run.
# NOTE(review): the fallback assumes the notebook sits in the same folder the
# script would (one level below 'src') -- confirm against the repository layout.
try:
    current_file = Path(__file__).resolve()  # src/training/your_script.py
except NameError:
    _start_dir = globals().setdefault("_NOTEBOOK_START_DIR", Path.cwd().resolve())
    # Placeholder file name: only the parent directories are used below.
    current_file = _start_dir / "notebook.ipynb"

# 2. Go up one level to 'src', then into 'config'
config_path = current_file.parent.parent / "config" / "config_general.yaml"

# 3. Load the YAML (explicit encoding so the result does not depend on the locale)
with open(config_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# 4. Resolve the root of the project (one level above 'src')
# Changing the working directory makes relative entries such as "./data" in
# the YAML resolve against the project root.
PROJECT_ROOT = current_file.parent.parent.parent
os.chdir(PROJECT_ROOT)

# Extract paths from YAML
DATA_DIR = config['paths']['data']
CHECKPOINT_DIR = config['paths']['checkpoints']
SAM_CHECKPOINT = config['paths']['sam_checkpoint']

# Load and Merge JSON Files
Code to locate the meteorological JSON files by calling `dp.get_all_image_paths` with the `'met.json'` pattern, read each file using `json.load()`, and collect the data into a pandas DataFrame with one row per file.

In [None]:
import os
import json
import pandas as pd

# Folder below the configured data directory that is searched for input files.
root_dir = DATA_DIR + '/avalanche_input'

# Collect the paths matching the 'met.json' pattern via the project helper.
met_patterns = ['met.json']
json_paths = dp.get_all_image_paths(root_dir, met_patterns)

n_json_files = len(json_paths)
print("Number of JSON files:", n_json_files)


In [None]:
# Read every meteorological JSON file into one record (one DataFrame row) per file.
# Each listed key is copied unchanged into a column of the same name; a key
# that is absent from a file is treated as an empty list.
met_fields = [
    "time",
    "air_temperature_2m",
    "precipitation_amount",
    "wind_speed_10m",
    "relative_humidity_2m",
    "air_pressure_at_sea_level",
]

records = []
count_mismatch = 0
for file in json_paths:
    # Skip paths that are no longer present on disk.
    if not os.path.exists(file):
        print(f"File does not exist: {file}")
        continue
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(file, "r", encoding="utf-8") as f:
        met_data = json.load(f)

    series_by_field = {field: met_data.get(field, []) for field in met_fields}

    # All series of one file must have the same number of entries; files that
    # disagree are reported, counted and left out of the DataFrame.
    distinct_lengths = {len(values) for values in series_by_field.values()}
    if len(distinct_lengths) > 1:
        print(f"Data length mismatch in file: {file}")
        count_mismatch += 1
        continue

    # Keep the origin of each row so it can be traced back to its file.
    records.append({**series_by_field, "source_file": file})

# Create a DataFrame from all records.
df_met = pd.DataFrame(records)
print("Meteorological DataFrame shape:", df_met.shape)
print("Number of mismatched files:", count_mismatch)
# Must be the last expression of the cell: a notebook only renders the value
# of the final expression, so a head() call in the middle is silently dropped.
df_met.head()

In [None]:
# Stop here when no record was loaded, before anything is saved or aggregated.
no_records_loaded = df_met.empty
if no_records_loaded:
    raise ValueError("The DataFrame is empty. Please check the JSON files.")

In [None]:
# Persist the merged DataFrame as a pickle inside the configured data directory.
pickle_path = DATA_DIR + '/df_met.pkl'
df_met.to_pickle(pickle_path)

In [None]:
numerical_columns = ['air_temperature_2m', 'precipitation_amount', 'wind_speed_10m', 'relative_humidity_2m', 'air_pressure_at_sea_level']

# Each cell of these columns holds the whole series read from one file, not a
# single number, so DataFrame.mean()/std() cannot aggregate the columns
# directly. Explode every column to one scalar per row first; a file that
# contributed an empty list becomes NaN, which mean()/std() skip.
flattened_columns = {
    col: pd.to_numeric(df_met[col].explode()) for col in numerical_columns
}

mean_values = pd.Series({col: values.mean() for col, values in flattened_columns.items()})
print("Mean values of numerical columns:")
print(mean_values)
# pandas computes the sample standard deviation (ddof=1) by default.
std_values = pd.Series({col: values.std() for col, values in flattened_columns.items()})
print("Standard deviation values of numerical columns:")
print(std_values)

In [None]:
# Names of the df_met columns that are summarised numerically.
numerical_columns = [
    'air_temperature_2m',
    'precipitation_amount',
    'wind_speed_10m',
    'relative_humidity_2m',
    'air_pressure_at_sea_level',
]

In [None]:
import numpy as np

def flatten_column(column_series):
    """Concatenate the list stored in every cell of a column into one array.

    Args:
        column_series: pandas Series whose cells each hold a list (or array)
            of values.

    Returns:
        A one-dimensional NumPy array with the values of all cells in row
        order, or an empty array when the series has no rows.
    """
    cell_lists = column_series.values
    # np.concatenate raises ValueError when given nothing to join.
    if len(cell_lists) == 0:
        return np.array([])
    return np.concatenate(cell_lists)

# Print one section per statistic: a heading, then one line per column with
# the statistic taken over all values of that column (every file combined).
stat_sections = (
    ("Mean values of numerical columns:", np.mean),
    ("Standard deviation values of numerical columns:", np.std),
)
for heading, reducer in stat_sections:
    print(heading)
    for col in numerical_columns:
        flat_values = flatten_column(df_met[col])
        print(f"{col}: {reducer(flat_values)}")