In [None]:
import csv
import glob
import os
import re

import pandas as pd


In [None]:
# Paths to the folders holding the cleaned monthly CSV files, one variable per
# dataset suffix (68, 46, 32, 22). The same process is used for both the n2o
# and the o3 data.
# NOTE: all four values are currently the same placeholder string; replace each
# one with the real folder path before running the notebook.
# NOTE(review): the glob pattern applied to these folders later does not filter
# on the numeric suffix, so each path presumably has to point at a folder that
# contains only that dataset's files -- TODO confirm.
data_path_68 = "cleaned_file_path"
data_path_46 = "cleaned_file_path"
data_path_32 = "cleaned_file_path"
data_path_22 = "cleaned_file_path"

In [None]:
# Collect the input files of every dataset. Adjust the glob pattern if necessary.
# glob.glob() returns its matches in an arbitrary, file-system dependent order,
# so each list is sorted to make the processing order (and therefore the row
# order of the exported CSV files) reproducible across machines and runs.
csv_files_68 = sorted(glob.glob(os.path.join(data_path_68, "monthly_*_*.csv")))
csv_files_46 = sorted(glob.glob(os.path.join(data_path_46, "monthly_*_*.csv")))
csv_files_32 = sorted(glob.glob(os.path.join(data_path_32, "monthly_*_*.csv")))
csv_files_22 = sorted(glob.glob(os.path.join(data_path_22, "monthly_*_*.csv")))

In [None]:
# One independent, initially empty accumulator list per dataset; each will
# receive the per-file monthly statistics DataFrames.
all_data_68, all_data_46, all_data_32, all_data_22 = [], [], [], []

In [None]:
# Regular expression applied to the bare file name, e.g.
# "monthly_nz_n_201268.csv" or "monthly_estonia_201268.csv".
# Group 1 is the location (it may itself contain underscores, e.g. "nz_n"),
# group 2 is the four-digit year.
pattern = (
    r"^monthly_"   # file name starts with "monthly_"
    r"(.*?)"       # group 1: location, any characters, non-greedy
    r"_(\d{4})"    # underscore followed by group 2: the 4-digit year
    r"\d{2}"       # the trailing two digits ('68' or another 2-digit suffix)
    r"\.csv$"      # ".csv" at the very end
)

In [None]:
def averages_calculation(file_path, csv_files, all_data, filename_pattern=None):
    """
    Compute per-row (monthly) statistics for every CSV file in `csv_files`.

    Each file is read without a header; every row is treated as one month and
    may hold a different number of comma-separated values ("ragged" CSV).
    For each row the following is computed over its non-missing values:
      - mean_concentration
      - std_concentration  (pandas default, i.e. sample standard deviation)
      - count
    One DataFrame per file, with the columns Month, mean_concentration,
    std_concentration, count, location and year, is appended to `all_data`.
    Month is the 1-based position of the row in the file; location and year
    are taken from the file name.

    Parameters
    ----------
    file_path : any
        Unused. Kept only so that existing three-argument calls keep working.
    csv_files : iterable of str
        Paths of the CSV files to process, handled in the given order.
    all_data : list
        Accumulator that is extended in place.
    filename_pattern : str or compiled regex, optional
        Regular expression matched against the bare file name; group 1 must be
        the location and group 2 the four-digit year. When omitted, the
        module-level `pattern` is used.

    Returns
    -------
    list
        The same `all_data` object that was passed in.

    Files whose name does not match the pattern are skipped with a message.
    """
    if filename_pattern is None:
        # Fall back to the notebook-level default defined in an earlier cell.
        filename_pattern = pattern

    # A distinct loop variable is used so the `file_path` argument is not shadowed.
    for csv_path in csv_files:
        filename = os.path.basename(csv_path)
        match = re.match(filename_pattern, filename)
        if not match:
            print(f"Skipping {filename}, doesn't match expected pattern.")
            continue

        location, year = match.group(1), match.group(2)

        # With header=None pandas takes the number of columns from the first
        # row and raises a ParserError when a later row has more fields.
        # Determine the widest row first and pass explicit column names so
        # that shorter rows are simply padded with NaN.
        with open(csv_path, newline="", encoding="utf-8") as handle:
            max_fields = max((len(fields) for fields in csv.reader(handle)), default=0)

        # Rows correspond to months, columns to the individual values of a month.
        df = pd.read_csv(
            csv_path,
            header=None,
            names=list(range(max_fields)) if max_fields else None,
        )

        monthly_stats = []
        for row_number in range(len(df)):  # typically 12 months
            values = df.iloc[row_number].dropna()
            monthly_stats.append({
                "Month":                row_number + 1,
                "mean_concentration":   values.mean(),
                "std_concentration":    values.std(),
                "count":                len(values),
            })

        monthly_df = pd.DataFrame(monthly_stats)
        monthly_df["location"] = location
        monthly_df["year"]     = int(year)

        all_data.append(monthly_df)
    return all_data


In [None]:
# Start from a fresh list and rebind the result, so that re-running this cell
# does not append the same files a second time (and the returned list is not
# echoed as cell output).
all_data_68 = averages_calculation(data_path_68, csv_files_68, [])


In [None]:
# Start from a fresh list and rebind the result, so that re-running this cell
# does not append the same files a second time (and the returned list is not
# echoed as cell output).
all_data_46 = averages_calculation(data_path_46, csv_files_46, [])


In [None]:
# Start from a fresh list and rebind the result, so that re-running this cell
# does not append the same files a second time (and the returned list is not
# echoed as cell output).
all_data_32 = averages_calculation(data_path_32, csv_files_32, [])


In [None]:
# Start from a fresh list and rebind the result, so that re-running this cell
# does not append the same files a second time (and the returned list is not
# echoed as cell output).
all_data_22 = averages_calculation(data_path_22, csv_files_22, [])

In [None]:
# Stack the per-file DataFrames of each dataset into one long DataFrame with a
# fresh 0..n-1 index.
big_df_68, big_df_46, big_df_32, big_df_22 = [
    pd.concat(frames, ignore_index=True)
    for frames in (all_data_68, all_data_46, all_data_32, all_data_22)
]

In [None]:
# Add a "date" column to every dataset: the first day of the month given by
# the "year" and "Month" columns.
for _frame in (big_df_68, big_df_46, big_df_32, big_df_22):
    _frame["date"] = pd.to_datetime({
        "year": _frame["year"].astype(int),
        "month": _frame["Month"].astype(int),
        "day": 1
    })

In [None]:
# Latitude zone of every location (the table of location names is in README.md):
#   1 - north of 30°
#   2 - between 30° and -30°
#   3 - south of -30°
_locations_by_zone = {
    1: (
        "bashkortostan", "bozeman", "california", "catalonia", "estonia",
        "finland", "france", "huntingdon", "iceland_e", "iceland_w",
        "khabarovsk", "kyrgyzstan", "morocco", "mukhrino", "romania", "wales",
    ),
    2: (
        "brunei", "colombia", "florianopolis", "florida", "french_guiana",
        "kongo", "mexico", "myanmar", "pantanal", "quistococha", "taiwan",
        "tarapoto", "uganda_e", "uganda_n", "uganda_s",
    ),
    3: (
        "nz_n", "nz_s", "tasmania", "tierra_del_fuego",
    ),
}

# Flatten to a location -> zone lookup, ordered alphabetically by location.
location_to_zone = dict(sorted(
    (name, zone)
    for zone, names in _locations_by_zone.items()
    for name in names
))

In [None]:
# Attach the latitude zone of each row's location to every dataset.
# Series.map() yields NaN for any location that is missing from
# location_to_zone, so such locations are reported instead of passing silently.
for _label, _frame in (
    ("68", big_df_68),
    ("46", big_df_46),
    ("32", big_df_32),
    ("22", big_df_22),
):
    _frame["latitude_zone"] = _frame["location"].map(location_to_zone)
    _unmapped = sorted(set(_frame.loc[_frame["latitude_zone"].isna(), "location"]))
    if _unmapped:
        print(
            f"Warning: no latitude zone defined for {_unmapped} in dataset {_label}; "
            "latitude_zone is NaN for those rows."
        )

In [None]:
# Export each dataset to its own CSV file in the working directory, without
# the DataFrame index.
for _out_name, _frame in (
    ("monthly_averages_68.csv", big_df_68),
    ("monthly_averages_46.csv", big_df_46),
    ("monthly_averages_32.csv", big_df_32),
    ("monthly_averages_22.csv", big_df_22),
):
    _frame.to_csv(_out_name, index=False)

In [None]:
# Repeat the same process with the o3 data.