In [10]:
import xarray as xr
import os
import glob
from  datetime import datetime
import boto3
import s3fs
import tempfile
import numpy as np
import pandas as pd
import re
import json

In [3]:
# Collect the raw NetCDF inputs and the generated GeoTIFF outputs.
# glob returns matches in arbitrary, filesystem-dependent order, so the lists
# are sorted to keep the row order of the statistics tables (and of the JSON
# written from them) stable across runs and machines.
raw_files = sorted(glob.glob("data/*.nc4"))
output_files = sorted(glob.glob("output_final2/*.tif"))

In [4]:
def extract_date_from_key(key):
    """Return the first ``_``-delimited token of ``key`` that looks like YYYYMM.

    The key is split on underscores and the tokens are scanned left to right.
    The first token that is exactly six characters long and consists only of
    digits is returned unchanged as a string; ``None`` is returned when no
    token qualifies.
    """
    candidates = (
        token
        for token in key.split('_')
        if len(token) == 6 and token.isdigit()
    )
    # next() with a default yields the first match, or None if there is none.
    return next(candidates, None)

In [5]:
# Per-variable statistics for every raw NetCDF file.
#   overall_raw : list with one array of values per (file, variable) pair,
#                 consumed later for the overall statistics.
#   raw         : one row per (file, variable) pair, labelled
#                 "<variable>_<year_month>", with min/max/mean/std computed
#                 while ignoring cells equal to -9999.
overall_raw = []
raw_records = []
for file in raw_files:
    # extract_date_from_key returns None when the path has no 6-digit token;
    # in that case the label below ends in "_None".
    year_month = extract_date_from_key(file)
    # Close each dataset as soon as its values have been read so that file
    # handles do not accumulate over the loop.
    with xr.open_dataset(file) as xds:
        for var in ["PM25-PRI", "CO2", "CO", "NOX", "SOX"]:
            # Read the values eagerly: data that is still lazily backed by the
            # file could not be loaded once the dataset has been closed.
            values = xds[var].values
            overall_raw.append(values)
            masked = np.ma.masked_where(values == -9999, values)
            raw_records.append([
                f"{var}_{year_month}",
                np.nanmin(masked),
                np.nanmax(masked),
                np.nanmean(masked),
                np.nanstd(masked),
            ])

# Build the table once instead of growing it one row at a time.
raw = pd.DataFrame(
    raw_records,
    columns=['filename', 'min_raw', 'max_raw', 'mean_raw', 'std_raw'],
)

In [6]:
# Per-file statistics for every generated GeoTIFF.
#   overall_cog : list with one array of values per file, consumed later for
#                 the overall statistics.
#   cog         : one row per file, labelled "<variable>_<year_month>", with
#                 min/max/mean/std computed while ignoring cells equal to -9999.
overall_cog = []
cog_records = []
for file in output_files:
    # The label parts come from the path itself: the six characters right
    # before the 4-character extension, and the second-to-last "_" token.
    year_month = file[:-4][-6:]
    var = file.split("_")[-2]
    # Close each file as soon as its values have been read so that file
    # handles do not accumulate over the loop.
    with xr.open_dataarray(file) as cog_array:
        # Read the values eagerly: data that is still lazily backed by the
        # file could not be loaded once it has been closed.
        values = cog_array.values
    overall_cog.append(values)
    masked = np.ma.masked_where(values == -9999, values)
    cog_records.append([
        f"{var}_{year_month}",
        np.nanmin(masked),
        np.nanmax(masked),
        np.nanmean(masked),
        np.nanstd(masked),
    ])

# Build the table once instead of growing it one row at a time.
cog = pd.DataFrame(
    cog_records,
    columns=['filename', 'min_cog', 'max_cog', 'mean_cog', 'std_cog'],
)

In [7]:
# Overall statistics across every array collected in overall_raw,
# ignoring cells equal to -9999.
overall_raw = np.array(overall_raw)
overall_raw = np.ma.masked_where(overall_raw == -9999, overall_raw)
nan_min, nan_max, nan_mean, nan_std = (
    reducer(overall_raw)
    for reducer in (np.nanmin, np.nanmax, np.nanmean, np.nanstd)
)
["overall_raw", nan_min, nan_max, nan_mean, nan_std]

['overall_raw', 0.0, 110011.766, 5.1753755, 172.26357]

In [8]:
# Overall statistics across every array collected in overall_cog.
# Cells equal to -9999 are masked first, exactly as the per-file COG
# statistics and the overall raw statistics do, so that the raw and COG
# figures are computed over comparable sets of cells.
overall_cog = np.array(overall_cog)
overall_cog = np.ma.masked_where(overall_cog == -9999, overall_cog)
nan_min = np.nanmin(overall_cog)
nan_max = np.nanmax(overall_cog)
nan_mean = np.nanmean(overall_cog)
nan_std = np.nanstd(overall_cog)
["overall_cog", nan_min, nan_max, nan_mean, nan_std]

['overall_cog', 0.0, 110011.766, 5.1753297, 172.27177]

In [9]:
# Join the COG and raw tables on their shared label, keeping only labels that
# appear in both, and write the side-by-side statistics to disk.
(
    cog
    .merge(raw, how='inner', on='filename')
    .to_json("monthly_stats.json")
)

In [11]:

# Overall raw-vs-COG statistics, written side by side to overall_stats.json.
# The numbers are transcribed from the recorded outputs of the two
# overall-statistics cells; update them whenever those cells are re-run on
# different data. The raw mean/std previously repeated the COG figures
# instead of the values the raw cell actually reported.
keys = ["data", "nan_min", "nan_max", "nan_mean", "nan_std"]
values_set1 = ["overall_raw", 0.0, 110011.766, 5.1753755, 172.26357]
values_set2 = ["overall_cog", 0.0, 110011.766, 5.1753297, 172.27177]

# Pair the raw and COG value under each key: {"nan_min": [raw, cog], ...}
data_dict = {key: [val1, val2] for key, val1, val2 in zip(keys, values_set1, values_set2)}

# Save the dictionary as a JSON file
with open("overall_stats.json", "w") as json_file:
    json.dump(data_dict, json_file, indent=4)