# Process GEOS-Chem model outputs using Xarray
Xarray is a powerful Python package designed for processing N-dimensional (ND) arrays. It handles NetCDF4 files, such as GEOS-Chem outputs, particularly well. You can find many useful functions on [its homepage](http://xarray.pydata.org/en/stable/). There is also an excellent [Python/xarray tutorial for GEOS-Chem users](https://github.com/geoschem/GEOSChem-python-tutorial) written by GEOS-Chem experts.

## Here I just show some examples of using Xarray to process GEOS-Chem outputs.

In [None]:
###########################################################################################################################
# load packages

import os
import glob
import numpy  as np
import pandas as pd
import xarray as xr

In [None]:
###########################################################################################################################
# set up the working directory

working_dir = "move-to-your-working-directory"
os.chdir(working_dir)

# confirm which directory is now active
current_dir = os.getcwd()
print(current_dir)

In [None]:
# find GEOS-Chem output files in the current working directory
# each list is sorted so that the file order is deterministic; later cells pair
# the Species and StateMet entries element by element
# all three patterns require the ".nc4" extension
Species  = sorted(glob.glob("GEOSChem.SpeciesConc*.nc4"))
StateMet = sorted(glob.glob("GEOSChem.StateMet*.nc4"))
Aerosols = sorted(glob.glob("GEOSChem.AerosolMass*.nc4"))

# check the selected files (one path per line)
print(*Species, *StateMet, *Aerosols, sep="\n")

In [None]:
'''
# syntax to open a single netcdf file
data = xr.open_dataset("filename")

# then you can use "for loop" to open multiple netcdf files
data_all = []

for i in range(len(files)):
    data_all.append(xr.open_dataset(files[i]))
'''


In [None]:
# Here we use "list comprehensions" to read the data fields from the GC output files.
# A "list comprehension" is a compact equivalent of a "for loop" that builds a list.
# syntax: list_B = [do_sth(item) for item in list_A]
# NOTE: each name below is rebound from a list of file paths to a list of opened datasets.
Species  = [xr.open_dataset(path) for path in Species]
StateMet = [xr.open_dataset(path) for path in StateMet]
Aerosols = [xr.open_dataset(path) for path in Aerosols]

In [None]:
# --- small helpers used only in this cell ---
def _first_step_surface(datasets, variable):
    """Pick `variable` at time index 0 and level index 0 from every dataset."""
    return [ds[variable].isel(time=0, lev=0) for ds in datasets]


def _to_mass_conc(mixing_ratios, air_densities, molar_mass):
    """Combine each pair as ratio * density * molar_mass / (6.022 * 1e11)."""
    # 6.022e11 is Avogadro's number (6.022e23) divided by 1e12; presumably this
    # folds in the g -> ug and cm-3 -> m-3 factors, assuming Met_AIRNUMDEN is in
    # molec/cm3 -- TODO confirm against the GEOS-Chem diagnostics documentation
    scale = 6.022*1e11
    return [ratio*density*molar_mass/scale
            for ratio, density in zip(mixing_ratios, air_densities)]


def _file_average(fields):
    """Average a list of fields: plain sum divided by the number of fields."""
    return sum(fields)/len(fields)


# surface layer data (first time index of every file)
surface_NO2 = _first_step_surface(Species, 'SpeciesConc_NO2')
surface_SO2 = _first_step_surface(Species, 'SpeciesConc_SO2')
surface_PM  = _first_step_surface(Aerosols, 'PM25')

# convert the gases from dry mol/mol to ug/m3
# 46 and 64 are the molar masses (g/mol) used for NO2 and SO2
surface_AIRNUMDEN = _first_step_surface(StateMet, 'Met_AIRNUMDEN')
surface_NO2_mass  = _to_mass_conc(surface_NO2, surface_AIRNUMDEN, 46)
surface_SO2_mass  = _to_mass_conc(surface_SO2, surface_AIRNUMDEN, 64)

# average over the files, then restore the variable names of the gas fields
# (the name is taken from the first file's array)
model_NO2 = _file_average(surface_NO2_mass).rename(surface_NO2[0].name)
model_SO2 = _file_average(surface_SO2_mass).rename(surface_SO2[0].name)
model_PM  = _file_average(surface_PM)

In [None]:
# now re-arrange the extracted model results

# gather the three averaged fields and merge them into one object
averaged_fields = [model_NO2, model_SO2, model_PM]
model_output = xr.merge(averaged_fields)

In [None]:
# subset data over Northern China (NCP)
# label-based selection on the "lat" and "lon" coordinates
ncp_lat = slice(32, 43)
ncp_lon = slice(107.5, 120)
model_output_NCP = model_output.sel(lat=ncp_lat, lon=ncp_lon)

In [None]:
# check your data results
# NOTE(review): `display` is not imported in this file; it is presumably the
# IPython/Jupyter built-in -- confirm before running this as a plain script
display(model_output_NCP)

In [None]:
# xarray objects can also be flattened into a single pandas data frame
def xr_to_df(data):
    """Convert `data` to a pandas data frame with a fresh integer index.

    Calls ``data.to_dataframe()`` and then moves the resulting index levels
    into ordinary columns with ``reset_index``.
    """
    return data.to_dataframe().reset_index()

# build the final table in one chain:
#   1. flatten the NCP subset into a data frame
#   2. drop the unwanted "lev" column
#   3. fix the column names for easy comparison with surface data in future
model_output_NCP_df = (
    xr_to_df(model_output_NCP)
    .drop(columns=['lev'])
    .rename(columns={
        "SpeciesConc_NO2": "GEOSChem_NO2",
        "SpeciesConc_SO2": "GEOSChem_SO2",
        "PM25": "GEOSChem_PM",
    })
)

In [None]:
# check the output dataframe, in particular the "lat" and "lon" columns
# the values are consistent with the R results, but Python orders the grid row by row,
# from bottom-left to top-right
# Python is preferred over R here
print(model_output_NCP_df)