# Climate Data Preprocessing

In [20]:
import os
import glob

def unpack_data():

    data_dir = '../data/climate'

# Find all .tar.gz files in the data directory
    tar_files = glob.glob(os.path.join(data_dir, '*.zip'))

# Unpack and remove each .tar.gz file
    for tar_file in tar_files:
        os.system(f'tar -xf {tar_file} -C {data_dir}')
        os.remove(tar_file)

In [21]:
import cdsapi

def data_download(year):
    
    c = cdsapi.Client()
    c.retrieve(
        'insitu-gridded-observations-global-and-regional',
        {
            'origin': 'cru',
            'region': 'global',
            'variable': [
                'precipitation', 'temperature',
            ],
            'statistic': [
                'maximum', 'mean',
            ],
            'time_aggregation': 'monthly',
            'horizontal_aggregation': '0_5_x_0_5',
            'year': f'{year}',
            'version': 'v4.03',
            'format': 'zip',
        },
        f'../data/climate/climate_{year}.zip')
       
    return unpack_data()

In [23]:
start_year = 2001
end_year = 2019

for i in range(start_year, end_year + 1):
    data_download(i)

2023-06-06 14:46:31,422 INFO Welcome to the CDS
2023-06-06 14:46:31,423 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/insitu-gridded-observations-global-and-regional
2023-06-06 14:46:31,529 INFO Request is completed
2023-06-06 14:46:31,530 INFO Downloading https://download-0017.copernicus-climate.eu/cache-compute-0017/cache/data8/dataset-insitu-gridded-observations-global-and-regional-212601fb-200a-406a-8813-133972b4a848.zip to ../data/climate/climate_2001.zip (3.9M)
2023-06-06 14:46:32,048 INFO Download rate 7.5M/s   
2023-06-06 14:46:32,237 INFO Welcome to the CDS
2023-06-06 14:46:32,238 INFO Sending request to https://cds.climate.copernicus.eu/api/v2/resources/insitu-gridded-observations-global-and-regional
2023-06-06 14:46:32,321 INFO Request is queued
2023-06-06 14:46:33,351 INFO Request is running
2023-06-06 14:46:37,209 INFO Request is completed
2023-06-06 14:46:37,210 INFO Downloading https://download-0006-clone.copernicus-climate.eu/cache-compute-0

In [76]:
import xarray as xr
import pandas as pd

def climate_preprocessing(file_path):
    frame = xr.open_dataset(file_path, engine='netcdf4').to_dataframe()
    frame.reset_index(inplace=True)
    frame['month'] = frame['time'].dt.month
    frame['year'] = frame['time'].dt.year
    frame.drop('time', inplace=True, axis=1)  
    frame = frame[(frame['lat']>=32.75) & (frame['lat']<=52) & (frame['lon']>=-10.25) & (frame['lon']<=30) & (frame['month']==7)]
    return frame

In [77]:
result = climate_preprocessing(f'../data/climate/CRU_mean_temperature_mon_0.5x0.5_global_{2001}_v4.03.nc')
result.head()

Unnamed: 0,lon,lat,tas,month,year
1677485,-10.25,32.75,,7,2001
1677486,-10.25,33.25,,7,2001
1677487,-10.25,33.75,,7,2001
1677488,-10.25,34.25,,7,2001
1677489,-10.25,34.75,,7,2001


In [78]:
def merge_years(climate_var, data_dir):
    dfs = []
    for file_name in os.listdir(data_dir):
        if file_name.endswith('.nc') and climate_var in file_name:
            file_path = os.path.join(data_dir, file_name)
            df = climate_preprocessing(file_path)
            dfs.append(df)
    final_df = pd.concat(dfs, ignore_index=True)
    return final_df

In [79]:
data = merge_years('CRU_mean_temperature', '../data/climate')
data[data['year']==2001]

Unnamed: 0,lon,lat,tas,month,year
18954,-10.25,32.75,,7,2001
18955,-10.25,33.25,,7,2001
18956,-10.25,33.75,,7,2001
18957,-10.25,34.25,,7,2001
18958,-10.25,34.75,,7,2001
...,...,...,...,...,...
22108,29.75,49.75,22.600000,7,2001
22109,29.75,50.25,22.900000,7,2001
22110,29.75,50.75,22.700001,7,2001
22111,29.75,51.25,22.900000,7,2001


In [80]:
data_dir = '../data/climate'

df_mean_temperature = merge_years('CRU_mean_temperature', data_dir)
df_maximum_temperature = merge_years('CRU_maximum_temperature', data_dir)
df_total_precipitation = merge_years('CRU_total_precipitation', data_dir)

In [81]:
df_mean_temperature.head()

Unnamed: 0,lon,lat,tas,month,year
0,-10.25,32.75,,7,2007
1,-10.25,33.25,,7,2007
2,-10.25,33.75,,7,2007
3,-10.25,34.25,,7,2007
4,-10.25,34.75,,7,2007


In [82]:
df_all = df_mean_temperature.copy()
df_all.drop('tas', inplace=True, axis=1)
df_all = pd.merge(df_all, df_mean_temperature, on=['lon','lat','month','year'])
df_all = pd.merge(df_all, df_maximum_temperature, on=['lon','lat','month','year'])
df_all = pd.merge(df_all, df_total_precipitation, on=['lon','lat','month','year'])
df_all

Unnamed: 0,lon,lat,month,year,tas,tasmax,pr
0,-10.25,32.75,7,2007,,,
1,-10.25,33.25,7,2007,,,
2,-10.25,33.75,7,2007,,,
3,-10.25,34.25,7,2007,,,
4,-10.25,34.75,7,2007,,,
...,...,...,...,...,...,...,...
60016,29.75,49.75,7,2015,21.200001,27.0,51.100002
60017,29.75,50.25,7,2015,21.400000,26.9,54.900002
60018,29.75,50.75,7,2015,21.100000,26.6,57.400002
60019,29.75,51.25,7,2015,20.800001,26.4,61.000000


In [98]:
df_all['constant'] = 0.25
df_all_bis = df_all.copy()
df_all_bis['lon'] = df_all_bis['lon'] + df_all_bis['constant']
df_all_tris = df_all.copy()
df_all_tris['lat'] = df_all_tris['lat'] + df_all_tris['constant']
df_all_quatris = df_all.copy()
df_all_quatris['lon'] = df_all_quatris['lon'] + df_all_tris['constant']
df_all_quatris['lat'] = df_all_quatris['lat'] + df_all_tris['constant']

dfs = [df_all, df_all_bis, df_all_tris, df_all_quatris]

df_final = pd.concat(dfs, ignore_index=True)
df_final = df_final.drop(['constant'], axis=1)
df_final = df_final[df_final.lon != -10.25]
df_final = df_final[df_final.lat != 32.75]
df_final = df_final.sort_values(['year','lon','lat'])
df_final = df_final.reset_index()
df_final = df_final.drop(['index'], axis=1)
df_final

Unnamed: 0,lon,lat,month,year,tas,tasmax,pr
0,-10.0,33.00,7,2001,,,
1,-10.0,33.25,7,2001,,,
2,-10.0,33.50,7,2001,,,
3,-10.0,33.75,7,2001,,,
4,-10.0,34.00,7,2001,,,
...,...,...,...,...,...,...,...
235538,30.0,51.00,7,2019,19.200001,24.300001,72.099998
235539,30.0,51.25,7,2019,18.900000,24.100000,77.400002
235540,30.0,51.50,7,2019,18.900000,24.100000,77.400002
235541,30.0,51.75,7,2019,18.500000,23.900000,88.300003


In [95]:
df_final.to_csv('../data/climate.csv')