NUTS: 
- 2003: 1891 rows
- 2006: 1931 rows
- 2010: 1920 rows
- 2013: 1951 rows
- 2016: 2016 rows
- 2021: 2010 rows
- 2024: 1798 rows  
- **Total**: 13517

In [1]:
import re
import os
import gc
import json
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
import geopandas as gpd
from pathlib import Path
from osgeo import gdal, osr
import dask_geopandas as dgpd
from shapely.geometry import shape
from rasterstats import zonal_stats

BASE_DIR = Path('/Users/wenlanzhang/PycharmProjects/Mapineq/src/data-wrangling/')
DATA_DIR = Path('/Users/wenlanzhang/Downloads/PhD_UCL/Data/Oxford')

# Annual
https://code.earthengine.google.com/9df7d68e50a39bf1e5804733d0b06d0c

## NUTS

In [8]:
df2224 = pd.read_csv(DATA_DIR/'GEE/NTL/VIIRS_NUTS_All_22_24.csv')
# df2224
df2224['geo_source'].unique()

array(['NUTS2003', 'NUTS2006', 'NUTS2010', 'NUTS2013', 'NUTS2016',
       'NUTS2021', 'NUTS2024'], dtype=object)

In [9]:
df1321 = pd.read_csv(DATA_DIR/'GEE/NTL/VIIRS_NUTS_All_13_21.csv')
df1321 = df1321.drop('min', axis=1)
# df1321
df1321['geo_source'].unique()

array(['NUTS2003', 'NUTS2006', 'NUTS2010', 'NUTS2013', 'NUTS2016',
       'NUTS2021', 'NUTS2024'], dtype=object)

In [10]:
df_combined = pd.concat([df1321, df2224])
df_combined
# len(df_combined['obsTime'].unique())

Unnamed: 0,NUTS_ID,mean,stdDev,max,obsTime,geo_source
0,AT,0.584123,2.676161,363.295990,2013,NUTS2003
1,AT3,0.506815,2.715370,363.295990,2013,NUTS2003
2,AT1,0.992977,3.448908,167.207932,2013,NUTS2003
3,AT2,0.308386,1.465850,152.990356,2013,NUTS2003
4,AT34,0.833707,2.294287,43.166061,2013,NUTS2003
...,...,...,...,...,...,...
40546,SI037,0.545448,1.391197,23.812185,2024,NUTS2024
40547,SI038,0.347721,0.882950,14.631371,2024,NUTS2024
40548,SI043,0.420442,1.242523,25.155533,2024,NUTS2024
40549,SK041,0.353924,1.380300,31.594156,2024,NUTS2024


In [11]:
12*(1891+1931+1920+1951+2016+2010+1798)

162204

In [12]:
df_combined = df_combined.rename(columns={'NUTS_ID': 'geo', 'stdDev': 'std_dev'})
# Melt the dataframe to long format
df_long = pd.melt(df_combined, 
                  id_vars=['geo', 'geo_source', 'obsTime'], 
                  value_vars=['max', 'mean', 'std_dev'],
                  var_name='metric', 
                  value_name='obsValue')

cols = [col for col in df_long.columns if col != 'geo_source'] + ['geo_source']
df_long = df_long[cols]
df_long

Unnamed: 0,geo,obsTime,metric,obsValue,geo_source
0,AT,2013,max,363.295990,NUTS2003
1,AT3,2013,max,363.295990,NUTS2003
2,AT1,2013,max,167.207932,NUTS2003
3,AT2,2013,max,152.990356,NUTS2003
4,AT34,2013,max,43.166061,NUTS2003
...,...,...,...,...,...
486607,SI037,2024,std_dev,1.391197,NUTS2024
486608,SI038,2024,std_dev,0.882950,NUTS2024
486609,SI043,2024,std_dev,1.242523,NUTS2024
486610,SK041,2024,std_dev,1.380300,NUTS2024


In [13]:
df_long.to_csv(DATA_DIR/"GEE/NTL/Output/NTL_Annual_1324.csv", index=True, index_label="id")

### Test  

- geo_source (7): 'NUTS2003', 'NUTS2006', 'NUTS2010', 'NUTS2013', 'NUTS2016', 'NUTS2021', 'NUTS2024'  
- metric (3): 'max', 'mean', 'std_dev'
- year (12): 2013 - 2024

In [2]:
df_long = pd.read_csv(DATA_DIR/'GEE/NTL/Output/NTL_Annual_1324.csv')
df_long

Unnamed: 0,id,geo,obsTime,metric,obsValue,geo_source
0,0,AT,2013,max,363.295990,NUTS2003
1,1,AT3,2013,max,363.295990,NUTS2003
2,2,AT1,2013,max,167.207932,NUTS2003
3,3,AT2,2013,max,152.990356,NUTS2003
4,4,AT34,2013,max,43.166061,NUTS2003
...,...,...,...,...,...,...
486607,486607,SI037,2024,std_dev,1.391197,NUTS2024
486608,486608,SI038,2024,std_dev,0.882950,NUTS2024
486609,486609,SI043,2024,std_dev,1.242523,NUTS2024
486610,486610,SK041,2024,std_dev,1.380300,NUTS2024


In [8]:
df_long['obsTime'].nunique()

12

In [10]:
13517 * 12 * 3

486612

## ITL + EURO

# Month

In [33]:
162204 * 12

1946448

In [2]:
df1321_M = pd.read_csv(DATA_DIR/'GEE/NTL/VIIRS_NUTS_Monthly_2013_2021.csv')
df1321_M
# df1321_M['geo_source'].unique()

Unnamed: 0,geo,mean,stdDev,max,obsTime,obsMonth,geo_source
0,AT,0.986477,2.548236,182.200836,2013,1,NUTS2003
1,AT3,0.995040,3.078193,182.200836,2013,1,NUTS2003
2,AT1,1.375383,2.639027,106.621887,2013,1,NUTS2003
3,AT2,0.614355,1.306002,71.294838,2013,1,NUTS2003
4,AT34,1.249731,2.065825,37.559410,2013,1,NUTS2003
...,...,...,...,...,...,...,...
1459831,SI037,0.989436,1.863954,40.930000,2021,12,NUTS2024
1459832,SI038,0.753542,1.090943,16.030001,2021,12,NUTS2024
1459833,SI043,0.796052,1.411274,35.980000,2021,12,NUTS2024
1459834,SK041,0.947011,2.419595,83.790001,2021,12,NUTS2024


In [3]:
df2224_M = pd.read_csv(DATA_DIR/'GEE/NTL/VIIRS_NUTS_Monthly_2022_2024.csv')
df2224_M = df2224_M.rename(columns={'NUTS_ID': 'geo', 'obsYear': 'obsTime'})
df2224_M
# df2224_M['geo_source'].unique()

Unnamed: 0,geo,mean,stdDev,max,obsTime,obsMonth,geo_source
0,AT,1.250350,3.499265,434.630005,2022,1,NUTS2003
1,AT3,1.135695,2.849661,334.390015,2022,1,NUTS2003
2,AT1,1.751773,4.559688,141.559998,2022,1,NUTS2003
3,AT2,0.939442,3.054300,434.630005,2022,1,NUTS2003
4,AT34,1.379717,2.241623,70.169998,2022,1,NUTS2003
...,...,...,...,...,...,...,...
486607,SI037,1.023837,1.785201,45.340000,2024,12,NUTS2024
486608,SI038,0.856611,1.177727,19.650000,2024,12,NUTS2024
486609,SI043,0.880418,1.505450,36.509998,2024,12,NUTS2024
486610,SK041,0.756769,1.343125,34.970001,2024,12,NUTS2024


In [4]:
df_combined_M = pd.concat([df1321_M, df2224_M])
df_combined_M
# len(df_combined_M['obsTime'].unique())

Unnamed: 0,geo,mean,stdDev,max,obsTime,obsMonth,geo_source
0,AT,0.986477,2.548236,182.200836,2013,1,NUTS2003
1,AT3,0.995040,3.078193,182.200836,2013,1,NUTS2003
2,AT1,1.375383,2.639027,106.621887,2013,1,NUTS2003
3,AT2,0.614355,1.306002,71.294838,2013,1,NUTS2003
4,AT34,1.249731,2.065825,37.559410,2013,1,NUTS2003
...,...,...,...,...,...,...,...
486607,SI037,1.023837,1.785201,45.340000,2024,12,NUTS2024
486608,SI038,0.856611,1.177727,19.650000,2024,12,NUTS2024
486609,SI043,0.880418,1.505450,36.509998,2024,12,NUTS2024
486610,SK041,0.756769,1.343125,34.970001,2024,12,NUTS2024


In [5]:
df_combined_M = df_combined_M.rename(columns={'stdDev': 'std_dev'})
# Melt the dataframe to long format
df_long_M = pd.melt(df_combined_M, 
                  id_vars=['geo', 'geo_source', 'obsTime', 'obsMonth'], 
                  value_vars=['max', 'mean', 'std_dev'],
                  var_name='metric', 
                  value_name='obsValue')

cols = [col for col in df_long_M.columns if col != 'geo_source'] + ['geo_source']
df_long_M = df_long_M[cols]
df_long_M

Unnamed: 0,geo,obsTime,obsMonth,metric,obsValue,geo_source
0,AT,2013,1,max,182.200836,NUTS2003
1,AT3,2013,1,max,182.200836,NUTS2003
2,AT1,2013,1,max,106.621887,NUTS2003
3,AT2,2013,1,max,71.294838,NUTS2003
4,AT34,2013,1,max,37.559410,NUTS2003
...,...,...,...,...,...,...
5839339,SI037,2024,12,std_dev,1.785201,NUTS2024
5839340,SI038,2024,12,std_dev,1.177727,NUTS2024
5839341,SI043,2024,12,std_dev,1.505450,NUTS2024
5839342,SK041,2024,12,std_dev,1.343125,NUTS2024


In [7]:
1946448*3

5839344

In [6]:
df_long_M.to_csv(DATA_DIR/"GEE/NTL/Output/NTL_Monthly_1324.csv", index=True, index_label="id")

# Export with geometry to Check in QGIS

In [3]:
# # Drop the 'system:index' column
# if 'system:index' in df.columns:
#     df = df.drop(columns=['system:index'])

# # Convert .geo (GeoJSON) to WKT
# def geojson_to_wkt(geo_str):
#     try:
#         geom = shape(json.loads(geo_str))
#         return geom.wkt
#     except Exception as e:
#         print(f"Error parsing geometry: {e}")
#         return None

# df['WKT'] = df['.geo'].apply(geojson_to_wkt)

# # Drop the original .geo column
# df = df.drop(columns=['.geo'])
# df

Unnamed: 0,NUTS_ID,geo_source,max,mean,stdDev,year,WKT
0,AT124,NUTS2006,39.095600,0.485493,1.325237,2020,"POLYGON ((15.5423568687789 48.90795874625594, ..."
1,AT125,NUTS2006,52.678535,0.746196,1.576954,2020,POLYGON ((15.753875401943136 48.85239782145810...
2,AT313,NUTS2006,25.432070,0.314568,0.894521,2020,POLYGON ((13.839701890307119 48.77145300222119...
3,AT126,NUTS2006,68.902916,1.495743,2.753673,2020,POLYGON ((15.731365711283962 48.38419992107961...
4,AT311,NUTS2006,62.645573,0.509443,1.775113,2020,POLYGON ((13.727576738328024 48.51302851665202...
...,...,...,...,...,...,...,...
7639,UKC12,NUTS2003,111.081902,9.822134,15.013664,2021,POLYGON ((-0.7936467003412386 54.5584462763246...
7640,UKC11,NUTS2003,92.253082,10.010130,13.173835,2021,MULTIPOLYGON (((-1.2347878213391148 54.5103682...
7641,UKN03,NUTS2003,46.515072,1.200607,2.830120,2021,MULTIPOLYGON (((-5.641572728675446 54.41375189...
7642,UKE12,NUTS2003,280.591919,1.421410,5.261583,2021,POLYGON ((-1.048596355843249 53.65608239478745...


In [4]:
# df_test = df[df['LEVL_CODE'] == 2]
# df_test.to_csv("/Users/wenlanzhang/Downloads/formatted_for_qgis.csv", index=True, index_label="id")

In [None]:
# # Drop the specified columns
# df = df.drop(columns=['WKT'])

In [5]:
df

Unnamed: 0,geo_source,max,mean,obsTime,stdDev
0,NUTS2003,371.416779,0.700943,2019,3.080153
1,NUTS2003,371.416779,0.562128,2019,2.892622
2,NUTS2003,174.950760,1.227383,2019,4.131787
3,NUTS2003,177.478455,0.398776,2019,1.858099
4,NUTS2003,46.541229,0.807894,2019,2.226023
...,...,...,...,...,...
40546,NUTS2024,25.373180,0.554214,2021,1.470890
40547,NUTS2024,15.038698,0.351601,2021,0.902235
40548,NUTS2024,27.257809,0.400070,2021,1.318365
40549,NUTS2024,46.032799,0.432952,2021,1.798095


In [46]:
# Define the years you're interested in
years = [2003, 2006, 2010, 2013, 2016, 2021, 2024]

# Loop over each year, read the file, and print the number of rows
for year in years:
    file_path = DATA_DIR / f"NUTS/NUTS_RG_01M_{year}_3035.geojson"
    try:
        gdf = gpd.read_file(file_path)
        print(f"{year}: {len(gdf)} rows")
    except Exception as e:
        print(f"Failed to read {file_path.name}: {e}")

2003: 1891 rows
2006: 1931 rows
2010: 1920 rows
2013: 1951 rows
2016: 2016 rows
2021: 2010 rows
2024: 1798 rows
