In [5]:
import os
import sys
import geopandas as gpd
import pandas as pd 


sys.path.append("../")
from config.config import BASE_PATH, PATH_TO_PATH_CONFIG_FILE
from src.utils import load_paths_from_yaml, replace_base_path

In [6]:
# Load paths from the YAML file
paths = load_paths_from_yaml(PATH_TO_PATH_CONFIG_FILE)
paths = replace_base_path(paths, BASE_PATH)

In [7]:
# read in csv data from 2006 and 2011
geostat_2006 = pd.read_csv(paths["population_layers"]["2006"]["source"], sep=";")
geostat_2011 = pd.read_csv(paths["population_layers"]["2011"]["source"])

# read in shp-file and geopackage from 2018 and 2021
geostat_2018 = gpd.read_file(paths["population_layers"]["2018"]["source"])
geostat_2021 = gpd.read_file(paths["population_layers"]["2021"]["source"])

# read in nuts data
nuts = gpd.read_file(paths["nuts_data"])


  geostat_2011 = pd.read_csv(paths["population_layers"]["2011"]["source"])


In [82]:
# intersect geostat_2021 data with nuts austria level 0 shape
nuts_austria = nuts[(nuts.LEVL_CODE == 0) & (nuts.CNTR_CODE == "AT")]
nuts_austria.set_crs(epsg="3035", allow_override=True, inplace=True)
geostat_2021_austria = geostat_2021[geostat_2021.within(nuts_austria.unary_union)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [85]:
# rename population columns into a uniform way 
geostat_2006.rename(columns={"POP_TOT": "POP_2006"}, inplace=True)
geostat_2011.rename(columns={"TOT_P": "POP_2011"}, inplace=True)
geostat_2018.rename(columns={"TOT_P_2018": "POP_2018"}, inplace=True)
geostat_2021_austria.rename(columns={"OBS_VALUE_T": "POP_2021"}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  geostat_2021_austria.rename(columns={"OBS_VALUE_T": "POP_2021"}, inplace=True)


In [86]:
# rename GRD_ID columns of data from 2006 and 2011 so that it matches the column names from 2018 and 2021

def convert_grd_id(grd_id: str):
    grd_str = grd_id.split("km")[1]
    nrd_component = grd_str.split("E")[0]
    east_component = "E" + grd_str.split("E")[1]
    final_str = f"CRS3035RES1000m{nrd_component}000{east_component}000"
    return final_str
    
geostat_2006.GRD_ID = geostat_2006.GRD_ID.apply(lambda x: convert_grd_id(x))
geostat_2011.GRD_ID = geostat_2011.GRD_ID.apply(lambda x: convert_grd_id(x))

In [87]:
# merge data from different years
geostat_pop = geostat_2021_austria.merge(geostat_2018.loc[:, ["GRD_ID", "POP_2018"]], on="GRD_ID", how="left")
geostat_pop = geostat_pop.merge(geostat_2011.loc[:, ["GRD_ID", "POP_2011"]], on="GRD_ID", how="left")
geostat_pop = geostat_pop.merge(geostat_2006.loc[:, ["GRD_ID", "POP_2006"]], on="GRD_ID", how="left")

In [88]:
# fill NaN values with 0
geostat_pop["POP_2006"] = geostat_pop["POP_2006"].fillna(0)
geostat_pop["POP_2011"] = geostat_pop["POP_2011"].fillna(0)
geostat_pop["POP_2018"] = geostat_pop["POP_2018"].fillna(0)
geostat_pop["POP_2021"] = geostat_pop["POP_2021"].fillna(0)


In [89]:
# bring columns into the right order
geostat_pop = geostat_pop.loc[:, ["GRD_ID", "POP_2006", "POP_2011", "POP_2018", "POP_2021", "geometry"]]

In [92]:
# export GeoDataFrame to shp file
geostat_pop.to_file(paths["populations_layers"]["population_all_years_vector"])