In [1]:
from datetime import date, datetime, timedelta
from pytz import timezone
import pytz
# Timezone handles used by this notebook: UTC and Belgian local time.
utc = pytz.utc
brussels = pytz.timezone('Europe/Brussels')
# Calendar date as seen in Brussels; this can differ from the machine-local
# date that date.today() would return.
today = datetime.now(tz=brussels).date()


In [2]:
import os
import sys
# Put the notebook's parent directory on the import path (only once),
# presumably so the `app` package imported in the following cells resolves.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


In [3]:
from datetime import datetime, date
import pandas as pd
import numpy as np

In [4]:
from app.utils import get_db_engine

In [5]:
from app.etl.pipeline import Pipeline
from app.etl.pipeline import Transformer
from app.models import models
from app.models.metadata import ETL_Metadata

In [12]:
# Declarative pipeline definition for the cumulative vaccinations-per-municipality
# CSV. The cell that builds the pipeline reads three keys from it:
#   pl["source"]    -> passed as the Pipeline `path`
#   pl["model"]     -> attribute name looked up on `app.models.models`
#   pl["tranforms"] -> ordered step list handed to Transformer
# Each step is {"type": <transformer method name>, "data": <payload>}; the manual
# debugging cell further down calls the same-named methods with the same payloads.
pl = {
  "source": "https://epistat.sciensano.be/data/COVID19BE_VACC_MUNI_CUM.csv",
  "model": "VaccinationsByNISCodeAndWeek",
  # Not referenced by any cell visible in this notebook — presumably consumed
  # by the project's metadata handling; confirm against app.etl.pipeline.
  "metadata_handler": {
    "frequency": "daily",
    "date_column": "date"
  },
  # NOTE(review): the key is misspelled ("tranforms"), but the cell that builds
  # the Transformer reads it under this exact spelling — rename both together.
  "tranforms": [
    {
      # No payload: the manual replay below passes None for this step.
      "type": "drop_na"
    },
    {
      # Derive a `date` column from YEAR_WEEK (values look like "20W53").
      # Presumably the suffix "1" is appended before parsing so that %w sees a
      # weekday digit (1 = Monday) — confirm in Transformer.add_column.
      "type": "add_column",
      "data": {
        "column": "date",
        "column_from": "YEAR_WEEK",
        "update": {
          "type": "date",
          "suffix": "1",
          "format": "%yW%W%w"
        }
      }
    },
    {
      # Shift the parsed date back by one week.
      # NOTE(review): %W is not the ISO week number, and the recorded output
      # shows "21W44" ending up as 2021-10-25 / week 43 while "20W53" maps to
      # 2020-12-28 / week 53 — the fixed 7-day shift may be wrong for some
      # years; verify against ISO week dates.
      "type": "update_value",
      "data": {
        "column": "date",
        "update": {
          "type": "date",
          "subtract": {
            "days": 7
          }
        }
      }
    },
    {
      # Integer `week` column taken from the (shifted) date.
      "type": "add_column",
      "data": {
        "column": "week",
        "column_from": "date",
        "update": {
          "type": "integer",
          "get": "weeknum"
        }
      }
    },
    {
      # Integer `year` column taken from the (shifted) date.
      "type": "add_column",
      "data": {
        "column": "year",
        "column_from": "date",
        "update": {
          "type": "integer",
          "get": "year"
        }
      }
    },
    {
      # The raw week label is no longer needed once date/week/year exist.
      "type": "drop_columns",
      "data": {
        "columns": [
          "YEAR_WEEK"
        ]
      }
    },
    {
      # Map the source CSV headers onto lower-case column names.
      "type": "rename_columns",
      "data": {
        "columns": {
          "NIS5": "nis_code",
          "AGEGROUP": "agegroup",
          "DOSE": "dose",
          "CUMUL": "cumul_of_week"
        }
      }
    },
    {
      # NIS5 is loaded as float64 (e.g. 11002.0); the format spec right-aligns
      # the value in a width of 5 with "0" as fill character.
      # NOTE(review): how the float is converted to text first is not visible
      # here — confirm the result has no trailing ".0".
      "type": "update_value",
      "data": {
        "column": "nis_code",
        "update": {
          "type": "string",
          "format": "{0:0>5}"
        }
      }
    },
    {
      # The source uses the literal "<10" for small counts; presumably rows
      # with that value get the integer 10 instead — confirm in
      # Transformer.update_value.
      "type": "update_value",
      "data": {
        "column": "cumul_of_week",
        "current_value": "<10",
        "value_if_true": {
            "type": "integer",
            "value": 10
        }
      }        
    },
    {
      # Final aggregation: sum over rows sharing the same key columns.
      "type": "group_by",
      "data": {
        "columns": [
          "nis_code",
          "agegroup",
          "dose",
          "date",
          "week",
          "year"
        ],
        "aggregate": {
          "type": "sum"
        }
      }
    }
  ]
}

In [13]:
# Wire the configured transform steps, source location and target model class
# into a single ETL pipeline object.
transformer = Transformer(pl["tranforms"])
pipeline = Pipeline(
    path=pl["source"],
    data_class=getattr(models, pl["model"]),
    transformer=transformer,
)

In [34]:
# Run the pipeline's extract step and keep the result as the raw frame. The
# recorded output of the next cell shows it as a DataFrame with the columns
# YEAR_WEEK, NIS5, AGEGROUP, DOSE and CUMUL.
data_frame = pipeline.extract()

In [35]:
data_frame

Unnamed: 0,YEAR_WEEK,NIS5,AGEGROUP,DOSE,CUMUL
0,20W53,11002.0,18-24,A,<10
1,20W53,11002.0,25-34,A,<10
2,20W53,11002.0,35-44,A,<10
3,20W53,11002.0,45-54,A,<10
4,20W53,11002.0,55-64,A,<10
...,...,...,...,...,...
569223,21W44,,75-84,B,821
569224,21W44,,75-84,C,118
569225,21W44,,85+,A,514
569226,21W44,,85+,B,393


In [22]:
data_frame.describe()

Unnamed: 0,NIS5
count,568106.0
mean,46601.007011
std,24274.22994
min,11001.0
25%,24059.0
50%,44081.0
75%,63072.0
max,93090.0


In [23]:
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 568106 entries, 0 to 569200
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   YEAR_WEEK  568106 non-null  object 
 1   NIS5       568106 non-null  float64
 2   AGEGROUP   568106 non-null  object 
 3   DOSE       568106 non-null  object 
 4   CUMUL      568106 non-null  object 
dtypes: float64(1), object(4)
memory usage: 26.0+ MB


In [26]:
data_frame.isnull().sum()

YEAR_WEEK    0
NIS5         0
AGEGROUP     0
DOSE         0
CUMUL        0
date         0
dtype: int64

In [36]:
# Replay the first seven configured steps by hand, one transformer method per
# step, so the intermediate result can be inspected. Each entry is
# (transformer method name, payload passed as the second argument).
manual_steps = [
    ("drop_na", None),
    ("add_column", {
        "column": "date",
        "column_from": "YEAR_WEEK",
        "update": {"type": "date", "suffix": "1", "format": "%yW%W%w"},
    }),
    ("update_value", {
        "column": "date",
        "update": {"type": "date", "subtract": {"days": 7}},
    }),
    ("add_column", {
        "column": "week",
        "column_from": "date",
        "update": {"type": "integer", "get": "weeknum"},
    }),
    ("add_column", {
        "column": "year",
        "column_from": "date",
        "update": {"type": "integer", "get": "year"},
    }),
    ("drop_columns", {"columns": ["YEAR_WEEK"]}),
    ("rename_columns", {
        "columns": {
            "NIS5": "nis_code",
            "AGEGROUP": "agegroup",
            "DOSE": "dose",
            "CUMUL": "cumul_of_week",
        },
    }),
]

# Thread the frame through the steps in order, starting from the raw extract.
df = data_frame
for method_name, payload in manual_steps:
    df = getattr(pipeline.transformer, method_name)(df, payload)
df

<function get_lambda_to_apply.<locals>.<lambda> at 0x11e6c8dc0>
...weeknum...
<function get_lambda_to_apply.<locals>.<lambda> at 0x11e6e8940>
...year...
<function get_lambda_to_apply.<locals>.<lambda> at 0x11e6e8940>


Unnamed: 0,NIS5,AGEGROUP,DOSE,CUMUL,date,week,year
0,11002.0,18-24,A,<10,2020-12-28,53,2020
1,11002.0,25-34,A,<10,2020-12-28,53,2020
2,11002.0,35-44,A,<10,2020-12-28,53,2020
3,11002.0,45-54,A,<10,2020-12-28,53,2020
4,11002.0,55-64,A,<10,2020-12-28,53,2020
...,...,...,...,...,...,...,...
569196,93090.0,75-84,B,330,2021-10-25,43,2021
569197,93090.0,75-84,C,17,2021-10-25,43,2021
569198,93090.0,85+,A,146,2021-10-25,43,2021
569199,93090.0,85+,B,146,2021-10-25,43,2021


In [10]:
# Run the pipeline's own transform on the extracted frame in a single call.
# NOTE(review): the saved execution counts in this notebook are out of order,
# so the recorded output below may not correspond to the frame shown in the
# neighbouring cells — re-run from a fresh kernel to confirm.
df = pipeline.transform(data_frame)

<function get_lambda_to_apply.<locals>.<lambda> at 0x11d7f39d0>
...weeknum...
<function get_lambda_to_apply.<locals>.<lambda> at 0x11e381c10>
...year...
<function get_lambda_to_apply.<locals>.<lambda> at 0x10ab71c10>


  data_frame = groupedby.sum().reset_index()


In [18]:
data_frame

Unnamed: 0,YEAR_WEEK,NIS5,AGEGROUP,DOSE,CUMUL,date,week,year
0,20W53,11002.0,18-24,A,<10,2020-12-28,53,2020
1,20W53,11002.0,25-34,A,<10,2020-12-28,53,2020
2,20W53,11002.0,35-44,A,<10,2020-12-28,53,2020
3,20W53,11002.0,45-54,A,<10,2020-12-28,53,2020
4,20W53,11002.0,55-64,A,<10,2020-12-28,53,2020
...,...,...,...,...,...,...,...,...
569223,21W44,,75-84,B,821,2021-10-25,43,2021
569224,21W44,,75-84,C,118,2021-10-25,43,2021
569225,21W44,,85+,A,514,2021-10-25,43,2021
569226,21W44,,85+,B,393,2021-10-25,43,2021


In [None]:
# Build one instance of the pipeline's data class per DataFrame row: each row
# is turned into a {column: value} dict and passed as keyword arguments.
# NOTE(review): the name `list` shadows the built-in for the rest of the kernel
# session; rename it together with the following cell that displays it.
list = [
    pipeline.data_class(**kwargs) for kwargs in data_frame.to_dict(orient="records")
]

In [None]:
list

In [None]:
# Pass the frame through the pipeline's metadata step and keep what it returns.
# NOTE(review): what handle_metadata does is not visible in this notebook —
# presumably it relates to the "metadata_handler" section of the config; confirm.
data_frame = pipeline.handle_metadata(data_frame)

In [None]:
data_frame

In [None]:
# Collapse rows that share the same key columns by summing the remaining ones.
# NOTE(review): these column names do not appear in the frames displayed
# earlier in this notebook — this cell looks like it targets a different
# (population) dataset; confirm before running.
population_group_keys = [
    'year',
    'nis',
    'sex',
    'nationality_code',
    'nationality_text_nl',
    'nationality_text_fr',
    'marital_status_code',
    'marital_status_text_nl',
    'marital_status_text_fr',
    'age',
]
grouped_population = data_frame.groupby(population_group_keys)
data_frame = grouped_population.sum().reset_index()
data_frame

In [None]:
# For every row, the number of rows that share its key combination.
size_check_keys = ['year', 'nis', 'sex', 'nationality_code', 'marital_status_code', 'age']
data_frame.groupby(size_check_keys)['population'].transform('size')

In [None]:

# Show only the rows whose key combination occurs more than once.
duplicate_check_keys = ['year', 'nis', 'sex', 'nationality_code', 'marital_status_code', 'age']
rows_per_key = data_frame.groupby(duplicate_check_keys)['nis'].transform('size')
data_frame[rows_per_key > 1]


In [None]:
# Sum the remaining columns over rows sharing the same year/week/district/
# sex/age-group/date key, then flatten the group index back into columns.
weekly_group_keys = ['year', 'week', 'nis_district', 'sex', 'agegroup', 'date']
grouped_weekly = data_frame.groupby(weekly_group_keys)
data_frame = grouped_weekly.sum().reset_index()
data_frame

In [None]:
# Drop every row that contains at least one missing value. The result is bound
# to the name instead of using inplace=True, so any other reference to the
# original frame (e.g. one displayed by an earlier cell) is not mutated behind
# its back.
data_frame = data_frame.dropna()

In [None]:
data_frame.isnull().sum()

In [None]:
test = 2021
type(test)

In [None]:
# First and last calendar day of the current year (machine-local clock).
# The clock is read exactly once so both bounds always belong to the same
# year, even if this cell runs across a New Year rollover.
current_year = datetime.now().year
starting_day_of_current_year = date(current_year, 1, 1)
ending_day_of_current_year = date(current_year, 12, 31)
starting_day_of_current_year

In [None]:
frequency = "daily"
# Compare by value. `is not` tests object identity, which for string literals
# depends on interpreter interning and triggers a SyntaxWarning on Python 3.8+.
frequency != "daily"