In [2]:
import json
import io

import pandas as pd
import great_expectations as ge

from datetime import datetime, timedelta
from haversine import haversine
from typing import Collection
# from ml_part import data_workflow


import os

# Folder holding the CSV snapshots this notebook writes and re-reads.
# The historical Windows location remains the default, so existing runs are
# unaffected; set the WEATHER_DATA_DIR environment variable to relocate the
# data without editing the notebook.
DATA_DIR = (
    os.environ.get("WEATHER_DATA_DIR")
    or "D:/Notebooks/Diplom/weather_web_app/data/"
)
# Later cells build file paths by plain concatenation (DATA_DIR + name),
# so the value has to end with a path separator.
if not DATA_DIR.endswith(("/", "\\")):
    DATA_DIR += "/"

# Loading the data for testing

In [2]:
# NOTE(review): `data_workflow` is only referenced by the commented-out import
# in the first cell; it has to be importable for this cell to run on a fresh
# kernel.
# Query point for Brest -- presumably (latitude, longitude); TODO confirm.
brest_coords = (52.09, 23.68)

# Stage 1: raw station records. The collector exposes them as CSV text, which
# is parsed here; the final row is dropped before indexing by DATE.
data_collection = data_workflow.DataCollection(brest_coords)
brest_dataset = pd.read_csv(io.StringIO(data_collection._dataset)).iloc[:-1]
brest_dataset = brest_dataset.set_index('DATE')
brest_dataset.to_csv(f"{DATA_DIR}brest_init_df.csv")

# Stage 2: cleaned (gap-filled) data, again without its final row.
data_cleaning = data_workflow.DataCleaning(brest_dataset)
brest_df = data_cleaning.get_cleaned_data().iloc[:-1]
brest_df.to_csv(f"{DATA_DIR}brest_filled_df.csv")

# Stage 3: combined dataset for the same coordinates, final row dropped too.
latest_data = data_workflow.LatestData(brest_coords)
brest_comb_df = latest_data.combined_dataset.iloc[:-1]
brest_comb_df.to_csv(f"{DATA_DIR}brest_comb_df.csv")

The nearest city is Brest -> 1.685 km
********** Stations **********
1. BOM00033008 -> 2.976 km
2. BOM00033001 -> 36.849 km
3. PLM00012497 -> 60.889 km
The nearest station is 'BOM00033008'
Total missing values: 8952
Total missing dates: 1717
Filling them using the other stations: BOM00033001, PLM00012497

Station 'BOM00033001'
Left missing values -> 8950
Left missing dates -> 1717

Station 'PLM00012497'
Left missing values -> 1465
Left missing dates -> 1712

There were left 1465 missing values and 1712 missing dates. Now they will be filled data from ERA5 dataset

In [3]:
# Preview the first two rows of the combined dataset built in the loading cell.
brest_comb_df.head(2)

Unnamed: 0_level_0,precip_sum,temp_min,temp_max,100-500_north_precip_sum,100-500_north_temp_min,100-500_north_temp_max,100-500_south_precip_sum,100-500_south_temp_min,100-500_south_temp_max,100-500_west_precip_sum,...,500-1000_north_temp_max,500-1000_south_precip_sum,500-1000_south_temp_min,500-1000_south_temp_max,500-1000_west_precip_sum,500-1000_west_temp_min,500-1000_west_temp_max,500-1000_east_precip_sum,500-1000_east_temp_min,500-1000_east_temp_max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,0.3,-6.2,-0.5,0.0,-6.4,-3.2,0.0,-9.9,-5.3,0.1,...,-6.8,1.4,1.3,5.0,1.9,-0.7,3.4,0.0,-7.5,-4.6
2000-01-02,0.0,-8.1,-3.0,0.0,-9.9,-4.5,0.0,-12.4,-3.4,0.0,...,-9.5,0.0,-2.5,5.1,0.0,2.2,3.6,0.0,-8.8,-6.3


In [4]:
# Preview the first two rows of the cleaned dataset returned by get_cleaned_data().
brest_df.head(2)

Unnamed: 0_level_0,precip_sum,temp_max,temp_min
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1940-01-02,0.0,-12.6,-21.5
1940-01-03,2.3,-4.2,-18.3


# Great Expectations

## Initial Brest data

In [13]:
# Re-read the initial export from disk and wrap it in a Great Expectations
# PandasDataset so the expect_* checks can be called directly on the frame.
brest_init = pd.read_csv(f"{DATA_DIR}brest_init_df.csv")
df = ge.dataset.PandasDataset(brest_init)
df.head()

Unnamed: 0,DATE,STATION,PRCP,TMAX,TMIN
0,1944-09-01,BOM00033008,0.0,23.3,12.4
1,1944-09-02,BOM00033008,0.0,23.5,9.5
2,1944-09-03,BOM00033008,0.0,29.8,12.6
3,1944-09-04,BOM00033008,16.6,18.0,10.6
4,1944-09-05,BOM00033008,0.0,23.0,9.7


In [15]:
# Schema check: the initial export must expose exactly these columns, in this order.
df.expect_table_columns_to_match_ordered_list(
    column_list=['DATE', 'STATION', 'PRCP', 'TMAX', 'TMIN']
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      "DATE",
      "STATION",
      "PRCP",
      "TMAX",
      "TMIN"
    ]
  }
}

In [16]:
# Key check: each (DATE, STATION) combination should occur only once.
df.expect_compound_columns_to_be_unique(
    column_list = ['DATE', 'STATION']
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 63091,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [18]:
# Every DATE string must match the year-month-day pattern; `.success` reduces
# the full result object to its pass/fail flag.
df.expect_column_values_to_match_strftime_format(
    column='DATE', strftime_format='%Y-%m-%d'
).success

True

In [19]:
# Sanity bounds for daily precipitation: never negative, and below a generous
# ceiling of 2000 -- presumably chosen to sit above the largest rainfall ever
# recorded in a single day (units assumed to be millimetres; TODO confirm).
df.expect_column_values_to_be_between(
    column='PRCP', min_value=0, max_value=2000
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 63091,
    "missing_count": 8899,
    "missing_percent": 14.10502290342521,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [21]:
# TMAX must lie within [-89.2, 56.7] -- presumably the lowest and highest air
# temperatures ever recorded on Earth, in degrees Celsius (TODO confirm units).
df.expect_column_values_to_be_between(
    column='TMAX', min_value=-89.2, max_value=56.7
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 63091,
    "missing_count": 21336,
    "missing_percent": 33.817818706313105,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [22]:
# TMIN must lie within [-89.2, 56.7] -- presumably the lowest and highest air
# temperatures ever recorded on Earth, in degrees Celsius (TODO confirm units).
df.expect_column_values_to_be_between(
    column='TMIN', min_value=-89.2, max_value=56.7
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 63091,
    "missing_count": 25284,
    "missing_percent": 40.07544657716631,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

## Filled Brest data

In [23]:
# Re-read the filled export from disk and wrap it in a Great Expectations
# PandasDataset so the expect_* checks can be called directly on the frame.
brest_filled = pd.read_csv(f"{DATA_DIR}brest_filled_df.csv")
df2 = ge.dataset.PandasDataset(brest_filled)
df2.head()

Unnamed: 0,date,precip_sum,temp_max,temp_min
0,1940-01-02,0.0,-12.6,-21.5
1,1940-01-03,2.3,-4.2,-18.3
2,1940-01-04,0.2,-4.2,-9.1
3,1940-01-05,0.0,-8.2,-13.0
4,1940-01-06,0.1,-5.4,-12.8


In [26]:
# Schema check: the filled export must expose exactly these columns, in this order.
df2.expect_table_columns_to_match_ordered_list(
    column_list=['date', 'precip_sum', 'temp_max', 'temp_min']
).success

True

In [28]:
# The date column must be in increasing order; the strings are parsed as
# datetimes before they are compared.
# NOTE(review): the original remark here read "It is because I should have done
# it for structured filled data" -- presumably meaning this ordering check only
# makes sense on the filled dataset; confirm with the author.
df2.expect_column_values_to_be_increasing(
    column='date', parse_strings_as_datetimes=True
).success

True

In [30]:
# No date may appear more than once in the filled data.
df2.expect_column_values_to_be_unique(
    column='date'
).success

True

In [31]:
# After filling, precip_sum should contain no missing values.
df2.expect_column_values_to_not_be_null(
    column='precip_sum'
).success

True

In [32]:
# After filling, temp_max should contain no missing values.
df2.expect_column_values_to_not_be_null(
    column='temp_max'
).success

True

In [33]:
# After filling, temp_min should contain no missing values.
df2.expect_column_values_to_not_be_null(
    column='temp_min'
).success

True

## Combined Brest data

In [34]:
# Re-read the combined export from disk and wrap it in a Great Expectations
# PandasDataset so the expect_* checks can be called directly on the frame.
brest_combined = pd.read_csv(f"{DATA_DIR}brest_comb_df.csv")
df3 = ge.dataset.PandasDataset(brest_combined)
df3.head()

Unnamed: 0,date,precip_sum,temp_min,temp_max,nearnorthprecip_sum,nearnorthtemp_min,nearnorthtemp_max,nearsouthprecip_sum,nearsouthtemp_min,nearsouthtemp_max,...,farnorthtemp_max,farsouthprecip_sum,farsouthtemp_min,farsouthtemp_max,farwestprecip_sum,farwesttemp_min,farwesttemp_max,fareastprecip_sum,fareasttemp_min,fareasttemp_max
0,2000-01-01,0.3,-6.2,-0.5,0.0,-6.4,-3.2,0.0,-9.9,-5.3,...,-6.8,1.4,1.3,5.0,1.9,-0.7,3.4,0.0,-7.5,-4.6
1,2000-01-02,0.0,-8.1,-3.0,0.0,-9.9,-4.5,0.0,-12.4,-3.4,...,-9.5,0.0,-2.5,5.1,0.0,2.2,3.6,0.0,-8.8,-6.3
2,2000-01-03,0.7,-8.2,1.5,2.7,-5.6,0.4,1.6,-8.0,-1.5,...,-10.0,0.0,-4.6,-2.8,0.0,3.2,5.2,0.0,-10.8,-7.9
3,2000-01-04,2.4,0.6,2.0,2.1,-1.5,1.5,0.0,-2.6,0.3,...,-1.0,0.0,-4.8,-2.5,5.1,3.8,5.1,0.2,-7.6,-5.0
4,2000-01-05,4.3,0.3,2.9,0.0,-2.5,1.0,4.1,-0.3,0.6,...,0.4,0.0,-5.2,-2.6,1.0,0.2,6.4,1.6,-4.6,-0.7


In [36]:
# Column-count guard: at least 1 + 3 columns -- presumably the date column plus
# the three base features (precip_sum, temp_min, temp_max) -- and at most 100.
df3.expect_table_column_count_to_be_between(
    min_value=1+3, max_value=100
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 28
  }
}

In [40]:
# The date column must be in increasing order; the strings are parsed as
# datetimes before they are compared.
df3.expect_column_values_to_be_increasing(
    column='date', parse_strings_as_datetimes=True
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 8594,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

In [41]:
# No date may appear more than once in the combined data.
df3.expect_column_values_to_be_unique(
    column='date'
)

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 8594,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  }
}

# Django application

In [1]:
import os
import django
import pandas as pd
from datetime import datetime, timedelta

# Point Django at the project's settings module unless the environment already
# names one.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'backend_django.settings')
# Django blocks synchronous-only operations (such as ORM queries) when it
# detects a running event loop, which is the situation inside a Jupyter kernel;
# this flag disables that guard.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# Load settings and populate the app registry; this runs before the project
# import that follows, presumably because that module relies on Django being
# configured.
django.setup()

from test_app.ml_part.data_workflow import (
    NOAAService, NOAACleaning, ERA5Service
)

## NOAAService

In [2]:
# Show the data-type codes NOAAService reports as available (called on the
# class itself, no instance needed).
NOAAService.show_available_data_types()

{'TMAX': 'Maximum temperature', 'TMIN': 'Minimum temperature', 'PRCP': 'Precipitation', 'RHAV': 'Average relative humidity', 'ASTP': 'Average Station Level Pressure', 'ACSC': 'Average cloudiness sunrise to sunset from 30-secondceilometer data (percent)', 'AWDR': 'Average daily wind direction (degrees)', 'AWND': 'Average daily wind speed (tenths of meters per second)'}


In [18]:
# Demonstrate coordinate validation: -100 is outside the valid latitude range,
# so the constructor is expected to reject it with a ValueError.
# Only ValueError is caught, so that interrupts (KeyboardInterrupt/SystemExit)
# and unrelated failures still surface instead of being printed and ignored.
try:
    NOAAService(loc_coords=(-100, 90))
except ValueError as err:
    print(type(err).__name__ + ':', err.args[0])

ValueError: Latitude (the first value in 'loc_coords') must bein the range of [-90, 90]


In [3]:
# Coordinates go in as (latitude, longitude): the service's own validation
# message names the first value of 'loc_coords' as the latitude.
loc_coords = (52.08, 23.40)
# start_date is presumably the first day of the requested period -- TODO confirm.
brest_noaa_service = NOAAService(loc_coords, start_date='2020-01-01')

In [6]:
# Prints the nearest city and the station distances, then returns the station
# IDs as a list (see the output below).
brest_noaa_service.get_nearest_stations()

The nearest city is Brest -> 17.562 km
********** Stations **********
1. BOM00033008 -> 19.764 km
2. BOM00033001 -> 31.934 km
3. PLM00012497 -> 59.638 km


['BOM00033008', 'BOM00033001', 'PLM00012497']

In [3]:
# Fetch the station records for the configured location; the call prints the
# station lookup again and reports stations without data for the date range.
brest_noaa_service.get_noaa_data()

The nearest city is Brest -> 17.562 km
********** Stations **********
1. BOM00033008 -> 19.764 km
2. BOM00033001 -> 31.934 km
3. PLM00012497 -> 59.638 km
Station(-s) {'BOM00033001'}
do(-es) not have the data for this date range.


Unnamed: 0,STATION,DATE,PRCP,TMAX,TMIN
0,BOM00033008,2020-01-01,,,
1,BOM00033008,2020-01-02,,2.8,
2,BOM00033008,2020-01-03,,,
3,BOM00033008,2020-01-04,0.0,4.1,0.0
4,BOM00033008,2020-01-05,,,
...,...,...,...,...,...
2570,PLM00012497,2023-07-10,0.0,26.4,10.9
2571,PLM00012497,2023-07-11,1.0,24.9,11.9
2572,PLM00012497,2023-07-12,0.0,29.1,11.1
2573,PLM00012497,2023-07-13,0.0,26.3,


## NOAACleaning

In [4]:
# Look up a station's coordinates by its ID (called on the class itself, no
# instance needed).
NOAACleaning.get_station_coords(station_id='BOM00033008')

(52.1167, 23.6831)

In [4]:
# Build the cleaner from a fresh get_noaa_data() call; because the data is
# fetched again here, the station lookup output is printed once more.
brest_noaa_cleaning = NOAACleaning(
    brest_noaa_service.get_noaa_data()
)

The nearest city is Brest -> 17.562 km
********** Stations **********
1. BOM00033008 -> 19.764 km
2. BOM00033001 -> 31.934 km
3. PLM00012497 -> 59.638 km

Station(-s) {'BOM00033001'}
 do(-es) not have the data for this date range.


In [4]:
# Display the combined dataset; the call prints a log of how many missing
# values and days remain after each fill step.
brest_noaa_cleaning.get_combined_dataset()

The nearest station is 'BOM00033008'
Total missing values: 1833
Total missing days: 5
Filling them using the other stations: PLM00012497



Station 'PLM00012497'
Left missing values -> 297
Left missing days -> 2

There were left 297 missing values and 2 missing days. 
Now they will be filled data from ERA5 dataset

Unnamed: 0_level_0,precip_sum,temp_max,temp_min
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01,1.3,4.6,0.1
2020-01-02,0.0,2.8,
2020-01-03,0.0,3.5,-2.2
2020-01-04,0.0,4.1,0.0
2020-01-05,2.5,2.7,
...,...,...,...
2023-07-10,0.0,27.0,13.0
2023-07-11,0.0,24.7,14.0
2023-07-12,0.0,28.9,12.7
2023-07-13,0.0,26.3,18.1


In [6]:
# Keep the cleaned frame in a variable; the call prints the gap-filling log again.
noaa_cleaned_data = brest_noaa_cleaning.get_cleaned_data()

The nearest station is 'BOM00033008'
Total missing values: 1838
Total missing days: 5
Filling them using the other stations: PLM00012497



Station 'PLM00012497'
Left missing values -> 297
Left missing days -> 2

There were left 297 missing values and 2 missing days. 
Now they will be filled data from ERA5 dataset

## ERA5Service

In [2]:
# Request every data type listed in ERA5Service.era_data_types for the point
# (1, 1) and show the last ten rows of the result.
all_era_types = list(ERA5Service.era_data_types.values())
ERA5Service(loc_coords=(1, 1), data_types=all_era_types).get_era_data().tail(10)

Unnamed: 0_level_0,precip_sum,temp_max,temp_min,wind_direction,wind_speed,relat_humidity,pressure_level,cloud_cover
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-07-13,0.1,25.6,24.9,167,20,81,1013,81
2023-07-14,0.6,25.5,25.0,170,20,83,1013,49
2023-07-15,0.5,25.6,24.9,181,20,84,1014,70
2023-07-16,0.0,25.6,24.9,199,6,80,1014,70
2023-07-17,0.0,25.8,25.2,201,7,79,1014,81
2023-07-18,0.0,26.1,25.3,215,6,78,1015,60
2023-07-19,0.0,26.5,25.6,210,7,80,1015,75
2023-07-20,0.5,26.3,25.4,201,7,81,1015,66
2023-07-21,0.1,26.4,25.2,198,8,79,1015,66
2023-07-22,0.1,25.7,24.8,182,8,74,1016,89


In [2]:
# Print how the initial and ERA5 data-type names map onto the renamed columns.
ERA5Service.show_available_data_types()

Initial data type and ERA5 data type into renamed data type
PRCP and precipitation_sum are renamed to 'precip_sum'
TMAX and temperature_2m_max are renamed to 'temp_max'
TMIN and temperature_2m_min are renamed to 'temp_min'
RHAV and relativehumidity_2m are renamed to 'relat_humidity'
ASTP and pressure_msl are renamed to 'pressure_level'
ACSC and cloudcover are renamed to 'cloud_cover'
AWDR and winddirection_10m_dominant are renamed to 'wind_direction'
AWND and windspeed_10m are renamed to 'wind_speed'


# ETC

In [6]:
from datetime import datetime, timedelta
import pandas as pd

# strftime/strptime pattern for year-month-day date strings.
TIME_FORMAT = '%Y-%m-%d'

In [12]:
# Labels such as "Tue 01" for today and the following seven days (8 entries).
# The clock is read once and truncated to midnight, so both ends of the window
# come from the same instant (no mismatch if the cell runs across midnight),
# and no date -> string -> date round-trip is needed.
window_start = pd.Timestamp.today().normalize()
day_labels = pd.date_range(
    start=window_start,
    end=window_start + timedelta(days=7),
    freq='D'
).strftime('%a %d').to_list()
day_labels

['Tue 01',
 'Wed 02',
 'Thu 03',
 'Fri 04',
 'Sat 05',
 'Sun 06',
 'Mon 07',
 'Tue 08']

In [5]:
# Format 3 January of year 1 as "<weekday abbreviation> <day of month>".
datetime(1, 1, 3).strftime("%a %d")

'Wed 03'