<a href="https://colab.research.google.com/github/PrzemyslawSarnacki/AirQualityPrediction/blob/master/get_data.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a> 

In [3]:
# Install the py-openaq client; presumably it provides the `openaq` module imported below.
!pip install py-openaq



In [4]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import openaq
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / SettingWithCopy warnings.
warnings.simplefilter('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Set major seaborn aesthetics: "notebook" context, "ticks" style, default font scale
sns.set("notebook", style='ticks', font_scale=1.0)

# Increase the resolution of inline plots
mpl.rcParams['figure.dpi']= 500


In [5]:
# Create the OpenAQ client; `api` is reused by the later download cells.
api = openaq.OpenAQ()
# Fetch the city listing as a DataFrame (df=True); limit=10000 presumably caps the
# number of entries requested in one call.
resp = api.cities(df=True, limit=10000)

# Display only the cities whose country code is 'PL' (Poland)
resp.query("country == 'PL'")

Unnamed: 0,country,name,city,count,locations
151,PL,Augustów,Augustów,161132871.0,2
223,PL,Belsk,Belsk,21770.0,1
224,PL,Belsk Duży,Belsk Duży,125822709.0,2
248,PL,Biała,Biała,94279227.0,1
249,PL,Biała Podlaska,Biała Podlaska,128518539.0,1
...,...,...,...,...,...
2449,PL,Zielonka,Zielonka,158649172.0,1
2454,PL,Złoty Potok,Złoty Potok,184236679.0,1
2456,PL,Żory,Żory,63993420.0,1
2458,PL,Żyrardów,Żyrardów,64404821.0,2


In [28]:
# Cities whose measurements are downloaded and exported.
CITIES = [
    "Warszawa",
    "Kraków",
    "Poznań",
    "Katowice",
    "Białystok",
]

# Pollutant codes that become the columns of every per-city table.
PARAMETERS = [
    'pm25',
    'pm10',
    'no2',
    'so2',
    'o3',
    'co',
]

# Station names; not referenced by any other cell visible in this notebook.
LOCATIONS = [
    "Warszawa-Śródmieście",
    "Kraków-",
    "Poznań",
    "Katowice",
    "Białystok-Miejska",
    "WIOŚ Elbląg ul. Bażyńskiego",
]

In [29]:
import unicodedata

def strip_accents(text):
    """Return ``text`` lower-cased with combining accent marks removed.

    The Polish letter "ł" has no Unicode decomposition, so it is mapped to
    "l" explicitly before the text is normalised.  Used to turn city names
    such as "Białystok" into plain dictionary / column keys ("bialystok").
    """
    lowered = text.lower().replace("ł", "l")
    decomposed = unicodedata.normalize('NFKD', lowered)
    # After NFKD every accent is a separate non-spacing mark (category "Mn").
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)


In [None]:
def check_params(measurements=None):
    """Tell whether a page of measurements covers every pollutant in PARAMETERS.

    Parameters
    ----------
    measurements : pandas.DataFrame, optional
        Page of measurements with a ``parameter`` column.  When omitted, the
        notebook-level ``res`` variable is inspected, which keeps the original
        zero-argument call working.

    Returns
    -------
    bool
        True when each entry of PARAMETERS occurs at least once in the
        ``parameter`` column.
    """
    if measurements is None:
        measurements = res
    available = set(measurements["parameter"].unique())
    return all(param in available for param in PARAMETERS)


# Pages requested per city (up to 10000 rows each).  Three cities stop earlier
# than the default; presumably the API held no further data for them when the
# notebook was written -- TODO confirm.
DEFAULT_PAGES = 24
PAGE_LIMITS = {"Katowice": 15, "Poznań": 15, "Białystok": 12}

dfs = {}

for city in CITIES:
    print(city)
    pages = []
    for page in range(1, PAGE_LIMITS.get(city, DEFAULT_PAGES) + 1):
        res = api.measurements(city=city, page=page, limit=10000, df=True)
        # Evaluate once, then both log and act on the same result.
        has_all_params = check_params(res)
        print(has_all_params)
        if has_all_params:
            pages.append(res)
    # One concat per city replaces repeated DataFrame.append (removed in
    # pandas 2.0, and quadratic when called inside a loop).
    dfs[strip_accents(city)] = pd.concat(pages) if pages else pd.DataFrame()


In [None]:
# Preview the first rows of the raw measurements collected for Białystok.
dfs["bialystok"].head()

In [None]:
# Build one table of daily means per city, with one column per pollutant.
organized_dfs = {}

for city in CITIES:
    city_key = strip_accents(city)
    measurements = dfs[city_key]
    df_avg = pd.DataFrame()
    for param in PARAMETERS:
        # Select the numeric "value" column *before* resampling: averaging the
        # whole frame also tries to average its text columns, which raises in
        # pandas >= 2.0.
        values = measurements.loc[measurements["parameter"] == param, "value"]
        # NOTE(review): the first column assigned fixes df_avg's index, so days
        # missing for the first parameter are dropped for the others as well.
        df_avg[param] = values.resample('D').mean()
    # Store once per city, after all parameter columns are in place.
    organized_dfs[city_key] = df_avg
        

In [None]:
# Put the per-city tables side by side; the accent-free city key becomes the
# top level of the resulting column index.
output = pd.concat(
    [organized_dfs[strip_accents(name)] for name in CITIES],
    axis=1,
    keys=[strip_accents(name) for name in CITIES],
)

In [None]:
import os

# to_csv does not create missing directories, so make sure "data/" exists
# (it is absent on a fresh runtime) before writing the first export.
os.makedirs("data", exist_ok=True)
output.to_csv("data/airq_data_1.csv")

# Uzupełnienie danych

In [34]:
import pandas as pd

# Read the first export back from the repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/PrzemyslawSarnacki/AirQualityPrediction/master/data/airq_data_1.csv",
    index_col=0,
    parse_dates=True,
)

# Column-name suffixes "", ".1", ..., ".5": each city's six columns are looked
# up as "<city>", "<city>.1", ... (presumably the names pandas gives repeated
# header cells when only one header row is read -- TODO confirm).
indices = ["", *(f".{n}" for n in range(1, 6))]
data = {}

# Rebuild one table per city from the flat CSV columns.
for city in CITIES:
    key = strip_accents(city)
    wanted = [f"{key}{suffix}" for suffix in indices]
    table = pd.DataFrame(df, columns=wanted)
    # The first data row supplies the real column names; promote it to the
    # header and remove it from the data.
    table = table.rename(columns=table.iloc[0]).drop(table.index[0])
    # Remove the row labelled "date.local", convert to floats and linearly
    # interpolate gaps.
    table = table.drop(index="date.local").astype(float).interpolate(method="linear")
    table.index = pd.to_datetime(table.index)
    data[key] = table

In [72]:
# Inspect the first 263 daily rows for Warszawa; the rows shown in the saved output are all NaN.
data["warszawa"].head(263)

Unnamed: 0,pm25,pm10,no2,so2,o3,co
2018-11-21,,,,,,
2018-11-22,,,,,,
2018-11-23,,,,,,
2018-11-24,,,,,,
2018-11-25,,,,,,
...,...,...,...,...,...,...
2019-08-06,,,,,,
2019-08-07,,,,,,
2019-08-08,,,,,,
2019-08-09,,,,,,


In [None]:
# Replace the NaN values at the start of every city table (2018-11-21 to
# 2019-08-10) with daily means fetched per pollutant.
GAP_START = "2018-11-21"
GAP_END = "2019-08-10"
GAP_ROWS = 263  # number of leading daily rows that make up the gap

for city in CITIES:
    city_frame = data[strip_accents(city)]
    gap_index = city_frame.index[:GAP_ROWS]
    for param in PARAMETERS:
        res = api.measurements(city=city, parameter=param, date_from=GAP_START, date_to=GAP_END, limit=100000, df=True)
        # Average only the numeric "value" column; averaging the whole frame
        # raises on its text columns in pandas >= 2.0.
        daily_mean = res["value"].resample('D').mean()
        # Write through .loc on the stored frame.  The previous
        # `head(263)[param] = ...` assigned into a temporary slice, so the
        # values were not guaranteed to reach `data`.
        # NOTE(review): assumes the fetched index and the stored index use the
        # same (timezone-naive) daily timestamps -- confirm.
        city_frame.loc[gap_index, param] = daily_mean.reindex(gap_index)


In [151]:
# Join the per-city tables column-wise under their accent-free city keys and
# save the result as the second export.
output = pd.concat(
    [data[strip_accents(name)] for name in CITIES],
    axis=1,
    keys=[strip_accents(name) for name in CITIES],
)
output.to_csv("airq_data_2.csv")

# Aktualizacja nowych danych

In [6]:
import pandas as pd
from datetime import timedelta

# Redefined here so this cell does not rely on the earlier constants cell.
CITIES = ["Warszawa", "Kraków", "Poznań", "Katowice", "Białystok"]
PARAMETERS = ['pm25', 'pm10', 'no2', 'so2', 'o3', 'co']

# Per-city tables, keyed by accent-free city name.
data = {}
# Column-name suffixes "", ".1", ..., ".5" used to pick each city's six columns.
indices = ["", *(f".{n}" for n in range(1, 6))]

import unicodedata

def strip_accents(text):
    """Lower-case ``text`` and drop its combining accent marks.

    "ł" is replaced by "l" by hand because Unicode normalisation does not
    decompose it; every other accent is split off by NFKD and filtered out.
    """
    normalized = unicodedata.normalize('NFKD', text.lower().replace("ł", "l"))
    # Category "Mn" marks the non-spacing accents that NFKD separated out.
    base_chars = filter(lambda ch: unicodedata.category(ch) != 'Mn', normalized)
    return ''.join(base_chars)


# Read the second export back from the repository.
df = pd.read_csv(
    "https://raw.githubusercontent.com/PrzemyslawSarnacki/AirQualityPrediction/master/data/airq_data_2.csv",
    index_col=0,
    parse_dates=True,
)

# Rebuild one table per city from the flat CSV columns.
for city in CITIES:
    key = strip_accents(city)
    table = pd.DataFrame(df, columns=[f"{key}{suffix}" for suffix in indices])
    # The first data row supplies the real column names; promote it to the
    # header and remove it from the data.
    table = table.rename(columns=table.iloc[0]).drop(table.index[0])
    # Convert to floats and linearly interpolate gaps.
    table = table.astype(float).interpolate(method="linear")
    table.index = pd.to_datetime(table.index)
    data[key] = table

# Extend every city table with the daily means published since its last row.
for city in CITIES:
    city_key = strip_accents(city)
    # Resume on the day after the last stored row.  This does not depend on
    # the pollutant, so it is computed once per city.
    last_date = data[city_key].index[-1] + timedelta(days=1)
    last_date_str = last_date.strftime("%Y-%m-%d")
    helper_df = pd.DataFrame()
    for param in PARAMETERS:
        res = api.measurements(city=city, parameter=param, date_from=last_date_str, limit=100000, df=True)
        # Average only the numeric "value" column; averaging the whole frame
        # raises on its text columns in pandas >= 2.0.
        # NOTE(review): the first column assigned fixes helper_df's index, so
        # days missing for the first parameter are dropped for the others.
        helper_df[param] = res["value"].resample('D').mean()
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data[city_key] = pd.concat([data[city_key], helper_df])

# Join the updated per-city tables column-wise under their accent-free city
# keys and overwrite the second export.
output = pd.concat(
    [data[strip_accents(name)] for name in CITIES],
    axis=1,
    keys=[strip_accents(name) for name in CITIES],
)
output.to_csv("airq_data_2.csv")