# Импорт библиотек

In [2]:
import pandas as pd
import numpy as np

import requests
from datetime import datetime
import time
import pytz

# Составление датасета

## Константы

In [3]:
POLLUTANTS = ['pm25', 'pm10', 'so2', 'no2', 'co', 'no']
DATE_FROM = datetime(2020, 1, 1)
DATE_TO = datetime(2023, 1, 1)
url_measurments = "https://api.openaq.org/v2/measurements"
url_cities = "https://api.openaq.org/v2/cities"
url_locations = "https://api.openaq.org/v2/locations" 
url_countries = "https://api.openaq.org/v3/countries"
date_range = pd.date_range(start=DATE_FROM, end=DATE_TO, freq='ME')

with open('apikey', "r") as keyfile:
    HEADERS = {}
    HEADERS['x-api-key'] = keyfile.readline().strip()

## Получим список всех стран через api предложенного сайта, которые соответствуют необходимым фильтрам:
Минимальная и максимальная даты измерений находятся в диапазоне 3х лет между 1 января 2020 и 1 января 2023 </br>
Есть много загрязнителей (все 6, которые я указывал в константах)

In [4]:
all_countries = []

params = {
    'limit': 200,
}

response = requests.get(url_countries, headers=HEADERS, params=params)

if response.status_code == 429:
    print("Превышено количество запросов. Ожидание 5 секунд...")
    time.sleep(5)

if response.status_code != 200:
    print(f"Ошибка: {response.status_code}, {response.text}")

data = response.json()
results = data.get('results', [])

if not results:
    print(f"Нет данных")

def parse_datetime(date_string):
    try:
        return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%SZ')


for elem in results:
    
    first = parse_datetime(elem['datetimeFirst'])
    last = parse_datetime(elem['datetimeLast'])
    parameters = [param['name'] for param in elem["parameters"]]

    if first <= DATE_FROM and last >= DATE_TO and \
        all(pollutant in parameters for pollutant in POLLUTANTS):
        all_countries.append((elem['name'], elem['id']))
print(all_countries)

[('Argentina', 6), ('Israel', 11), ('Palestine', 12), ('France', 22), ('South Africa', 37), ('Brazil', 45), ('Czech Republic', 49), ('Germany', 50), ('Sweden', 54), ('Luxembourg', 58), ('North Macedonia', 62), ('Spain', 67), ('Denmark', 71), ('Slovakia', 76), ('Poland', 77), ('United Kingdom', 79), ('Austria', 89), ('Italy', 91), ('Switzerland', 92), ('Netherlands', 94), ('Croatia', 103), ('Andorra', 129), ('United States', 155), ('Canada', 156), ('Mexico', 157), ('Malta', 223)]


In [4]:
all_cities = []
for country in all_countries:
    params = {
        'country_id': country[1],
        'limit': 200
    }

    response = requests.get(url_cities, headers=HEADERS, params=params)

    if response.status_code == 429:
        print("Превышено количество запросов. Ожидание 5 секунд...")
        time.sleep(5)

    if response.status_code != 200:
        print(f"Ошибка: {response.status_code}, {response.text}")

    data = response.json()
    results = data.get('results', [])
    print(results)
    if not results:
        print(f"Нет данных для {country}")

    if elem['firstUpdated'] is not None and elem['lastUpdated'] is not None:
            first = datetime.fromisoformat(elem['firstUpdated'])
            last = datetime.fromisoformat(elem['lastUpdated'])
            parameters = elem['parameters']
            
            DATE_FROM = DATE_FROM.replace(tzinfo=pytz.UTC)
            DATE_TO = DATE_TO.replace(tzinfo=pytz.UTC)
            
            if first <= DATE_FROM and last >= DATE_TO and \
                all(pollutant in parameters for pollutant in POLLUTANTS):
                all_cities.append(elem['city'])
print(all_cities)

[{'country': 'DK', 'city': ' ', 'count': 23700, 'locations': 1, 'firstUpdated': '2017-11-09 23:00:00+00', 'lastUpdated': '2024-04-05 08:00:00+00', 'parameters': ['no2']}, {'country': 'JP', 'city': ' ', 'count': 56225225, 'locations': 1596, 'firstUpdated': '2023-07-14 17:00:00+00', 'lastUpdated': '2024-10-03 04:00:00+00', 'parameters': ['co', 'no', 'no2', 'nox', 'pm25', 'so2']}, {'country': 'KR', 'city': ' ', 'count': 17308868, 'locations': 712, 'firstUpdated': '2024-03-19 23:00:00+00', 'lastUpdated': '2024-10-03 04:00:00+00', 'parameters': ['co', 'no2', 'o3', 'pm10', 'pm25', 'so2']}, {'country': 'TT', 'city': ' ', 'count': 4231, 'locations': 1, 'firstUpdated': '2024-03-20 01:00:00+00', 'lastUpdated': '2024-07-08 13:00:00+00', 'parameters': ['co', 'pm10', 'pm25']}, {'country': 'IT', 'city': ' Brescia Via San Polo', 'count': 5209, 'locations': 1, 'firstUpdated': '2024-01-28 23:00:00+00', 'lastUpdated': '2024-10-02 12:00:00+00', 'parameters': ['no2', 'pm25']}, {'country': 'IT', 'city': '-

KeyError: 'firstUpdated'

## Получаем все локации, которые соответствуют тем же требованиям что и страны по 100 локаций для страны

In [5]:
all_locations = []

for country in all_countries:

    params = {
        'country_id': country[1],
        'limit': 100,
    }
    
    print(f"Запрос локаций для страны: {country}")
    
    response = requests.get(url_locations, headers=HEADERS, params=params)
    
    if response.status_code == 429:
        print("Превышено количество запросов. Ожидание 5 секунд...")
        time.sleep(5)
        continue

    if response.status_code != 200:
        print(f"Ошибка: {response.status_code}, {response.text}")
        break
    
    data = response.json()
    results = data.get('results', [])
    print(results) 
    for elem in results:
        if elem['firstUpdated'] is not None and elem['lastUpdated'] is not None:
            first = datetime.fromisoformat(elem['firstUpdated'])
            last = datetime.fromisoformat(elem['lastUpdated'])
            parameters = [param['parameter'] for param in elem["parameters"]]
            
            DATE_FROM = DATE_FROM.replace(tzinfo=pytz.UTC)
            DATE_TO = DATE_TO.replace(tzinfo=pytz.UTC)
            
            if first <= DATE_FROM and last >= DATE_TO and \
                all(pollutant in parameters for pollutant in POLLUTANTS):
                all_locations.append((elem['name'], elem['id'], elem['city']))
    
print(all_locations)

Запрос локаций для страны: ('Argentina', 6)
[{'id': 5240, 'city': 'Buenos Aires', 'name': 'LA BOCA', 'entity': None, 'country': 'AR', 'sources': None, 'isMobile': False, 'isAnalysis': None, 'parameters': [{'id': 8, 'unit': 'ppm', 'count': 24471, 'average': 0.29317637910924804, 'parameter': 'co', 'lastValue': 0.64, 'displayName': 'co ppm', 'lastUpdated': '2024-10-04T03:00:00+00:00', 'parameterId': 8, 'firstUpdated': '2017-08-10T23:00:00+00:00', 'manufacturers': None}, {'id': 7, 'unit': 'ppm', 'count': 24495, 'average': 0.017595304880680222, 'parameter': 'no2', 'lastValue': 0.032, 'displayName': 'no2 ppm', 'lastUpdated': '2024-10-04T03:00:00+00:00', 'parameterId': 7, 'firstUpdated': '2017-08-10T23:00:00+00:00', 'manufacturers': None}, {'id': 1, 'unit': 'µg/m³', 'count': 24482, 'average': 26.477806563039724, 'parameter': 'pm10', 'lastValue': 0.0, 'displayName': 'pm10 µg/m³', 'lastUpdated': '2024-10-04T03:00:00+00:00', 'parameterId': 1, 'firstUpdated': '2017-08-10T23:00:00+00:00', 'manufac

In [6]:
locations_df = pd.DataFrame(all_locations)
locations_df.to_csv('locations.csv')

## Запрос всех результатов по локациям и по диапазону дат

In [40]:
import gc

all_results = []

i = 1
for location in all_locations:
    for month_end in date_range:
        print(i)
        i+=1
        
        start_date = month_end.replace(day=1)
        end_date = month_end

        params = {
            'parameter': POLLUTANTS,
            'date_from': start_date.strftime('%Y-%m-%dT00:00:00Z'),
            'date_to': end_date.strftime('%Y-%m-%dT23:59:59Z'),
            'page' : 1,
            'limit': 100,
            'location' : location[0],
            'location_id' : location[1]
        }
        
        while True:
            response = requests.get(url_measurments, headers=HEADERS, params=params)
            
            if response.status_code == 429:
                print("Превышено количество запросов. Ожидание 5 секунд...")
                time.sleep(5)
                continue
            
            if response.status_code != 200:
                print(f"Ошибка: {response.status_code}, {response.text}")
                break

            data = response.json()
            results = data.get('results', [])
            
            if not results:
                print(f"Нет данных для {location} в период с {start_date} по {end_date}")
                break

            all_results.extend(results)

            if 'meta' in data and 'next' in data['meta']:
                params['page'] = data['meta']['next']
            else:
                break
            
            time.sleep(2)
            
df = pd.DataFrame(all_results)
df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
Превышено количество запросов. Ожидание 5 секунд...
143
144
145
146
147


Unnamed: 0,locationId,location,parameter,value,date,unit,coordinates,country,city,isMobile,isAnalysis,entity,sensorType
0,3972,Plzen-Slovany,co,205.02200,"{'utc': '2020-01-31T23:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 49.73244857793753, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
1,3972,Plzen-Slovany,co,206.18701,"{'utc': '2020-01-31T22:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 49.73244857793753, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
2,3972,Plzen-Slovany,co,232.98001,"{'utc': '2020-01-31T21:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 49.73244857793753, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
3,3972,Plzen-Slovany,co,246.95902,"{'utc': '2020-01-31T20:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 49.73244857793753, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
4,3972,Plzen-Slovany,co,272.58703,"{'utc': '2020-01-31T19:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 49.73244857793753, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
...,...,...,...,...,...,...,...,...,...,...,...,...,...
145153,5503,MT00005,pm25,17.67200,"{'utc': '2022-12-27T06:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
145154,5503,MT00005,pm25,14.56700,"{'utc': '2022-12-27T05:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
145155,5503,MT00005,pm25,14.28500,"{'utc': '2022-12-27T04:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
145156,5503,MT00005,pm25,14.55200,"{'utc': '2022-12-27T03:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade


In [9]:
import gc

all_results_big = []

i = 1
for location in all_locations:
    for month_end in date_range:
        print(f'{i} of {len(all_locations)*len(date_range)}')
        i+=1
        
        start_date = month_end.replace(day=1)
        end_date = month_end

        params = {
            'parameter': POLLUTANTS,
            'date_from': start_date.strftime('%Y-%m-%dT00:00:00Z'),
            'date_to': end_date.strftime('%Y-%m-%dT23:59:59Z'),
            'limit': 1000,
            'location' : location[0],
            'location_id' : location[1]
        }
        
        while True:
            response = requests.get(url_measurments, headers=HEADERS, params=params)
            
            if response.status_code == 429:
                print("Превышено количество запросов. Ожидание 5 секунд...")
                time.sleep(5)
                continue
            
            if response.status_code != 200:
                print(f"Ошибка: {response.status_code}, {response.text}")
                break

            data = response.json()
            results = data.get('results', [])
            
            if not results:
                print(f"Нет данных для {location} в период с {start_date} по {end_date}")
                break

            all_results_big.extend(results)
            #print(f'added {results}')
            if 'meta' in data and 'next' in data['meta']:
                params['page'] = data['meta']['next']
                print('meta')
            else:
                break
            
            del result
            gc.collect()
            time.sleep(5)
            
df_big = pd.DataFrame(all_results_big)
df_big

1 of 1620
2 of 1620
3 of 1620
4 of 1620
5 of 1620
6 of 1620
7 of 1620
8 of 1620
9 of 1620
10 of 1620
11 of 1620
12 of 1620
13 of 1620
14 of 1620
15 of 1620
16 of 1620
17 of 1620
18 of 1620
19 of 1620
20 of 1620
21 of 1620
22 of 1620
23 of 1620
24 of 1620
25 of 1620
26 of 1620
27 of 1620
28 of 1620
29 of 1620
30 of 1620
31 of 1620
32 of 1620
33 of 1620
34 of 1620
35 of 1620
36 of 1620
37 of 1620
38 of 1620
39 of 1620
40 of 1620
41 of 1620
42 of 1620
43 of 1620
44 of 1620
45 of 1620
46 of 1620
47 of 1620
48 of 1620
49 of 1620
50 of 1620
51 of 1620
52 of 1620
53 of 1620
54 of 1620
55 of 1620
56 of 1620
57 of 1620
58 of 1620
59 of 1620
60 of 1620
61 of 1620
62 of 1620
63 of 1620
64 of 1620
65 of 1620
66 of 1620
67 of 1620
68 of 1620
69 of 1620
70 of 1620
71 of 1620
72 of 1620
73 of 1620
74 of 1620
75 of 1620
76 of 1620
77 of 1620
78 of 1620
79 of 1620
80 of 1620
81 of 1620
82 of 1620
83 of 1620
84 of 1620
85 of 1620
86 of 1620
87 of 1620
88 of 1620
89 of 1620
90 of 1620
91 of 1620
92 of 16

Unnamed: 0,locationId,location,parameter,value,date,unit,coordinates,country,city,isMobile,isAnalysis,entity,sensorType
0,4369,Praha 4-Libus,so2,2.900,"{'utc': '2020-01-29T03:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 50.00730514514777, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
1,4369,Praha 4-Libus,so2,2.900,"{'utc': '2020-01-29T02:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 50.00730514514777, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
2,4369,Praha 4-Libus,so2,3.200,"{'utc': '2020-01-29T01:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 50.00730514514777, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
3,4369,Praha 4-Libus,so2,2.900,"{'utc': '2020-01-22T22:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 50.00730514514777, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
4,4369,Praha 4-Libus,so2,3.700,"{'utc': '2020-01-22T21:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 50.00730514514777, 'longitude': 1...",CZ,,False,,Governmental Organization,reference grade
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187780,5503,MT00005,pm10,22.949,"{'utc': '2022-12-17T15:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
1187781,5503,MT00005,pm10,21.751,"{'utc': '2022-12-17T14:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
1187782,5503,MT00005,pm10,24.408,"{'utc': '2022-12-17T13:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade
1187783,5503,MT00005,pm10,28.030,"{'utc': '2022-12-17T12:00:00+00:00', 'local': ...",µg/m³,"{'latitude': 35.8958358794896, 'longitude': 14...",MT,,False,,Governmental Organization,reference grade


In [12]:
df_big.to_csv('openaq_measurements_2.csv')