## DATA COLLECTION

* Using [Openweathermap](https://openweathermap.org/) for data collection <br>
* For their API list visit - [apis](https://openweathermap.org/api) <br>
* [API Documentation](https://openweathermap.org/api/air-pollution)
* API used - http://api.openweathermap.org/data/2.5/air_pollution/history?lat={lat}&lon={lon}&start={start}&end={end}&appid={API key}

In [1]:
import requests
import pandas as pd
import json
from datetime import datetime
import time

In [11]:
#Date format (YYYY, M,D,H,m)
#Extended the end date to 4/3/23

START_DATE = datetime(2020, 11, 25, 1, 0)
END_DATE = datetime(2023, 3, 4, 23, 0)

LAT = 19.07
LON = 72.88

#Enter your APP ID here
APP_ID = 'a82bcbb38561edff907416a47e4c15f4'

START_UNIX = int(time.mktime(START_DATE.timetuple()))
END_UNIX = int(time.mktime(END_DATE.timetuple()))

print('Start unix: {}'.format(START_UNIX))
print('End unix: {}'.format(END_UNIX))

Start unix: 1606266000
End unix: 1677970800


In [12]:
url = 'http://api.openweathermap.org/data/2.5/air_pollution/history?lat={}&lon={}&start={}&end={}&appid={}'.format(LAT,LON,START_UNIX,END_UNIX,APP_ID)

In [13]:
#Getting the response from the api
response = requests.get(url)

In [15]:
#response text
response.text

{'cod': 401, 'message': 'Invalid API key. Please see https://openweathermap.org/faq#error401 for more info.'}



{
  "coord":[
    50,
    50
  ],
  "list":[
    {
      "dt":1605182400,
      "main":{
        "aqi":1
      },
      "components":{
        "co":201.94053649902344,
        "no":0.01877197064459324,
        "no2":0.7711350917816162,
        "o3":68.66455078125,
        "so2":0.6407499313354492,
        "pm2_5":0.5,
        "pm10":0.540438711643219,
        "nh3":0.12369127571582794
      }
    }
  ]
}
                   
                
      item['main']['aqi']

In [6]:
r = json.loads(response.text)

In [7]:
#Parsing the variables and extracting the time,co,no2,o3,so2,pm2.5,pm10,nh3. Each row of data is stored as a list which is then converted to a dataframe
#Some dates have multiple observations
#Added the AQI column --
data = []

for c in r['list']:
    date = datetime.utcfromtimestamp(int(c['dt'])).strftime('%Y-%m-%d')
    data.append([date,c['main']['aqi']c['components']['co'],c['components']['no'],c['components']['no2'],c['components']['o3'],c['components']['so2'],c['components']['pm2_5'],c['components']['pm10'],c['components']['nh3']])

In [8]:
#Converting the list to a dataframe
df =pd.DataFrame(data,columns=['date','AQI','co','no','no2','o3','so2','pm2_5','pm10','nh3'])

In [9]:
df.head()

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2020-11-25,2162.93,0.0,58.95,72.96,95.37,346.73,385.34,9.37
1,2020-11-25,2403.26,0.02,78.83,58.65,123.98,371.86,412.33,12.29
2,2020-11-25,2883.91,3.21,111.04,41.84,158.31,421.5,468.55,17.48
3,2020-11-25,3738.4,15.87,138.46,47.21,188.83,503.57,561.9,28.63
4,2020-11-25,4913.33,22.35,178.22,70.81,221.25,622.25,698.88,47.62


In [10]:
#exporting dataset
df.to_csv('../../data/airQuality_1.csv', index=False)

In [13]:
df_read = pd.read_csv('../../data/airQuality_1.csv')

In [14]:
df_read.head()

Unnamed: 0,date,co,no,no2,o3,so2,pm2_5,pm10,nh3
0,2020-11-25,2162.93,0.0,58.95,72.96,95.37,346.73,385.34,9.37
1,2020-11-25,2403.26,0.02,78.83,58.65,123.98,371.86,412.33,12.29
2,2020-11-25,2883.91,3.21,111.04,41.84,158.31,421.5,468.55,17.48
3,2020-11-25,3738.4,15.87,138.46,47.21,188.83,503.57,561.9,28.63
4,2020-11-25,4913.33,22.35,178.22,70.81,221.25,622.25,698.88,47.62


In [15]:
df_read.isnull().sum()

date     0
co       0
no       0
no2      0
o3       0
so2      0
pm2_5    0
pm10     0
nh3      0
dtype: int64