In [1]:
import os.path
import numpy as np
import requests
import time
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import aqi
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
pio.templates.default = "plotly_white"
from plotly.subplots import make_subplots
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats.mstats import winsorize
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold

from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')

In [ ]:
# OpenAQ v2 measurements endpoint. The five positional placeholders are filled,
# in order, with: location id, date_from, date_to, page size (limit), page number.
# Adjacent string literals are joined by the parser into one URL template.
url_template = (
    "https://api.openaq.org/v2/measurements"
    "?country=IN"
    "&location_id={}"
    "&date_from={}"
    "&date_to={}"
    "&limit={}"
    "&page={}"
    "&offset=0"
    "&sort=asc"
    "&radius=1000"
    "&order_by=datetime"
)

# Ask the API for a JSON response body.
headers = {
    "accept": "application/json",
}


In [ ]:
# Page size requested from the API, and the location whose data is fetched.
limit, locationId = 1000, 407

# Initial date range: both ends start at 1 January 2023, 00:00 (naive datetime).
current_date_from = datetime(year=2023, month=1, day=1)
current_date_to = datetime(year=2023, month=1, day=1)

In [ ]:
# The download loop keeps running while the start of its window is before this date.
end_date = datetime(year=2023, month=12, day=31)

# Accumulates the raw measurement records returned by every request.
response_list = list()

In [ ]:
# Download all measurements for `locationId`, one month-long window at a time,
# walking through the result pages of each window.
#
# - A window is treated as exhausted when a page comes back with no results;
#   the window then moves forward by one month and paging restarts at 1.
# - Any non-200 response stops the download; whatever was collected so far
#   stays in `response_list`.
# - The request carries a timeout so a stalled connection raises
#   requests.exceptions.Timeout instead of blocking the run indefinitely.
#
# NOTE(review): each window ends at `next_month`, so the last window can reach
# past `end_date`, and consecutive windows share their boundary timestamp.
page = 1
while current_date_from < end_date:
    next_month = current_date_from + relativedelta(months=1)
    date_from_str = current_date_from.strftime("%Y-%m-%dT%H:%M:%SZ")
    date_to_str = next_month.strftime("%Y-%m-%dT%H:%M:%SZ")

    url = url_template.format(locationId, date_from_str, date_to_str, limit, page)
    print("url : "+url)
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        break  # Exit the loop on error

    results = response.json()['results']
    response_list.extend(results)

    # The presence of results is the only signal needed to decide whether to
    # keep paging, so the "found" / "limit" metadata is not parsed at all.
    if results:
        print("more")
        page += 1  # Ask for the next page of the same window
        time.sleep(1)  # Pause between consecutive page requests
    else:
        print("updating date - as page limit for date range is met.")
        # Move on to the next month and start again from its first page
        current_date_from = next_month
        page = 1

In [ ]:
# Pivot the flat list of measurement records into one wide DataFrame per
# locationId: one row per UTC timestamp, one column per measured parameter.
#
# Rows are accumulated in plain dictionaries keyed by timestamp and every
# DataFrame is built once at the end. This avoids rescanning the whole
# 'date_utc' column and re-concatenating the frame for each record, which made
# the work grow quadratically with the number of records.
#
# Result: `dfs` maps locationId -> DataFrame with
#   - rows in the order their timestamp was first seen,
#   - the six descriptive columns first, then parameter columns in the order
#     each parameter was first seen for that location,
#   - NaN where a timestamp has no value for a parameter,
#   - a regular 0..n-1 index.
# If the same (timestamp, parameter) pair occurs more than once, the last
# value wins. The descriptive fields are taken from the first record seen for
# a timestamp.
rows_by_location = {}     # locationId -> {date_utc: row dict}
columns_by_location = {}  # locationId -> column names in first-seen order

for entry in response_list:
    location_id = entry["locationId"]
    date_utc = entry["date"]["utc"]
    parameter = entry["parameter"]

    if location_id not in rows_by_location:
        rows_by_location[location_id] = {}
        columns_by_location[location_id] = [
            'locationId', 'date_utc', 'location', 'coordinates', 'country', 'city',
        ]
    location_rows = rows_by_location[location_id]
    location_columns = columns_by_location[location_id]

    row = location_rows.get(date_utc)
    if row is None:
        # First record for this timestamp: start a new row
        row = {
            'locationId': location_id,
            'date_utc': date_utc,
            'location': entry["location"],
            'coordinates': entry["coordinates"],
            'country': entry["country"],
            'city': entry["city"],
        }
        location_rows[date_utc] = row

    # Store (or overwrite) this parameter's value on the timestamp's row
    row[parameter] = entry["value"]
    if parameter not in location_columns:
        location_columns.append(parameter)

# Dictionary of DataFrames, one per locationId
dfs = {
    location_id: pd.DataFrame(
        list(location_rows.values()),
        columns=columns_by_location[location_id],
    )
    for location_id, location_rows in rows_by_location.items()
}



In [ ]:
# Stack the per-location frames into a single table; the original row labels
# are discarded in favour of a fresh 0..n-1 index.
df_combined = pd.concat(dfs.values(), ignore_index=True).reset_index(drop=True)

In [ ]:
# Output folder, relative to the current working directory.
target_folder = 'target'
# exist_ok=True creates the folder when it is missing and does nothing when it
# is already there, without a separate check-then-create step that could race
# with another process creating the same folder.
os.makedirs(target_folder, exist_ok=True)

# Each run writes to its own workbook, named after the local time of the run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
excel_file_path = os.path.join(target_folder, f'output_{timestamp}.xlsx')

In [ ]:
# Write this run's combined measurements to the timestamped workbook,
# leaving the DataFrame index out of the sheet.
df_combined.to_excel(excel_file_path, index=False)

# A single print call emitting the two status lines, one per line.
print('Excel File generated Successfully', 'program exit', sep='\n')

In [ ]:
# Collect the per-run export workbooks found in the target folder.
#
# Two kinds of .xlsx entries are skipped:
#   - 'air_quality.xlsx', the combined workbook that is written into this same
#     folder further down; reading it back on a later run would merge the
#     previous combined output into the new one and duplicate its rows;
#   - names starting with '~$', the temporary lock files Excel leaves next to
#     a workbook while it is open, which are not readable workbooks.
# The names are sorted because os.listdir returns entries in arbitrary order;
# sorting makes the order of the merged rows reproducible between runs.
excel_files = sorted(
    file
    for file in os.listdir(target_folder)
    if file.endswith('.xlsx')
    and file != 'air_quality.xlsx'
    and not file.startswith('~$')
)

In [ ]:
# Load every listed workbook from the target folder into its own DataFrame,
# keeping the frames in the same order as `excel_files`.
aqs = [
    pd.read_excel(os.path.join(target_folder, workbook_name))
    for workbook_name in excel_files
]

In [ ]:
# Merge the loaded workbooks row-wise into one DataFrame with a fresh 0..n-1 index.
combined_aq = pd.concat(objs=aqs, ignore_index=True)

In [ ]:
# Preview the final rows of the combined data (nothing is read from disk here);
# as the last expression of the cell, the result is what the notebook displays.
combined_aq.tail()

In [ ]:
# Write the merged table to a fixed-name workbook inside the target folder,
# leaving the DataFrame index out of the sheet.
combined_excel_path = os.path.join(target_folder, 'air_quality.xlsx')
combined_aq.to_excel(excel_writer=combined_excel_path, index=False)
print('Combined Excel File generated successfully.')