In [1]:
import requests
import re
import pandas as pd
from datetime import datetime
import pdfplumber
import certifi
import sys
import folium
from folium.plugins import HeatMap
import googlemaps
import pytz
from dotenv import load_dotenv
import os

In [2]:
# Load variables from a local .env file into the process environment so the
# Google API key can be read with os.getenv later in the notebook.
# NOTE(review): the recorded output of this cell is False — presumably no .env
# file was found or nothing was set from it; confirm the key is supplied some other way.
load_dotenv()

False

In [3]:
# Work in Indian Standard Time so the date stamp does not depend on the
# timezone of the machine running the notebook.
time_zone = pytz.timezone('Asia/Kolkata')
now_ist = datetime.now(time_zone)

# Compact YYYYMMDD stamp, reused for the bulletin URL and output file names.
date = f"{now_ist:%Y%m%d}"
date

'20250803'

In [4]:
# Wall-clock time of this run (HH:MM:SS) in the same timezone as `date`.
time = f"{datetime.now(time_zone):%H:%M:%S}"

In [5]:
# Address of the AQI bulletin PDF for `date` (YYYYMMDD) on the CPCB site.
url = f'https://cpcb.nic.in//upload/Downloads/AQI_Bulletin_{date}.pdf'

In [6]:
# Fetch the bulletin PDF and store it in the working directory.
#
# NOTE(review): verify=False disables TLS certificate verification, so the
# download is not protected against tampering. `certifi` is imported but never
# used; if the server's certificate chain validates, prefer
# verify=certifi.where(). Left unchanged here to avoid breaking the download.
response = requests.get(
    url,
    headers={"User-Agent": "Mozilla/5.0"},
    verify=False,
    # Without a timeout requests can block forever on an unresponsive server.
    timeout=30,
)

if response.status_code == 200:
    # Persist the raw bytes; later cells re-open this file with pdfplumber.
    with open(f"AQI_Bulletin_{date}.pdf", "wb") as f:
        f.write(response.content)
    print("Download successful.")
else:
    print("Failed to download:", response.status_code)



Download successful.


In [7]:
# Local path of the bulletin PDF for `date`; later cells open it with pdfplumber.
daily_doc = f'AQI_Bulletin_{date}.pdf'

In [8]:
if daily_doc:
    # Gather every table on every page except the final one, which is read
    # separately as plain text further down.
    tables = []
    with pdfplumber.open(daily_doc) as bulletin:
        data_pages = bulletin.pages[:-1]
        for pg in data_pages:
            tables += pg.extract_tables()

In [9]:
# Drop the first (header) row from each extracted table.
clean_tables = [rows[1:] for rows in tables]

In [10]:
# One DataFrame per header-less table, in the order the tables were extracted.
df_list = [pd.DataFrame(rows) for rows in clean_tables]

In [11]:
# The header row of the first extracted table supplies the column names for
# the stacked frame.
header_row = tables[0][0]

df_combined = pd.concat(df_list, ignore_index=True)
df_combined.columns = header_row

In [12]:
# The serial-number column carries no information once the tables are merged.
df = df_combined.drop('S.No', axis=1)

In [13]:
# Normalise the header text into identifier-like column names.
df.columns = (
    df.columns
    .str.replace("\n", " ")   # line breaks inside a header cell -> space
    .str.replace(" ", "_")    # spaces -> underscores
    .str.replace("/", "")     # drop slashes
)

In [14]:
# Show the normalised column names as a sanity check.
df.columns

Index(['City', 'Air_Quality', 'Index_Value', 'Prominent_Pollutant',
       'No._of_Stations_Participated_Total_Stations'],
      dtype='object')

In [15]:
# Cast the AQI column to integers so it can be sorted numerically and used as
# a heat-map weight.
df = df.astype({'Index_Value': int})

In [16]:
# Google Maps API key taken from the environment; None when the variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")

In [17]:
# Shared Google Maps client used by the geocoding helpers defined below.
gmaps = googlemaps.Client(key=API_KEY)

In [18]:
# Function to get the state of a city by (forward) geocoding its name
def get_state(city, client=None):
    """Return the first-level administrative area Google reports for ``city``.

    The query sent to the geocoder is ``"<city>, India"``. The first result's
    address components are scanned for the one typed
    ``administrative_area_level_1`` and its ``long_name`` is returned.

    Parameters
    ----------
    city : str
        City name to look up.
    client : optional
        Object exposing ``geocode(query)``; defaults to the notebook-level
        ``gmaps`` client.

    Returns
    -------
    str or None
        The area's long name, or ``None`` when the lookup fails, returns no
        results, or no matching component exists.
    """
    if client is None:
        client = gmaps

    try:
        geocode_result = client.geocode(f"{city}, India")
    except Exception as e:
        # Best effort: one failed lookup must not abort a whole-column apply,
        # but report it so quota/auth problems do not pass silently.
        print(f"State lookup failed for {city!r}: {e}")
        return None

    if not geocode_result:
        return None

    for component in geocode_result[0].get('address_components', []):
        if 'administrative_area_level_1' in component.get('types', []):
            return component.get('long_name')
    return None

# Look up the state for every city (one geocoding request per row).
df["State"] = df["City"].map(get_state)

In [19]:
# Write the day's table to CSV, highest index value first.
aqi_ranked = df.sort_values(by='Index_Value', ascending=False)
aqi_ranked.to_csv(f'daily_aqi_{date}.csv', index=False)

In [20]:
# Function to geocode addresses
def geocode_google(address, client=None):
    """Geocode ``address`` and return its coordinates as a two-element Series.

    Parameters
    ----------
    address : str
        Free-text address or place name passed to the geocoder unchanged.
    client : optional
        Object exposing ``geocode(query)``; defaults to the notebook-level
        ``gmaps`` client.

    Returns
    -------
    pandas.Series
        ``[lat, lng]`` of the first result, or ``[None, None]`` when the
        lookup fails or yields no result. The return is always a Series of
        length two, so ``Series.apply(geocode_google)`` expands into two columns.
    """
    if client is None:
        client = gmaps

    try:
        geocode_result = client.geocode(address)
    except Exception as e:
        # Best effort: keep processing the remaining rows, but report the
        # failure so quota/auth errors are not mistaken for "place not found".
        print(f"Geocoding failed for {address!r}: {e}")
        return pd.Series([None, None])

    if not geocode_result:
        return pd.Series([None, None])

    location = geocode_result[0].get('geometry', {}).get('location', {})
    return pd.Series([location.get('lat'), location.get('lng')])

In [21]:
# Apply geocoding function to the DataFrame.
# Each city name is qualified with ", India" — the same query form get_state
# uses — so ambiguous names are not resolved to places in other countries.
df[['Latitude', 'Longitude']] = df['City'].apply(
    lambda city: geocode_google(f"{city}, India")
)

In [22]:
# Tag every row with the bulletin date stamp.
df = df.assign(date=date)

In [23]:
# Second geocoding attempt, limited to the rows that still have no latitude.
# Re-running the lookup for every city would repeat all API calls and could
# replace a good result with a failure.
missing_coords = df['Latitude'].isna()
if missing_coords.any():
    retried = df.loc[missing_coords, 'City'].apply(
        lambda city: geocode_google(f"{city}, India")
    )
    # Assign by position: `retried` has columns 0/1, not Latitude/Longitude.
    df.loc[missing_coords, ['Latitude', 'Longitude']] = retried.astype(float).to_numpy()

In [24]:
# Number of cities that still lack coordinates.
int(df['Latitude'].isna().sum())

4

In [25]:
# Replace coordinates that could not be resolved with a numeric 0.0
# placeholder. Filling with the string '0' would leave text mixed in with the
# float coordinates in these columns.
# NOTE: (0, 0) is a placeholder, not a real location for these cities.
df[['Latitude', 'Longitude']] = (
    df[['Latitude', 'Longitude']].astype(float).fillna(0.0)
)

In [26]:
# Confirm that no latitude is missing any more.
int(df['Latitude'].isna().sum())

0

In [27]:
# Pull the plain text of the bulletin's final page.
with pdfplumber.open(daily_doc) as bulletin:
    final_page = bulletin.pages[-1]
    last_page_text = final_page.extract_text()

In [28]:
# Flatten the page to a single line: every line break becomes one space.
text = " ".join(last_page_text.split("\n"))

In [29]:
# Take the text between the first and second colon and split it on commas.
# NOTE(review): presumably this is the list of cities without data — the
# parsing depends on the page layout; verify against a sample bulletin.
colon_separated = text.split(":")
list_of_no_data_cities = colon_separated[1].split(",")

In [30]:
# Keep the text after the first ")" in each entry, with surrounding whitespace
# removed.
no_data_cities = [
    entry.split(")")[1].strip()
    for entry in list_of_no_data_cities
]

In [31]:
# Geocode each no-data city once and unpack both coordinates from that single
# result; looking it up separately for lat and long doubles the API calls.
# The query is qualified with ", India", matching what get_state sends.
geocoded_no_data_cities = []
for each_city in no_data_cities:
    lat, lng = geocode_google(f"{each_city}, India")
    geocoded_no_data_cities.append(
        {'city': each_city, 'lat': lat, 'long': lng}
    )

In [32]:
# Build the no-data table. Naming the columns explicitly keeps the frame
# well-formed (city/lat/long present) even when the record list is empty,
# so the column lookups below cannot raise KeyError.
df_no_data = pd.DataFrame(geocoded_no_data_cities, columns=['city', 'lat', 'long'])
df_no_data['state'] = df_no_data['city'].apply(get_state)

In [33]:
# Tag every no-data row with the bulletin date stamp.
df_no_data = df_no_data.assign(date=date)

In [34]:
# Second geocoding attempt for no-data cities that still lack coordinates.
# The retried values are written back into the frame; calling apply without
# storing its result leaves the missing rows unchanged.
missing_coords = df_no_data['lat'].isna()
if missing_coords.any():
    retried = df_no_data.loc[missing_coords, 'city'].apply(
        lambda city: geocode_google(f"{city}, India")
    )
    # Assign by position: `retried` has columns 0/1, not lat/long.
    df_no_data.loc[missing_coords, ['lat', 'long']] = retried.astype(float).to_numpy()

In [35]:
# Replace unresolved coordinates with a numeric 0.0 placeholder. Filling with
# the string '0' would leave text mixed in with the float coordinates.
# NOTE: (0, 0) is a placeholder, not a real location for these cities.
df_no_data[['lat', 'long']] = (
    df_no_data[['lat', 'long']].astype(float).fillna(0.0)
)

In [36]:
# Write the no-data cities table to a fixed file name (no date stamp), so each
# run overwrites the file from the previous run.
df_no_data.to_csv('no_data_cities.csv', index=False)

In [37]:
# Create base map
m1 = folium.Map(location=[22.9734, 78.6569], zoom_start=4.5)

# Human-readable date for the on-map label, e.g. "August 03, 2025".
map_date = pd.to_datetime(df['date'].iloc[0]).strftime("%B %d, %Y")

# Anchor point of the date label. Despite the name this is not the map's
# centre; the name is kept because the no-data map cell reuses it.
map_center = [33.9734, 78.6569]

# Only plot cities with real coordinates. Rows whose geocoding failed carry
# either NaN or the 0/0 placeholder written by the fill step (possibly as the
# string '0'); they would otherwise show up as a heat point at 0°N 0°E.
located = df.assign(
    Latitude=pd.to_numeric(df['Latitude'], errors='coerce'),
    Longitude=pd.to_numeric(df['Longitude'], errors='coerce'),
)
is_placeholder = (located['Latitude'] == 0) & (located['Longitude'] == 0)
has_coords = located['Latitude'].notna() & located['Longitude'].notna()
located = located[has_coords & ~is_placeholder]

# [lat, lng, weight] triples; .tolist() yields plain Python numbers.
heat_data = [
    [lat, lng, aqi]
    for lat, lng, aqi in zip(
        located['Latitude'].tolist(),
        located['Longitude'].tolist(),
        located['Index_Value'].tolist(),
    )
]
HeatMap(heat_data, radius=13, blur=10, max_zoom=4.5).add_to(m1)

# Static hint rendered as an HTML label on the map.
folium.map.Marker(
    [33.5, 78],  # Approximate location to place annotation (adjust as needed)
    icon=folium.DivIcon(html="""
        <div style="position: absolute; top: 10px; left: 10px; font-size: 12px; font-weight: bold; 
                    background: rgba(255, 255, 255, 0.5); padding: 2px; border-radius: 1px; 
                    border: 1px solid; white-space: nowrap; z-index:9999;">
            Zoom in at area of interest
        </div>
    """)
).add_to(m1)

# Date label.
folium.map.Marker(
    map_center,
    icon=folium.DivIcon(html=f"""
        <div style="text-align: center; font-size: 12px; min-width: 120px; font-weight: bold; 
                    background: rgba(255, 255, 255, 0.3); padding: 0px 0px; border-radius: 1px; 
                    border: 1px solid; z-index:9999;">
            {map_date}
        </div>
    """)
).add_to(m1)

# Save the dated copy of the heat map.
m1.save(f"daily_map_{date}.html")

In [38]:
# Copy the dated heat map to a fixed file name.
# The encoding is given explicitly: the saved HTML is UTF-8, and relying on
# the platform default (e.g. cp1252 on Windows) can fail on or corrupt
# non-ASCII characters.
with open(f"daily_map_{date}.html", "r", encoding="utf-8") as source_file:
    content = source_file.read()

with open("daily_map.html", "w", encoding="utf-8") as target_file:
    target_file.write(content)

In [39]:
# Create a base map centered around India
m2 = folium.Map(location=[22.9734, 78.9629], zoom_start=4.5)

# Only place markers for cities with real coordinates. Rows whose geocoding
# failed carry either NaN or the 0/0 placeholder written by the fill step
# (possibly as the string '0') and would otherwise be pinned at 0°N 0°E.
plottable = df_no_data.assign(
    lat=pd.to_numeric(df_no_data['lat'], errors='coerce'),
    long=pd.to_numeric(df_no_data['long'], errors='coerce'),
)
is_placeholder = (plottable['lat'] == 0) & (plottable['long'] == 0)
has_coords = plottable['lat'].notna() & plottable['long'].notna()
plottable = plottable[has_coords & ~is_placeholder]

# Add markers with city names as labels
for _, row in plottable.iterrows():
    folium.Marker(
        location=[row["lat"], row["long"]],
        popup=folium.Popup(row["city"], parse_html=True),
        icon=folium.Icon(color="red", icon="flag")
    ).add_to(m2)

# Static hint rendered as an HTML label on the map.
folium.map.Marker(
    [33.5, 78],  # Approximate location to place annotation (adjust as needed)
    icon=folium.DivIcon(html="""
        <div style="position: absolute; top: 10px; left: 10px; font-size: 12px; font-weight: bold; 
                    background: rgba(255, 255, 255, 0.5); padding: 2px; border-radius: 1px; 
                    border: 1px solid; white-space: nowrap; z-index:9999;">
            Zoom in at area of interest
        </div>
    """)
).add_to(m2)

# Date label; `map_center` and `map_date` come from the heat-map cell above.
folium.map.Marker(
    map_center,
    icon=folium.DivIcon(html=f"""
        <div style="text-align: center; font-size: 12px; min-width: 120px; font-weight: bold; 
                    background: rgba(255, 255, 255, 0.3); padding: 0px 0px; border-radius: 1px; 
                    border: 1px solid; z-index:9999;">
            {map_date}
        </div>
    """)
).add_to(m2)

# Save the dated copy of the no-data map.
m2.save(f"no_data_map_{date}.html")

In [40]:
# Copy the dated no-data map to a fixed file name.
# The encoding is given explicitly: the saved HTML is UTF-8 and popups may
# contain non-ASCII city names, which the platform default encoding
# (e.g. cp1252 on Windows) can fail on or corrupt.
with open(f"no_data_map_{date}.html", "r", encoding="utf-8") as source_file:
    content = source_file.read()

with open("no_data_map.html", "w", encoding="utf-8") as target_file:
    target_file.write(content)

#Make an empty list
#Save it to a csv 
#Read into the csv as df
#Start adding to the df daily
#Save it as csv again?

OR

#Once df is processed, change the name to have that day's date {date}
#Sort descending and take head(10)
#save in a different folder
#Use os to read into the folder and make a list of the files inside
#Give it a condition: if len(list) > 7, then throw out the oldest one or wait for the new ones to collect into set of 7 
#Take mean and display in a table