In [1]:
!pip install tqdm
!pip install opencage
!pip install scikit-learn

Collecting opencage
  Downloading opencage-3.2.0-py3-none-any.whl.metadata (7.9 kB)
Collecting backoff>=2.2.1 (from opencage)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Downloading opencage-3.2.0-py3-none-any.whl (23 kB)
Downloading backoff-2.2.1-py3-none-any.whl (15 kB)
Installing collected packages: backoff, opencage
Successfully installed backoff-2.2.1 opencage-3.2.0


In [2]:
import pandas as pd
import time
from geopy.geocoders import Nominatim
from tqdm.notebook import tqdm
from opencage.geocoder import OpenCageGeocode
from google.colab import userdata
import requests
import os
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
csv_path = '/content/drive/MyDrive/ML_Dataset/AQI_2010_2023/stations_info.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,file_name,state,city,agency,station_location,start_month,start_month_num,start_year
0,AP001,Andhra Pradesh,Tirupati,APPCB,"Tirumala, Tirupati",July,7,2016
1,AP002,Andhra Pradesh,Vijayawada,APPCB,"PWD Grounds, Vijayawada",May,5,2017
2,AP003,Andhra Pradesh,Visakhapatnam,APPCB,"GVM Corporation, Visakhapatnam",July,7,2017
3,AP004,Andhra Pradesh,Rajamahendravaram,APPCB,"Anand Kala Kshetram, Rajamahendravaram",September,9,2017
4,AP005,Andhra Pradesh,Amaravati,APPCB,"Secretariat, Amaravati",November,11,2017


In [None]:
df['station_address'] = df['station_location'] + ', ' +df['state']
df.head()

Unnamed: 0,file_name,state,city,agency,station_location,start_month,start_month_num,start_year,station_address
0,AP001,Andhra Pradesh,Tirupati,APPCB,"Tirumala, Tirupati",July,7,2016,"Tirumala, Tirupati , Andhra Pradesh"
1,AP002,Andhra Pradesh,Vijayawada,APPCB,"PWD Grounds, Vijayawada",May,5,2017,"PWD Grounds, Vijayawada , Andhra Pradesh"
2,AP003,Andhra Pradesh,Visakhapatnam,APPCB,"GVM Corporation, Visakhapatnam",July,7,2017,"GVM Corporation, Visakhapatnam , Andhra Pradesh"
3,AP004,Andhra Pradesh,Rajamahendravaram,APPCB,"Anand Kala Kshetram, Rajamahendravaram",September,9,2017,"Anand Kala Kshetram, Rajamahendravaram , Andhr..."
4,AP005,Andhra Pradesh,Amaravati,APPCB,"Secretariat, Amaravati",November,11,2017,"Secretariat, Amaravati , Andhra Pradesh"


In [None]:
def compute_latitude_longitude(row):
  try:
    location = geolocator.geocode(row['station_address'])
    time.sleep(3)
    if location:
        return pd.Series({'latitude': location.latitude, 'longitude': location.longitude})
    else:
        return pd.Series({'latitude': None, 'longitude': None})
  except:
    return pd.Series({'latitude': None, 'longitude': None})

In [None]:
geolocator = Nominatim(user_agent="pollution_station_locator")
tqdm.pandas()
df[['latitude', 'longitude']] = df.progress_apply(compute_latitude_longitude, axis=1)
df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/stations_with_lat_lon.csv', index=False)

  0%|          | 0/453 [00:00<?, ?it/s]



Loading intermediate data for latitude and longitude

In [None]:
csv_path = '/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/stations_with_lat_lon.csv'
df = pd.read_csv(csv_path)
retry_df = df[df['latitude'].isna()]
retry_df.head()

Unnamed: 0,file_name,state,city,agency,station_location,start_month,start_month_num,start_year,station_address,latitude,longitude
1,AP002,Andhra Pradesh,Vijayawada,APPCB,"PWD Grounds, Vijayawada",May,5,2017,"PWD Grounds, Vijayawada , Andhra Pradesh",,
2,AP003,Andhra Pradesh,Visakhapatnam,APPCB,"GVM Corporation, Visakhapatnam",July,7,2017,"GVM Corporation, Visakhapatnam , Andhra Pradesh",,
3,AP004,Andhra Pradesh,Rajamahendravaram,APPCB,"Anand Kala Kshetram, Rajamahendravaram",September,9,2017,"Anand Kala Kshetram, Rajamahendravaram , Andhr...",,
7,AP008,Andhra Pradesh,Tirupati,APPCB,"Vaikuntapuram, Tirupati",November,11,2022,"Vaikuntapuram, Tirupati , Andhra Pradesh",,
9,AP010,Andhra Pradesh,Kadapa,APPCB,"Yerramukkapalli, Kadapa",January,1,2023,"Yerramukkapalli, Kadapa , Andhra Pradesh",,


Alternate method to get latitude and longitude

In [None]:
geocoder = OpenCageGeocode(userdata.get('OPENCAGE_API_KEY'))
def compute_latitude_longitude(row):
  try:
    query = row['station_address'] + ", India"
    result = geocoder.geocode(query)
    time.sleep(3)
    if result and len(result):
        return pd.Series({'latitude': result[0]['geometry']['lat'], 'longitude': result[0]['geometry']['lng']})
    else:
        return pd.Series({'latitude': None, 'longitude': None})
  except:
    return pd.Series({'latitude': None, 'longitude': None})

In [None]:
tqdm.pandas()
retry_df[['latitude', 'longitude']] = retry_df.progress_apply(compute_latitude_longitude, axis=1)
retry_df

  0%|          | 0/230 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  retry_df[['latitude', 'longitude']] = retry_df.progress_apply(compute_latitude_longitude, axis=1)


In [None]:
merged_df = pd.concat([df.dropna(subset=['latitude', 'longitude']), retry_df]).drop_duplicates(subset=['file_name'])
merged_df = merged_df.sort_values(by='file_name')
merged_df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/stations_with_lat_lon_final.csv', index=False)

Adding elevation information

In [None]:
def compute_elevation(row):
  try:
    latitude = row['latitude']
    longitude = row['longitude']
    url = f"https://api.open-elevation.com/api/v1/lookup?locations={latitude},{longitude}"
    response = requests.get(url).json()
    time.sleep(3)
    if response:
        return response['results'][0]['elevation']
    else:
        return None
  except:
    return None

In [None]:
tqdm.pandas()
merged_df['elevation'] = merged_df.progress_apply(compute_elevation, axis=1)
merged_df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/stations_with_lat_lon_ele.csv', index=False)

  0%|          | 0/453 [00:00<?, ?it/s]

Station_info completed <br>
Working on AQI files

In [5]:
def column_header_stats(folder_path_for_csv):
  csv_files = [f for f in os.listdir(folder_path_for_csv) if f.endswith(".csv") and not f.endswith('stations_info.csv')]
  file_headers = defaultdict(set)
  file_header_counter = {}
  for file in tqdm(csv_files):
      file_path = os.path.join(folder_path_for_csv, file)
      try:
          df = pd.read_csv(file_path, nrows=0)
          for column in df.columns:
                file_headers[column].add(file)
      except Exception as e:
          print(f"Error reading {file}: {e}")
  file_headers = {k: sorted(list(v)) for k, v in file_headers.items()}
  return pd.DataFrame([
      {"header": header, "files": files, "file_count": len(files)}
      for header, files in file_headers.items()
  ])

In [7]:
folder_path = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023/"
header_df = column_header_stats(folder_path).sort_values(by='file_count', ascending= False)
header_df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/header_presence.csv', index=False)

  0%|          | 0/453 [00:00<?, ?it/s]

Checking column summary to see if they are overlapping

In [None]:
def column_stats(df, column_name, filename):
  print(f"Stats for file: {filename}")
  if column_name not in df.columns:
    print(f"Column '{column_name}' not found in the DataFrame.")
    return
  cleaned_column = df[column_name].dropna()
  if cleaned_column.empty:
    print(f"Column '{column_name}' contains no valid numerical entries.")
    return
  column_range_max = cleaned_column.max()
  column_range_min = cleaned_column.min()
  column_avg = cleaned_column.mean()
  print(f"Column: {column_name}")
  print(f"Range (excluding nulls): {column_range_max:.2f} - {column_range_min:.2f}")
  print(f"Average (excluding nulls): {column_avg:.2f}")

In [None]:
file_paths = ['KA011.csv']
folder_path = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023/"

for path in file_paths:
  column_stats(pd.read_csv(folder_path + path), 'RH (W/mt2)', path)
  print("Stat complete")

# Stats for file: DL037.csv
# Column: RH ()
# Range (excluding nulls): 99.20 - 0.82
# Average (excluding nulls): 62.81
# Stat complete

Stats for file: KA011.csv
Column: RH (W/mt2)
Range (excluding nulls): 97.33 - 2.00
Average (excluding nulls): 54.69
Stat complete


Column rename

In [8]:
input_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023/"
output_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/renamed_column/"
os.makedirs(output_folder, exist_ok=True)
rename_map_aqi = {
    "CO (ng/m3)": "CO (mg/m3)",
    "CO (ug/m3)": "CO (mg/m3)",
    "CO (mg/Nm3)": "CO (mg/m3)",
    "SO2 ()": "SO2 (ug/m3)",
    "Ozone ()": "Ozone (ug/m3)",
    "Ozone (ppb)": "Ozone (ug/m3)",
    "NH3 ()": "NH3 (ug/m3)",
    "NH3 (ppb)": "NH3 (ug/m3)"
}
rename_map_other = {
    # Barometric Pressure (BP)
    "BP (mmHg).5": "BP (mmHg)",
    "BP (mmHg).7": "BP (mmHg)",
    "BP (mmHg).6": "BP (mmHg)",
    "BP (mmHg).4": "BP (mmHg)",
    "BP (mmHg).1": "BP (mmHg)",
    "BP (mmHg).2": "BP (mmHg)",
    "BP (mmHg).3": "BP (mmHg)",
    "BP (mg/m3)": "BP (mmHg)",
    "BP ()": "BP (mmHg)",
    "BP (W/mt2)": "BP (mmHg)",

    # Relative Humidity (RH)
    "RH ()": "RH (%)",
    "RH (W/mt2)": "RH (%)",
    "RH (degree)": "RH (%)",

    # Nitric Oxide (NO)
    "NO (mg/m3)": "NO (ug/m3)",
    "NO ()": "NO (ug/m3)",
    "NO (ppm)": "NO (ug/m3)",
    "NO (ppb)": "NO (ug/m3)",

    # NOx
    "NOx (ppb)": "NOx (ug/m3)",
    "NOx (ppm)": "NOx (ug/m3)",

    # Wind Speed (WS)
    "WS ()": "WS (m/s)",
    "WS (ug/m3)": "WS (m/s)",

    # Benzene
    "Benzene ()": "Benzene (ug/m3)",
    "Benzene (mg/m3)": "Benzene (ug/m3)",

    # Toluene
    "Toluene ()": "Toluene (ug/m3)",

    # Solar Radiation (SR)
    "SR ()": "SR (W/mt2)",
    "SR (ug/m3)": "SR (W/mt2)",

    # Rainfall (RF)
    "RF ()": "RF (mm)",
    "RF (mm).6": "RF (mm)",
    "RF (mm).3": "RF (mm)",
    "RF (mm).4": "RF (mm)",
    "RF (mm).7": "RF (mm)",
    "RF (mm).2": "RF (mm)",
    "RF (mm).1": "RF (mm)",
    "RF (mm).5": "RF (mm)",
    "RF (m/s)": "RF (mm)",

    # Ambient Temperature (AT)
    "AT ()": "AT (degree C)",
    "AT (degree)": "AT (degree C)",
    "AT (ug/m3)": "AT (degree C)",

    # Wind Direction (WD)
    "WD (deg)": "WD (degree)",
    "WD (degree C)": "WD (degree)",
    "WD ()": "WD (degree)",
}
rename_map = {**rename_map_aqi, **rename_map_other}
csv_files = [f for f in os.listdir(input_folder) if f.endswith(".csv") and not f.endswith('stations_info.csv')]
for file in tqdm(csv_files, desc="Processing CSV files"):
    input_path = os.path.join(input_folder, file)
    output_path = os.path.join(output_folder, file)
    try:
        df = pd.read_csv(input_path)
        df.rename(columns=rename_map, inplace=True)
        df.to_csv(output_path, index=False)
    except Exception as e:
        tqdm.write(f"Error processing {file}: {e}")


Processing CSV files:   0%|          | 0/453 [00:00<?, ?it/s]

In [6]:
folder_path = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/renamed_column/"
header_df = column_header_stats(folder_path).sort_values(by='file_count', ascending= False)
header_df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/header_presence_v2.csv', index=False)

  0%|          | 0/453 [00:00<?, ?it/s]

List files which are missing minimum AQI fields

In [None]:
folder_path = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/renamed_column/"
required_columns = list(set(rename_map_aqi.values())) + list(set(rename_map_other.values()))
missing_columns_report = []
missing_dict = defaultdict(list)
for file in tqdm(os.listdir(folder_path)):
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)
        try:
            df = pd.read_csv(file_path, nrows=1)
            missing = [col for col in required_columns if col not in df.columns]
            if missing:
                missing_columns_report.append({
                    "file": file,
                    "missing_columns": missing
                })
                for missing_column in missing:
                  missing_dict[missing_column].append(file)
        except Exception as e:
            print(f"Error reading {file}: {e}")
# if missing_columns_report:
#     print("\nFiles missing required AQI columns:\n")
#     for entry in missing_columns_report:
#         print(f"{entry['file']} → Missing: {', '.join(entry['missing_columns'])}")
# else:
#     print("All CSV files contain the required columns.")

  0%|          | 0/453 [00:00<?, ?it/s]


Files missing required AQI columns:

BR001.csv → Missing: PM10 (ug/m3), NH3 (ug/m3)
BR002.csv → Missing: PM10 (ug/m3), NH3 (ug/m3)
BR003.csv → Missing: PM10 (ug/m3), NH3 (ug/m3)
CG001.csv → Missing: Ozone (ug/m3), NH3 (ug/m3)
DL007.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL008.csv → Missing: SO2 (ug/m3)
DL009.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL010.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL011.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL016.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL017.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
DL018.csv → Missing: PM2.5 (ug/m3), PM10 (ug/m3), NH3 (ug/m3)
GJ001.csv → Missing: NH3 (ug/m3)
GJ006.csv → Missing: Ozone (ug/m3), NH3 (ug/m3)
GJ017.csv → Missing: Ozone (ug/m3), NH3 (ug/m3)
HR001.csv → Missing: PM10 (ug/m3), NH3 (ug/m3)
HR004.csv → Missing: NH3 (ug/m3)
HR002.csv → Missing: PM10 (ug/m3), NH3 (ug/m3)
HR003.csv → Missing: NH3 (ug/m3)
HR005.csv → Missing: SO2 (ug/m3), NH3 (ug/m3)
JH002.csv → Missing: NH3 (ug/m3)
KA005.csv → Missing: PM2.5 (ug/m3)

Remove the files from the folder as well as the station record which are missing minimum details

In [None]:
unwanted_files = list(set([missing_column['file'] for missing_column in missing_columns_report]))
lat_lon_ele_df = pd.read_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/stations_with_lat_lon_ele.csv')
lat_lon_ele_df = lat_lon_ele_df[~lat_lon_ele_df['file_name'].isin([unwanted_file.split('.')[0] for unwanted_file in unwanted_files])]
lat_lon_ele_df.to_csv('/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/station_info_unwanted_removed.csv', index=False)
print("Unwanted stations removed from station_info")
columns_to_check = list(set(rename_map_aqi.values()))
roll_avg_columns = list(set(rename_map_aqi.values()))
input_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/renamed_column/"
output_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/unwanted_removed/"
csv_files = [f for f in os.listdir(input_folder) if f not in unwanted_files]
print(len(csv_files))
for file in tqdm(csv_files, desc="Processing CSV files"):
  input_path = os.path.join(input_folder, file)
  output_path = os.path.join(output_folder, file)
  try:
    unwanted_rows_df = pd.read_csv(input_path)
    unwanted_rows_df = unwanted_rows_df.dropna(subset=columns_to_check, how='all')
    for col in roll_avg_columns:
      rolling_mean = unwanted_rows_df[col].rolling(window=5, min_periods=1).mean()
      unwanted_rows_df[col] = unwanted_rows_df[col].combine_first(rolling_mean)
    unwanted_rows_df.to_csv(output_path, index=False)
  except Exception as e:
      tqdm.write(f"Error processing {file}: {e}")
print("Unwanted rows removed")

Unwanted stations removed from station_info
395


Processing CSV files:   0%|          | 0/395 [00:00<?, ?it/s]

Unwanted rows removed


Adding AQI, main_pollutant and severity

In [None]:
def get_aqi_severity(aqi):
    if aqi <= 50:
        return "GOOD"
    elif aqi <= 100:
        return "SATISFACTORY"
    elif aqi <= 200:
        return "MODERATE"
    elif aqi <= 300:
        return "POOR"
    elif aqi <= 400:
        return "VERY POOR"
    elif aqi <= 500:
        return "SEVERE"
    else:
        return "OUT OF RANGE"

def compute_sub_index(conc, breakpoints):
    for bp in breakpoints:
        c_low, c_high, i_low, i_high = bp
        if c_low <= conc <= c_high:
            return round((i_high - i_low) / (c_high - c_low) * (conc - c_low) + i_low)
    return None  # Out of range

def calculate_aqi_row_with_severity(row):
    sub_indices = {}
    for pollutant, bps in BREAKPOINTS.items():
        conc = row.get(pollutant)
        if pd.notnull(conc):
            sub_index = compute_sub_index(conc, bps)
            if sub_index is not None:
                sub_indices[pollutant] = sub_index
    if not sub_indices:
        return pd.Series({"AQI": None, "Main Pollutant": None, "Severity": None})
    main_pollutant = max(sub_indices, key=sub_indices.get)
    aqi = sub_indices[main_pollutant]
    severity = get_aqi_severity(aqi)
    return pd.Series({"AQI": aqi, "Main Pollutant": main_pollutant, "Severity": severity})

In [None]:
BREAKPOINTS = {
    "PM2.5 (ug/m3)": [
        (0, 30, 0, 50), (31, 60, 51, 100), (61, 90, 101, 200),
        (91, 120, 201, 300), (121, 250, 301, 400), (251, 350, 401, 500)
    ],
    "PM10 (ug/m3)": [
        (0, 50, 0, 50), (51, 100, 51, 100), (101, 250, 101, 200),
        (251, 350, 201, 300), (351, 430, 301, 400), (431, 500, 401, 500)
    ],
    "NO2 (ug/m3)": [
        (0, 40, 0, 50), (41, 80, 51, 100), (81, 180, 101, 200),
        (181, 280, 201, 300), (281, 400, 301, 400), (401, 500, 401, 500)
    ],
    "SO2 (ug/m3)": [
        (0, 40, 0, 50), (41, 80, 51, 100), (81, 380, 101, 200),
        (381, 800, 201, 300), (801, 1600, 301, 400), (1601, 2000, 401, 500)
    ],
    "CO (mg/m3)": [
        (0.0, 1.0, 0, 50), (1.1, 2.0, 51, 100), (2.1, 10.0, 101, 200),
        (10.1, 17.0, 201, 300), (17.1, 34.0, 301, 400), (34.1, 50.0, 401, 500)
    ],
    "Ozone (ug/m3)": [
        (0, 50, 0, 50), (51, 100, 51, 100), (101, 168, 101, 200),
        (169, 208, 201, 300), (209, 748, 301, 400), (749, 1000, 401, 500)
    ],
    "NH3 (ug/m3)": [
        (0, 200, 0, 50), (201, 400, 51, 100), (401, 800, 101, 200),
        (801, 1200, 201, 300), (1201, 1800, 301, 400), (1801, 2400, 401, 500)
    ]
}


input_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/unwanted_removed/"
output_folder = "/content/drive/MyDrive/ML_Dataset/AQI_2010_2023_updated/aqi_added/"
print(len(csv_files))
for file in tqdm(csv_files, desc="Processing CSV files"):
    input_path = os.path.join(input_folder, file)
    output_path = os.path.join(output_folder, file)
    try:
        initial_df = pd.read_csv(input_path)
        aqi_df = initial_df.apply(calculate_aqi_row_with_severity, axis=1)
        df_with_aqi = pd.concat([initial_df, aqi_df], axis=1)
        df_with_aqi.to_csv(output_path, index=False)
    except Exception as e:
        tqdm.write(f"Error processing {file}: {e}")
print("AQI Calculated")

395


Processing CSV files:   0%|          | 0/395 [00:00<?, ?it/s]

AQI Calculated


Combine station_info and AQI data