# Fina Project

In [128]:
import subprocess
import sys
import pandas as pd
import json
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import date

In [129]:
def install_if_missing(package):
    try:
        __import__(package)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install_if_missing("openmeteo_requests")
install_if_missing("requests_cache")
install_if_missing("retry_requests")
install_if_missing("numpy")
install_if_missing("pandas")

In [130]:
def extract_aqi_features(df, json_col='JSON'):
    """
    Extracts structured air quality and weather data from a parsed JSON column into flat DataFrame columns.
    
    Args:
        df (pd.DataFrame): DataFrame with a column containing parsed JSON dictionaries.
        json_col (str): Name of the column with JSON data (default is 'JSON').
    
    Returns:
        pd.DataFrame: Original DataFrame with new extracted columns.
    """
    
    def extract_value(x, path, default=None):
        """Safely extract a nested value using a path list."""
        try:
            for p in path:
                if isinstance(p, int):  # For list indexing
                    x = x[p]
                else:
                    x = x.get(p, default)
            return x
        except Exception:
            return default

    def extract_forecast_value(x, pollutant, date):
        """Extract forecast daily average for a specific pollutant and date."""
        try:
            forecast_list = x.get('forecast', {}).get('daily', {}).get(pollutant, [])
            for entry in forecast_list:
                if entry.get('day') == date:
                    return entry.get('avg')
        except Exception:
            return None

    # Extract common air quality fields
    df['dominant_pollutant'] = df[json_col].apply(lambda x: x.get('dominentpol'))
    
    pollutants = ['pm25', 'pm10', 'co', 'no2', 'so2', 'o3']
    for pol in pollutants:
        df[pol] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', pol, 'v']))

    # Weather fields
    df['temperature'] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', 't', 'v']))
    df['humidity'] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', 'h', 'v']))
    df['wind_speed'] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', 'w', 'v']))
    df['pressure'] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', 'p', 'v']))
    df['rain'] = df[json_col].apply(lambda x: extract_value(x, ['iaqi', 'r', 'v']))

    # City and location
    df['city_geo_lat'] = df[json_col].apply(lambda x: extract_value(x, ['city', 'geo', 0]))
    df['city_geo_lon'] = df[json_col].apply(lambda x: extract_value(x, ['city', 'geo', 1]))
    # df['city_url'] = df[json_col].apply(lambda x: extract_value(x, ['city', 'url']))
    
    # Time
    # df['aqi_time_iso'] = df[json_col].apply(lambda x: extract_value(x, ['time', 'iso']))
    # df['aqi_time_local'] = df[json_col].apply(lambda x: extract_value(x, ['time', 's']))
    # df['time_zone'] = df[json_col].apply(lambda x: extract_value(x, ['time', 'tz']))

    # Forecast (match the forecast to the date in the row's Timestamp)
    # df['forecast_date'] = pd.to_datetime(df['Timestamp']).dt.strftime('%Y-%m-%d')

    # for pol in ['pm25', 'pm10', 'o3', 'uvi']:
    #     col_name = f'forecast_{pol}_avg'
    #     df[col_name] = df.apply(lambda row: extract_forecast_value(row[json_col], pol, row['forecast_date']), axis=1)

    return df

In [131]:
# read data
df = pd.read_csv("air_data.csv", converters={'JSON': eval})
# view
display(df)

Unnamed: 0,Timestamp,City,AQI,PM2.5,JSON
0,2025-03-01 12:15:43,Bangkok,86,86,"{'aqi': 86, 'idx': 5773, 'attributions': [{'ur..."
1,2025-03-01 12:15:43,Beijing,261,261,"{'aqi': 261, 'idx': 1451, 'attributions': [{'u..."
2,2025-03-01 12:15:44,Los Angeles,28,28,"{'aqi': 28, 'idx': 243, 'attributions': [{'url..."
3,2025-03-02 04:49:40,Bangkok,78,78,"{'aqi': 78, 'idx': 5773, 'attributions': [{'ur..."
4,2025-03-02 04:49:40,Beijing,74,74,"{'aqi': 74, 'idx': 1451, 'attributions': [{'ur..."
...,...,...,...,...,...
133,2025-04-14 01:55:32,Beijing,107,107,"{'aqi': 107, 'idx': 1451, 'attributions': [{'u..."
134,2025-04-14 01:55:32,Los Angeles,51,51,"{'aqi': 51, 'idx': 243, 'attributions': [{'url..."
135,2025-04-15 01:54:25,Bangkok,79,79,"{'aqi': 79, 'idx': 5773, 'attributions': [{'ur..."
136,2025-04-15 01:54:25,Beijing,151,151,"{'aqi': 151, 'idx': 1451, 'attributions': [{'u..."


In [132]:
# call function on a new df
df2 = extract_aqi_features(df)
df2["Timestamp"] = pd.to_datetime(df2["Timestamp"], utc = True)
# round to nearest hour
df2["Timestamp"] = df2["Timestamp"].dt.round("H")
# drop the original JSON column
df2.drop(columns=['JSON'], inplace=True)
# view
display(df2)

Unnamed: 0,Timestamp,City,AQI,PM2.5,dominant_pollutant,pm25,pm10,co,no2,so2,o3,temperature,humidity,wind_speed,pressure,rain,city_geo_lat,city_geo_lon
0,2025-03-01 12:00:00+00:00,Bangkok,86,86,pm25,86,50,0.1,0.6,0.6,20.5,36.4,60.0,2.0,1008.1,3.3,13.756331,100.501765
1,2025-03-01 12:00:00+00:00,Beijing,261,261,pm25,261,156,16.3,22.4,4.1,40.3,11.0,62.0,2.5,1014.0,,39.954592,116.468117
2,2025-03-01 12:00:00+00:00,Los Angeles,28,28,pm25,28,11,2.3,5.0,0.2,27.2,12.2,88.0,0.1,1034.0,,34.066530,-118.226760
3,2025-03-02 05:00:00+00:00,Bangkok,78,78,pm25,78,53,0.1,1.2,0.6,13.4,31.1,73.0,3.0,1009.7,3.3,13.756331,100.501765
4,2025-03-02 05:00:00+00:00,Beijing,74,74,pm25,74,41,4.6,6.4,3.6,37.0,8.0,26.0,2.5,1026.0,,39.954592,116.468117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,2025-04-14 02:00:00+00:00,Beijing,107,107,pm25,107,58,3.7,3.2,3.1,42.3,17.0,21.0,7.7,1007.0,,39.954592,116.468117
134,2025-04-14 02:00:00+00:00,Los Angeles,51,51,pm25,51,21,2.3,9.8,0.5,35.2,17.4,72.5,0.1,1034.1,,34.066530,-118.226760
135,2025-04-15 02:00:00+00:00,Bangkok,79,79,pm25,79,38,0.1,1.8,0.6,5.6,29.0,74.0,0.7,1011.0,99.5,13.756331,100.501765
136,2025-04-15 02:00:00+00:00,Beijing,151,151,pm25,151,65,10.0,20.2,4.6,9.0,16.0,48.0,0.5,1006.0,,39.954592,116.468117


In [133]:
df3 = df2.copy()
# Filter only Bangkok data
df3 = df3[df3['City'] == 'Bangkok']
# Drop the 'city'and weather column as it's no longer needed
df3.drop(columns=['City', 'temperature', 'humidity', 'wind_speed', 'pressure', 'rain', 'city_geo_lat', 'city_geo_lon'], inplace=True)
# Reset the index
df3.reset_index(drop=True, inplace=True)
# view
display(df3)

Unnamed: 0,Timestamp,AQI,PM2.5,dominant_pollutant,pm25,pm10,co,no2,so2,o3
0,2025-03-01 12:00:00+00:00,86,86,pm25,86,50,0.1,0.6,0.6,20.5
1,2025-03-02 05:00:00+00:00,78,78,pm25,78,53,0.1,1.2,0.6,13.4
2,2025-03-03 02:00:00+00:00,72,72,pm25,72,50,0.1,2.4,0.6,2.4
3,2025-03-04 02:00:00+00:00,79,79,pm25,79,53,0.1,2.4,0.6,3.6
4,2025-03-05 02:00:00+00:00,61,61,pm25,61,50,0.1,3.5,0.6,4.0
5,2025-03-06 02:00:00+00:00,68,68,pm25,68,39,0.1,3.5,0.6,4.0
6,2025-03-07 02:00:00+00:00,71,71,pm25,71,17,0.1,1.2,0.6,9.9
7,2025-03-08 01:00:00+00:00,80,80,pm25,80,25,0.1,1.8,0.6,9.9
8,2025-03-09 01:00:00+00:00,91,91,pm25,91,41,0.1,2.4,0.6,4.8
9,2025-03-10 01:00:00+00:00,111,111,pm25,111,50,0.1,2.4,0.6,9.9


In [134]:
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://api.open-meteo.com/v1/forecast"
params = {
	"latitude": df2["city_geo_lat"].iloc[0], # Latitude of the first location (Bangkok)
	"longitude": df2["city_geo_lon"].iloc[0], # Longitude of the first location (Bangkok)
	"daily": [
		"apparent_temperature_mean",
		"cloud_cover_mean",
		"precipitation_sum",
		"precipitation_probability_mean",
		"pressure_msl_mean",
		"rain_sum",
		"relative_humidity_2m_mean",
		"showers_sum",
		"surface_pressure_mean",
		"temperature_2m_mean",
		"visibility_mean",
		"wind_speed_10m_mean"
	],
	"hourly": [
		"apparent_temperature",
		"cloud_cover",
		"is_day",
		"precipitation",
		"precipitation_probability",
		"pressure_msl",
		"rain",
		"relative_humidity_2m",
		"showers",
		"surface_pressure",
		"temperature_2m",
		"visibility",
		"wind_speed_10m"
	],
	"timezone": "auto",
	"start_date": "2025-03-01",
	"end_date": str(date.today()),
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process hourly data.
hourly = response.Hourly()
hourly_apparent_temperature = hourly.Variables(0).ValuesAsNumpy()
hourly_cloud_cover = hourly.Variables(1).ValuesAsNumpy()
hourly_is_day = hourly.Variables(2).ValuesAsNumpy()
hourly_precipitation = hourly.Variables(3).ValuesAsNumpy()	
hourly_precipitation_probability = hourly.Variables(4).ValuesAsNumpy()
hourly_pressure_msl = hourly.Variables(5).ValuesAsNumpy()
hourly_rain = hourly.Variables(6).ValuesAsNumpy()
hourly_relative_humidity_2m = hourly.Variables(7).ValuesAsNumpy()
hourly_showers = hourly.Variables(8).ValuesAsNumpy()
hourly_surface_pressure = hourly.Variables(9).ValuesAsNumpy()
hourly_temperature_2m = hourly.Variables(10).ValuesAsNumpy()
hourly_visibility = hourly.Variables(11).ValuesAsNumpy()
hourly_wind_speed_10m = hourly.Variables(12).ValuesAsNumpy()

hourly_data = {"date": pd.date_range(
	start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
	end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = hourly.Interval()),
	inclusive = "left"
)}

# Convert the date column to datetime format and set the timezone to UTC
hourly_data["date"] = pd.to_datetime(hourly_data["date"], utc = True)
df2["Timestamp"] = pd.to_datetime(df2["Timestamp"], utc = True)

# use the exact datetime from df2["Timestamp"] to filter the data
filtered_hourly_data = hourly_data["date"].isin(df2["Timestamp"])
hourly_data["date"] = hourly_data["date"][filtered_hourly_data]

hourly_data["apparent_temperature"] = hourly_apparent_temperature[filtered_hourly_data]
hourly_data["cloud_cover"] = hourly_cloud_cover[filtered_hourly_data]
hourly_data["is_day"] = hourly_is_day[filtered_hourly_data]
hourly_data["precipitation"] = hourly_precipitation[filtered_hourly_data]
hourly_data["precipitation_probability"] = hourly_precipitation_probability[filtered_hourly_data]
hourly_data["pressure_msl"] = hourly_pressure_msl[filtered_hourly_data]
hourly_data["rain"] = hourly_rain[filtered_hourly_data]
hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m[filtered_hourly_data]
hourly_data["showers"] = hourly_showers[filtered_hourly_data]
hourly_data["surface_pressure"] = hourly_surface_pressure[filtered_hourly_data]
hourly_data["temperature_2m"] = hourly_temperature_2m[filtered_hourly_data]
hourly_data["visibility"] = hourly_visibility[filtered_hourly_data]
hourly_data["wind_speed_10m"] = hourly_wind_speed_10m[filtered_hourly_data]


hourly_dataframe = pd.DataFrame(data = hourly_data)
display(hourly_dataframe)

# Process daily data.
daily = response.Daily()
daily_apparent_temperature_mean = daily.Variables(0).ValuesAsNumpy()
daily_cloud_cover_mean = daily.Variables(1).ValuesAsNumpy()
daily_precipitation_sum = daily.Variables(2).ValuesAsNumpy()
daily_precipitation_probability_mean = daily.Variables(3).ValuesAsNumpy()
daily_pressure_msl_mean = daily.Variables(4).ValuesAsNumpy()
daily_rain_sum = daily.Variables(5).ValuesAsNumpy()
daily_relative_humidity_2m_mean = daily.Variables(6).ValuesAsNumpy()
daily_showers_sum = daily.Variables(7).ValuesAsNumpy()
daily_surface_pressure_mean = daily.Variables(8).ValuesAsNumpy()
daily_temperature_2m_mean = daily.Variables(9).ValuesAsNumpy()
daily_visibility_mean = daily.Variables(10).ValuesAsNumpy()
daily_wind_speed_10m_mean = daily.Variables(11).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}

daily_data["apparent_temperature_mean"] = daily_apparent_temperature_mean
daily_data["cloud_cover_mean"] = daily_cloud_cover_mean
daily_data["precipitation_sum"] = daily_precipitation_sum
daily_data["precipitation_probability_mean"] = daily_precipitation_probability_mean
daily_data["pressure_msl_mean"] = daily_pressure_msl_mean
daily_data["rain_sum"] = daily_rain_sum
daily_data["relative_humidity_2m_mean"] = daily_relative_humidity_2m_mean
daily_data["showers_sum"] = daily_showers_sum
daily_data["surface_pressure_mean"] = daily_surface_pressure_mean
daily_data["temperature_2m_mean"] = daily_temperature_2m_mean
daily_data["visibility_mean"] = daily_visibility_mean
daily_data["wind_speed_10m_mean"] = daily_wind_speed_10m_mean

daily_dataframe = pd.DataFrame(data = daily_data)
# Drop the first row of the daily dataframe to match the hourly dataframe
daily_dataframe = daily_dataframe.iloc[1:]
# Reset the index of the daily dataframe
daily_dataframe.reset_index(drop=True, inplace=True)
display(daily_dataframe)



# Concat the hourly and daily dataframes
weather_df = pd.concat([hourly_dataframe, daily_dataframe], axis=1)
# Drop duplicate date columns
weather_df = weather_df.loc[:, ~weather_df.columns.duplicated(keep='first')]

display(weather_df)

Coordinates 13.75°N 100.5°E
Elevation 4.0 m asl
Timezone b'Asia/Bangkok'b'GMT+7'
Timezone difference to GMT+0 25200 s


Unnamed: 0,date,apparent_temperature,cloud_cover,is_day,precipitation,precipitation_probability,pressure_msl,rain,relative_humidity_2m,showers,surface_pressure,temperature_2m,visibility,wind_speed_10m
0,2025-03-01 12:00:00+00:00,34.769497,4.0,0.0,0.0,0.0,1008.200012,0.0,80.0,0.0,1007.744324,29.036999,24140.0,7.928178
1,2025-03-02 05:00:00+00:00,39.054214,48.0,1.0,0.0,0.0,1010.200012,0.0,64.0,0.0,1009.747925,32.086998,24140.0,8.350138
2,2025-03-03 02:00:00+00:00,34.775131,58.0,1.0,0.0,5.0,1010.900024,0.0,75.0,0.0,1010.443726,29.636999,24140.0,9.746631
3,2025-03-04 02:00:00+00:00,34.384857,49.0,1.0,0.0,0.0,1010.400024,0.0,73.0,0.0,1009.94397,29.636999,24140.0,10.446206
4,2025-03-05 02:00:00+00:00,34.694786,55.0,1.0,0.0,0.0,1009.900024,0.0,73.0,0.0,1009.44458,29.737,24140.0,9.422101
5,2025-03-06 02:00:00+00:00,35.030827,35.0,1.0,0.0,0.0,1011.200012,0.0,75.0,0.0,1010.743591,29.636999,24140.0,7.993298
6,2025-03-07 02:00:00+00:00,34.687614,79.0,1.0,0.0,13.0,1010.599976,0.0,76.0,0.0,1010.143616,29.337,24140.0,7.993297
7,2025-03-08 01:00:00+00:00,28.709038,98.0,1.0,0.0,25.0,1012.200012,0.0,85.0,0.0,1011.736145,24.987,4620.0,9.346143
8,2025-03-09 01:00:00+00:00,29.178474,68.0,1.0,0.0,0.0,1013.400024,0.0,84.0,0.0,1012.935608,24.937,24140.0,4.843305
9,2025-03-10 01:00:00+00:00,31.898869,41.0,1.0,0.0,0.0,1013.599976,0.0,78.0,0.0,1013.138733,26.937,24140.0,2.741678


Unnamed: 0,date,apparent_temperature_mean,cloud_cover_mean,precipitation_sum,precipitation_probability_mean,pressure_msl_mean,rain_sum,relative_humidity_2m_mean,showers_sum,surface_pressure_mean,temperature_2m_mean,visibility_mean,wind_speed_10m_mean
0,2025-03-01 17:00:00+00:00,34.986805,36.75,0.0,0.0,1008.516602,0.0,76.166664,0.0,1008.061218,29.345329,24140.0,7.968113
1,2025-03-02 17:00:00+00:00,35.20702,37.916668,0.0,1.958333,1008.320862,0.0,75.958336,0.0,1007.865906,29.601585,24140.0,8.818902
2,2025-03-03 17:00:00+00:00,34.526707,32.291668,0.0,0.25,1008.062439,0.0,74.333336,0.0,1007.607361,29.378668,24140.0,9.74982
3,2025-03-04 17:00:00+00:00,34.940178,34.375,0.0,0.0,1007.558167,0.0,74.583336,0.0,1007.103516,29.507833,24140.0,8.836523
4,2025-03-05 17:00:00+00:00,35.042831,28.416666,0.0,0.333333,1008.612488,0.0,75.458336,0.0,1008.157043,29.457834,24140.0,8.449785
5,2025-03-06 17:00:00+00:00,34.507198,74.958336,1.8,20.25,1009.204102,0.0,78.0,1.8,1008.747803,28.951584,23837.5,7.410603
6,2025-03-07 17:00:00+00:00,31.098938,83.208336,5.5,14.041667,1011.083313,0.0,77.0,5.5,1010.623047,26.989082,20507.5,7.47117
7,2025-03-08 17:00:00+00:00,32.958958,75.833336,0.0,0.25,1012.058289,0.0,67.125,0.0,1011.600281,28.701582,24140.0,4.566617
8,2025-03-09 17:00:00+00:00,35.201778,40.541668,0.0,0.0,1011.624939,0.0,64.958336,0.0,1011.169678,30.386995,24140.0,4.744283
9,2025-03-10 17:00:00+00:00,36.371807,58.208332,0.1,8.708333,1010.466736,0.0,69.416664,0.1,1010.012207,30.697416,24140.0,4.834938


Unnamed: 0,date,apparent_temperature,cloud_cover,is_day,precipitation,precipitation_probability,pressure_msl,rain,relative_humidity_2m,showers,...,precipitation_sum,precipitation_probability_mean,pressure_msl_mean,rain_sum,relative_humidity_2m_mean,showers_sum,surface_pressure_mean,temperature_2m_mean,visibility_mean,wind_speed_10m_mean
0,2025-03-01 12:00:00+00:00,34.769497,4.0,0.0,0.0,0.0,1008.200012,0.0,80.0,0.0,...,0.0,0.0,1008.516602,0.0,76.166664,0.0,1008.061218,29.345329,24140.0,7.968113
1,2025-03-02 05:00:00+00:00,39.054214,48.0,1.0,0.0,0.0,1010.200012,0.0,64.0,0.0,...,0.0,1.958333,1008.320862,0.0,75.958336,0.0,1007.865906,29.601585,24140.0,8.818902
2,2025-03-03 02:00:00+00:00,34.775131,58.0,1.0,0.0,5.0,1010.900024,0.0,75.0,0.0,...,0.0,0.25,1008.062439,0.0,74.333336,0.0,1007.607361,29.378668,24140.0,9.74982
3,2025-03-04 02:00:00+00:00,34.384857,49.0,1.0,0.0,0.0,1010.400024,0.0,73.0,0.0,...,0.0,0.0,1007.558167,0.0,74.583336,0.0,1007.103516,29.507833,24140.0,8.836523
4,2025-03-05 02:00:00+00:00,34.694786,55.0,1.0,0.0,0.0,1009.900024,0.0,73.0,0.0,...,0.0,0.333333,1008.612488,0.0,75.458336,0.0,1008.157043,29.457834,24140.0,8.449785
5,2025-03-06 02:00:00+00:00,35.030827,35.0,1.0,0.0,0.0,1011.200012,0.0,75.0,0.0,...,1.8,20.25,1009.204102,0.0,78.0,1.8,1008.747803,28.951584,23837.5,7.410603
6,2025-03-07 02:00:00+00:00,34.687614,79.0,1.0,0.0,13.0,1010.599976,0.0,76.0,0.0,...,5.5,14.041667,1011.083313,0.0,77.0,5.5,1010.623047,26.989082,20507.5,7.47117
7,2025-03-08 01:00:00+00:00,28.709038,98.0,1.0,0.0,25.0,1012.200012,0.0,85.0,0.0,...,0.0,0.25,1012.058289,0.0,67.125,0.0,1011.600281,28.701582,24140.0,4.566617
8,2025-03-09 01:00:00+00:00,29.178474,68.0,1.0,0.0,0.0,1013.400024,0.0,84.0,0.0,...,0.0,0.0,1011.624939,0.0,64.958336,0.0,1011.169678,30.386995,24140.0,4.744283
9,2025-03-10 01:00:00+00:00,31.898869,41.0,1.0,0.0,0.0,1013.599976,0.0,78.0,0.0,...,0.1,8.708333,1010.466736,0.0,69.416664,0.1,1010.012207,30.697416,24140.0,4.834938


In [135]:
# Merge the weather data with the AQI data
cleaned_df = pd.merge(df3, weather_df, how='left', left_on='Timestamp', right_on='date')
# Drop the date column as it's no longer needed, keep the Timestamp column
cleaned_df.drop(columns=['date'], inplace=True)
# Reset the index
cleaned_df.reset_index(drop=True, inplace=True)
# view
display(cleaned_df)

Unnamed: 0,Timestamp,AQI,PM2.5,dominant_pollutant,pm25,pm10,co,no2,so2,o3,...,precipitation_sum,precipitation_probability_mean,pressure_msl_mean,rain_sum,relative_humidity_2m_mean,showers_sum,surface_pressure_mean,temperature_2m_mean,visibility_mean,wind_speed_10m_mean
0,2025-03-01 12:00:00+00:00,86,86,pm25,86,50,0.1,0.6,0.6,20.5,...,0.0,0.0,1008.516602,0.0,76.166664,0.0,1008.061218,29.345329,24140.0,7.968113
1,2025-03-02 05:00:00+00:00,78,78,pm25,78,53,0.1,1.2,0.6,13.4,...,0.0,1.958333,1008.320862,0.0,75.958336,0.0,1007.865906,29.601585,24140.0,8.818902
2,2025-03-03 02:00:00+00:00,72,72,pm25,72,50,0.1,2.4,0.6,2.4,...,0.0,0.25,1008.062439,0.0,74.333336,0.0,1007.607361,29.378668,24140.0,9.74982
3,2025-03-04 02:00:00+00:00,79,79,pm25,79,53,0.1,2.4,0.6,3.6,...,0.0,0.0,1007.558167,0.0,74.583336,0.0,1007.103516,29.507833,24140.0,8.836523
4,2025-03-05 02:00:00+00:00,61,61,pm25,61,50,0.1,3.5,0.6,4.0,...,0.0,0.333333,1008.612488,0.0,75.458336,0.0,1008.157043,29.457834,24140.0,8.449785
5,2025-03-06 02:00:00+00:00,68,68,pm25,68,39,0.1,3.5,0.6,4.0,...,1.8,20.25,1009.204102,0.0,78.0,1.8,1008.747803,28.951584,23837.5,7.410603
6,2025-03-07 02:00:00+00:00,71,71,pm25,71,17,0.1,1.2,0.6,9.9,...,5.5,14.041667,1011.083313,0.0,77.0,5.5,1010.623047,26.989082,20507.5,7.47117
7,2025-03-08 01:00:00+00:00,80,80,pm25,80,25,0.1,1.8,0.6,9.9,...,0.0,0.25,1012.058289,0.0,67.125,0.0,1011.600281,28.701582,24140.0,4.566617
8,2025-03-09 01:00:00+00:00,91,91,pm25,91,41,0.1,2.4,0.6,4.8,...,0.0,0.0,1011.624939,0.0,64.958336,0.0,1011.169678,30.386995,24140.0,4.744283
9,2025-03-10 01:00:00+00:00,111,111,pm25,111,50,0.1,2.4,0.6,9.9,...,0.1,8.708333,1010.466736,0.0,69.416664,0.1,1010.012207,30.697416,24140.0,4.834938


# Steps

1. Data Cleaning and Preparation
2. Data Analysis
3. Descriptive Statistics
4. Data Visualization
5. Hypothesis Test
6. Machine Learning / Data Modeling

    📌 Hypothesis 1
    H₀ (Null): There is no significant difference in PM2.5 levels on days with low wind & high humidity.
    H₁ (Alt): PM2.5 levels are significantly higher (>100 µg/m³) when wind speed < 1.5 m/s and humidity > 70%.

In [136]:
# Liam



    📌 Hypothesis 2
    H₀ (Null): There is no significant difference in PM2.5 levels on rainy days in Bangkok.
    H₁ (Alt): There is a significant difference in PM2.5 levels on rainy days in Bangkok.

In [137]:
# Nusrat

In [138]:
cleaned_df.isna().sum()

Timestamp                         0
AQI                               0
PM2.5                             0
dominant_pollutant                0
pm25                              0
pm10                              0
co                                0
no2                               0
so2                               0
o3                                0
apparent_temperature              0
cloud_cover                       0
is_day                            0
precipitation                     0
precipitation_probability         0
pressure_msl                      0
rain                              0
relative_humidity_2m              0
showers                           0
surface_pressure                  0
temperature_2m                    0
visibility                        0
wind_speed_10m                    0
apparent_temperature_mean         0
cloud_cover_mean                  0
precipitation_sum                 0
precipitation_probability_mean    0
pressure_msl_mean           

In [139]:
cleaned_df.dtypes

Timestamp                         datetime64[ns, UTC]
AQI                                             int64
PM2.5                                           int64
dominant_pollutant                             object
pm25                                            int64
pm10                                            int64
co                                            float64
no2                                           float64
so2                                           float64
o3                                            float64
apparent_temperature                          float32
cloud_cover                                   float32
is_day                                        float32
precipitation                                 float32
precipitation_probability                     float32
pressure_msl                                  float32
rain                                          float32
relative_humidity_2m                          float32
showers                     