# Library

In [114]:
import pandas as pd
import requests
import json
from datetime import datetime, timedelta
import time
import pytz
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


# Weather

In [172]:
def clean_weather_data(response):
    """
    Nhận vào JSON gốc từ OpenWeather API,
    trả về DataFrame rút gọn chỉ còn trường cần thiết,
    đồng thời tách dt_txt thành day, month, year, hour.
    """
    city_id = response.get("city", {}).get("id")

    result = []
    for item in response.get("list", []):
        entry = {
            "city_id": city_id,
            "dt_txt": item.get("dt_txt"),
            "temp": item.get("main", {}).get("temp"),
            "feels_like": item.get("main", {}).get("feels_like"),
            "humidity": item.get("main", {}).get("humidity"),
            "weather_main": item.get("weather", [{}])[0].get("main"),
            "weather_description": item.get("weather", [{}])[0].get("description"),
            "weather_icon": item.get("weather", [{}])[0].get("icon"),
            "pop": item.get("pop"),
            "rain_3h": item.get("rain", {}).get("3h", 0),
            "wind_speed": item.get("wind", {}).get("speed"),
            "wind_gust": item.get("wind", {}).get("gust"),
            "visibility": item.get("visibility"),
            "clouds_all": item.get("clouds", {}).get("all"),
        }
        result.append(entry)

    # Chuyển thành DataFrame
    df = pd.DataFrame(result)

    # Parse datetime
    df["dt_txt"] = pd.to_datetime(df["dt_txt"], errors="coerce")

    # Tách thành day, month, year, hour
    df["year"] = df["dt_txt"].dt.year
    df["month"] = df["dt_txt"].dt.month
    df["day"] = df["dt_txt"].dt.day
    df["hour"] = df["dt_txt"].dt.hour

    return df

def assign_period(hour):
    """Gán buổi trong ngày dựa trên giờ"""
    if hour in [0, 3]:
        return "Night"
    elif hour == 6:
        return "Morning"
    elif hour in [9, 12]:
        return "Noon"
    elif hour == 15:
        return "Afternoon"
    elif hour in [18, 21]:
        return "Evening"
    else:
        return None

def aggregate_weather_by_period(df):
    df = df.copy()
    df["period"] = df["hour"].apply(assign_period)

    # Xác định numeric và categorical
    numeric_cols = df.select_dtypes(include=["float64", "int64", "int32"]).columns.tolist()
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()

    exclude = ["city_id", "year", "month", "day", "hour", "period"]
    numeric_cols = [c for c in numeric_cols if c not in exclude]

    agg_funcs = {}
    for col in numeric_cols:
        agg_funcs[col] = "mean"
    for col in object_cols:
        agg_funcs[col] = lambda x: x.mode().iloc[0] if not x.mode().empty else None

    grouped = (
        df.groupby(["city_id", "year", "month", "day", "period"], as_index=False)
          .agg(agg_funcs)
    )

    # Bỏ cột dt_txt và hour nếu còn sót
    grouped = grouped.drop(columns=[c for c in ["hour", "dt_txt"] if c in grouped.columns])

    return grouped



def fetch_api(url, params):
    """Gọi API Air Pollution Forecast và trả về JSON"""
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print("Lỗi API:", response.status_code, response.text)
        return None



In [173]:
API_KEY_WEATHER = "fc437d742a7df737b506c1984e2447ff"
LAT = 11.0   # latitude DongNai
LON = 107.166672  # longitude của DongNai
ID = 1559969

URL = "https://api.openweathermap.org/data/2.5/forecast"
# Tham số truy vấn
params = {
    "id": ID,
    "appid": API_KEY_WEATHER,
    "units": "metric",  # nhiệt độ Celsius
    "lang": "vi"        # ngôn ngữ tiếng Việt
}

response = fetch_api(URL, params)
data = clean_weather_data(response)
df_weather_grouped = aggregate_weather_by_period(data)
df_weather_grouped.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   city_id              26 non-null     int64  
 1   year                 26 non-null     int32  
 2   month                26 non-null     int32  
 3   day                  26 non-null     int32  
 4   temp                 26 non-null     float64
 5   feels_like           26 non-null     float64
 6   humidity             26 non-null     float64
 7   pop                  26 non-null     float64
 8   rain_3h              26 non-null     float64
 9   wind_speed           26 non-null     float64
 10  wind_gust            26 non-null     float64
 11  visibility           26 non-null     float64
 12  clouds_all           26 non-null     float64
 13  weather_main         26 non-null     object 
 14  weather_description  26 non-null     object 
 15  weather_icon         26 non-null     objec

In [154]:
df_weather_grouped.head(5)

Unnamed: 0,city_id,year,month,day,temp,feels_like,humidity,pop,rain_3h,wind_speed,wind_gust,visibility,clouds_all,weather_main,weather_description,weather_icon,period
0,1559969,2025,9,8,23.65,24.63,98.0,1.0,2.28,0.5,0.31,10000.0,100.0,Rain,mưa nhẹ,10n,Afternoon
1,1559969,2025,9,8,22.655,23.585,100.0,0.5,0.1,1.345,2.09,10000.0,100.0,Clouds,mây đen u ám,04n,Evening
2,1559969,2025,9,8,34.45,36.78,42.0,0.0,0.0,1.42,1.74,10000.0,13.0,Clouds,mây thưa,02d,Morning (Early)
3,1559969,2025,9,8,31.155,34.335,60.0,0.325,0.355,0.93,0.725,10000.0,41.5,Clouds,mây rải rác,03d,Noon
4,1559969,2025,9,9,23.29,24.23,98.0,0.96,1.0,0.62,0.65,10000.0,100.0,Rain,mưa nhẹ,10n,Afternoon


In [155]:
df_weather_grouped['day'].unique()

array([ 8,  9, 10, 11, 12, 13], dtype=int32)

---
# Climate

In [None]:
def fetch_api(url, params):
    """Gọi API Air Pollution Forecast và trả về JSON"""
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print("Lỗi API:", response.status_code, response.text)
        return None


def majority_or_median(series):
    """Trả về giá trị xuất hiện >=2 lần, nếu không có thì lấy median"""
    counts = series.value_counts()
    if counts.iloc[0] >= 2:
        return counts.index[0]
    else:
        return int(series.median())  # median nhưng ép int

def process_air_pollution_grouped(data):
    """Chuyển list thành DataFrame, thêm year/month/day/hour_group và nhóm lại theo 8 khung 3h"""
    if not data or "list" not in data:
        return pd.DataFrame()
    
    df = pd.json_normalize(data["list"])
    
    # Đổi dt -> datetime
    df["dt_txt"] = pd.to_datetime(df["dt"].apply(lambda x: datetime.utcfromtimestamp(x)))
    df["year"] = df["dt_txt"].dt.year
    df["month"] = df["dt_txt"].dt.month
    df["day"] = df["dt_txt"].dt.day
    df["hour"] = df["dt_txt"].dt.hour
    
    # Nhóm khung 3h
    df["hour"] = (df["hour"] // 3) * 3
    
    # Gom nhóm
    grouped = df.groupby(["year", "month", "day", "hour"]).agg({
        "main.aqi": majority_or_median,
        "components.co": "mean",
        "components.no": "mean",
        "components.no2": "mean",
        "components.o3": "mean",
        "components.so2": "mean",
        "components.pm2_5": "mean",
        "components.pm10": "mean",
        "components.nh3": "mean"
    }).reset_index()
    
    # Tạo lại dt_txt đại diện (nửa đầu khung giờ)
    grouped["dt_txt"] = pd.to_datetime(dict(
        year=grouped.year,
        month=grouped.month,
        day=grouped.day,
        hour=grouped.hour
    ))
    
    # Đổi tên cột
    grouped = grouped.rename(columns={
        "main.aqi": "AQI",
        "components.co": "CO",
        "components.no": "NO",
        "components.no2": "NO2",
        "components.o3": "O3",
        "components.so2": "SO2",
        "components.pm2_5": "PM2_5",
        "components.pm10": "PM10",
        "components.nh3": "NH3"
    })
    
    # Sắp xếp lại cột
    cols = [
        "dt_txt", "year", "month", "day", "hour",
        "AQI", "CO", "NO", "NO2", "O3", "SO2", "PM2_5", "PM10", "NH3"
    ]
    grouped = grouped[cols]
    return grouped

def process_air_pollution_by_period(data):
    df = data.copy()
    if df.empty:
        return df

    # Gán period
    df["period"] = df["hour"].apply(assign_period)

    # Gom nhóm lại
    grouped = df.groupby(["year", "month", "day", "period"], as_index=False).agg({
        "AQI": "max",   # AQI lấy max
        "CO": "mean",
        "NO": "mean",
        "NO2": "mean",
        "O3": "mean",
        "SO2": "mean",
        "PM2_5": "mean",
        "PM10": "mean",
        "NH3": "mean"
    })

    # Chỉ giữ các cột cần
    cols = [
        "year", "month", "day", "period",
        "AQI", "CO", "NO", "NO2", "O3", "SO2", "PM2_5", "PM10", "NH3"
    ]
    grouped = grouped[cols]

    return grouped


def forcast_climate(url, params):
    data = fetch_api(url, params)
    return process_air_pollution_grouped(data)


In [None]:
API_KEY_WEATHER = "fc437d742a7df737b506c1984e2447ff"
LAT = 11.16667
LON = 106.666672 

URL = "http://api.openweathermap.org/data/2.5/air_pollution/forecast"
# Tham số truy vấn
params = {
    "lat": LAT,
    "lon": LON,
    "appid": API_KEY_WEATHER,
}
response = fetch_api(URL, params)
data = process_air_pollution_grouped(response)
df_climated_grouped = process_air_pollution_by_period(data)
df_climated_grouped.info()

In [159]:
df_climated_grouped.head()

Unnamed: 0,year,month,day,period,AQI,CO,NO,NO2,O3,SO2,PM2_5,PM10,NH3
0,2025,9,8,Afternoon,2,606.456667,0.02,11.266667,20.603333,3.816667,22.06,30.733333,4.453333
1,2025,9,8,Evening,2,532.885,0.04,7.656667,18.296667,2.406667,21.691667,28.698333,5.048333
2,2025,9,8,Morning,1,267.603333,0.19,4.336667,48.39,2.47,6.413333,8.07,1.58
3,2025,9,8,Night,1,253.84,0.2,3.85,48.48,2.3,5.76,7.32,1.35
4,2025,9,8,Noon,2,457.675,0.095,10.253333,31.48,3.851667,13.673333,18.978333,3.676667


---
# UVindex (Current UV index)

In [166]:
def assign_period_uv(hour):
    """Gán buổi trong ngày dựa trên giờ"""
    if hour in [0, 3]:
        return "Night"
    elif hour in [4, 9]:
        return "Morning"
    elif hour in [10, 14]:
        return "Noon"
    elif hour in [15, 18]:
        return "Afternoon"
    elif hour in [19, 23]:
        return "Evening"
    else:
        return None

def aggregate_uv_by_period(df):
    """
    Nhận DataFrame từ parse_uv_forecast_df, 
    gán period theo assign_period_uv, 
    gom uvi lấy max, loại bỏ time và hour.
    """
    if df.empty:
        return df

    df = df.copy()
    df["period"] = df["hour"].apply(assign_period_uv)

    grouped = df.groupby(["year", "month", "day", "period"], as_index=False).agg({
        "uvi": "max"
    })

    # Bỏ cột time và hour nếu còn
    grouped = grouped.drop(columns=[c for c in ["time", "hour"] if c in grouped.columns])

    return grouped

def parse_uv_forecast_df(data):
    """Trích xuất forecast thành DataFrame với uvi, time (VN), year, month, day, hour"""
    records = []

    if not data or "forecast" not in data:
        return pd.DataFrame(records)

    for entry in data["forecast"]:
        utc_time = datetime.strptime(entry["time"], "%Y-%m-%dT%H:%M:%SZ")
        vn_time = utc_time + timedelta(hours=7)  # UTC+7
        records.append({
            "uvi": entry["uvi"],
            "time": vn_time,
            "year": vn_time.year,
            "month": vn_time.month,
            "day": vn_time.day,
            "hour": vn_time.hour
        })

    return pd.DataFrame(records)


In [168]:
LAT = 11.16667
LON = 106.666672 
params = {
    "latitude": LAT,
    "longitude": LON
}
URL = "https://currentuvindex.com/api/v1/uvi"

response = fetch_api(URL, params)
data = parse_uv_forecast_df(response)
df_uv_grouped = aggregate_uv_by_period(data)
df_uv_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    23 non-null     int64  
 1   month   23 non-null     int64  
 2   day     23 non-null     int64  
 3   period  23 non-null     object 
 4   uvi     23 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.0+ KB


In [171]:
df_uv_grouped.head(10)

Unnamed: 0,year,month,day,period,uvi
0,2025,9,8,Afternoon,2.9
1,2025,9,8,Evening,0.0
2,2025,9,8,Noon,5.9
3,2025,9,9,Afternoon,3.3
4,2025,9,9,Evening,0.0
5,2025,9,9,Morning,2.2
6,2025,9,9,Night,0.0
7,2025,9,9,Noon,7.6
8,2025,9,10,Afternoon,4.4
9,2025,9,10,Evening,0.0


---
# Checking

In [72]:
df_city = pd.read_csv('city.csv')
df_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         60 non-null     int64  
 1   name       60 non-null     object 
 2   longitude  60 non-null     float64
 3   latitude   60 non-null     float64
dtypes: float64(2), int64(1), object(1)
memory usage: 2.0+ KB


In [90]:
list_lon = df_city['longitude'].to_numpy()
list_lat = df_city['latitude'].to_numpy()


In [94]:
API_KEY_WEATHER = "fc437d742a7df737b506c1984e2447ff"
LAT = 11.16667
LON = 106.666672 
URL = "http://api.openweathermap.org/data/2.5/air_pollution/forecast"

for LAT, LON in zip(list_lat, list_lon):
    # Tham số truy vấn
    params = {
        "lat": LAT,
        "lon": LON,
        "appid": API_KEY_WEATHER,
    }
    data = fetch_api(URL, params)
    lat_checked, lon_checked = data["coord"]["lat"], data["coord"]["lon"]
    if ( LAT == lat_checked and LON == lon_checked): print('success 🍏')
    else:
        print('failed 🍎')
        print('LAT: ', LAT, '|| lat_checked: ', lat_checked)
        print('LON: ', LON, '|| lon_checked: ', lon_checked)

    time.sleep(7)

failed 🍎
LAT:  19.33333 || lat_checked:  19.3333
LON:  104.833328 || lon_checked:  104.8333
failed 🍎
LAT:  20.25 || lat_checked:  20.25
LON:  105.833328 || lon_checked:  105.8333
failed 🍎
LAT:  11.75 || lat_checked:  11.75
LON:  108.833328 || lon_checked:  108.8333
failed 🍎
LAT:  9.66667 || lat_checked:  9.6667
LON:  105.833328 || lon_checked:  105.8333
failed 🍎
LAT:  9.83333 || lat_checked:  9.8333
LON:  106.25 || lon_checked:  106.25
failed 🍎
LAT:  22.116671 || lat_checked:  22.1167
LON:  105.25 || lon_checked:  105.25
failed 🍎
LAT:  10.16667 || lat_checked:  10.1667
LON:  106.0 || lon_checked:  106
failed 🍎
LAT:  21.5 || lat_checked:  21.5
LON:  104.666672 || lon_checked:  104.6667
failed 🍎
LAT:  22.33333 || lat_checked:  22.3333
LON:  104.0 || lon_checked:  104
failed 🍎
LAT:  10.41667 || lat_checked:  10.4167
LON:  106.166672 || lon_checked:  106.1667
failed 🍎
LAT:  16.33333 || lat_checked:  16.3333
LON:  107.583328 || lon_checked:  107.5833
failed 🍎
LAT:  14.75 || lat_checked:  14

KeyboardInterrupt: 