In [9]:
import time
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

In [11]:
def _parse_unit_value(text, unit):
    """Return the number that precedes ``unit`` in ``text``.

    Returns None when ``unit`` does not occur in ``text``. Raises ValueError
    when the unit is present but what remains is not a valid float, so the
    caller can decide whether to drop the row.
    """
    if unit not in text:
        return None
    return float(text.strip().replace(unit, "").strip())


def _parse_wind_speed(text):
    """Return the wind speed from a cell text, 0.0 for "No wind", None if unparsable."""
    text = text.strip()
    if "No wind" in text:
        return 0.0
    try:
        return float(text.replace("km/h", "").strip())
    except ValueError:
        # Best effort: an unreadable wind speed must not discard the whole row.
        return None


def _parse_wind_angle(cell):
    """Return the angle read from the ``title`` of the first <span> in ``cell``, or None.

    The number is taken from the part of the title before the first "°",
    after removing the "Wind blowing from " prefix.
    """
    try:
        span = cell.find("span")
        if span is None or "title" not in span.attrs:
            return None
        title = span["title"]
        return float(title.split("°")[0].replace("Wind blowing from ", "").strip())
    except (AttributeError, TypeError, ValueError):
        # Best effort: a malformed title yields a missing angle, not a lost row.
        return None


def crawl_day_data(driver, year, month, day):
    """Crawl weather data for a specific day using existing driver.

    Picks the day in the ``wt-his-select`` dropdown of the page currently
    loaded in ``driver`` and parses the ``wt-his`` table from the page source.

    Args:
        driver: Selenium driver; the dropdown is looked up on its current page.
        year: Year as an integer.
        month: Month number as an integer.
        day: Day of the month as an integer.

    Returns:
        A list with one dict per parsed table row, with the keys ``date``
        (``YYYY-MM-DD``), ``hour``, ``temperature``, ``weather``,
        ``wind_speed``, ``wind_angle``, ``humidity``, ``pressure`` and
        ``visibility``. Numeric fields are floats or None. An empty list is
        returned when the day is not offered in the dropdown, when the table
        is missing, or when any error occurs while handling the day (the
        error is printed, not raised).
    """
    print(f"Crawling {day}/{month}/{year}...")

    try:
        wait = WebDriverWait(driver, 10)
        select_element = wait.until(EC.presence_of_element_located((By.ID, "wt-his-select")))
        select = Select(select_element)

        day_value = f"{year}{month:02d}{day:02d}"
        available_values = {opt.get_attribute("value") for opt in select.options}
        if day_value not in available_values:
            print(f"Ngày {day}/{month}/{year} không có trong danh sách lựa chọn.")
            return []

        select.select_by_value(day_value)

        # NOTE(review): if a "wt-his" table is already present before the
        # selection, this wait returns immediately and only the fixed sleep
        # protects against parsing the previous day's rows -- confirm how the
        # page refreshes the table and consider waiting on that instead.
        wait.until(EC.presence_of_element_located((By.ID, "wt-his")))
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", {"id": "wt-his"})
        if table is None:
            print(f"Không có bảng dữ liệu cho ngày {day}/{month}/{year}")
            return []

        date_str = f"{year}-{month:02d}-{day:02d}"
        data = []
        for row in table.find_all("tr")[2:]:  # skip header rows
            cols = row.find_all(["th", "td"])
            if len(cols) < 9:
                continue

            # Explicit placeholder so the error message never shows the hour
            # of a previously processed row.
            hour = "unknown"
            try:
                hour = cols[0].text.strip().split('\n')[0].strip()
                record = {
                    "date": date_str,
                    "hour": hour,
                    "temperature": _parse_unit_value(cols[2].text, "°C"),
                    "weather": cols[3].text.strip() or None,
                    "wind_speed": _parse_wind_speed(cols[4].text),
                    "wind_angle": _parse_wind_angle(cols[5]),
                    "humidity": _parse_unit_value(cols[6].text, "%"),
                    "pressure": _parse_unit_value(cols[7].text, "mbar"),
                    "visibility": _parse_unit_value(cols[8].text, "km"),
                }
            except Exception as e:
                # A row whose required numbers cannot be parsed is reported and skipped.
                print(f"Lỗi dữ liệu {day}/{month}/{year} giờ {hour}: {e}")
            else:
                data.append(record)
        return data
    except Exception as e:
        print(f"Lỗi khi xử lý ngày {day}/{month}/{year}: {e}")
        return []


def get_days_in_month(year, month):
    """Return the number of days in the given month of the given year.

    Args:
        year: Year as an integer.
        month: Month number, 1 through 12.

    Returns:
        The day count of that month as an integer (leap years included).

    Raises:
        ValueError: If ``month`` is outside 1..12.
    """
    # Imported locally so this cell works on its own; calendar is stdlib.
    import calendar

    # monthrange returns (weekday of the first day, number of days). Unlike
    # "first day of next month minus one day" it also works for December of
    # the last supported year and rejects month 0 instead of returning 31.
    return calendar.monthrange(year, month)[1]

def crawl_month(year, month):
    """Crawl every day of one month with a dedicated headless Chrome driver.

    Opens the historic-weather page for the month, then calls
    ``crawl_day_data`` for each day, pausing one second between days. For the
    current month only the days up to today are requested.

    Args:
        year: Year as an integer.
        month: Month number as an integer.

    Returns:
        A list with the row dicts of all days, in day order.

    Raises:
        Whatever the driver start-up, page load or initial wait raises; the
        driver is always quit once it has been created.
    """
    print(f"Processing month {month}/{year}")
    days_in_month = get_days_in_month(year, month)

    # Read the clock once: separate datetime.now() calls could straddle a
    # midnight/month change and disagree with each other.
    now = datetime.now()
    if (year, month) == (now.year, now.month):
        days_in_month = min(days_in_month, now.day)

    options = Options()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        url = f"https://www.timeanddate.com/weather/vietnam/da-nang/historic?month={month}&year={year}"
        driver.get(url)
        # Make sure the day selector exists before iterating over the days.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "wt-his-select"))
        )

        all_data = []
        for day in range(1, days_in_month + 1):
            all_data.extend(crawl_day_data(driver, year, month, day))
            time.sleep(1)
        return all_data
    finally:
        driver.quit()

In [None]:
if __name__ == "__main__":
    start_year = 2020
    end_year = 2025

    # Read the clock once so the year and month checks cannot disagree.
    now = datetime.now()

    # One task per (year, month); the current year stops at the current month.
    tasks = []
    for year in range(start_year, end_year + 1):
        max_month = now.month if year == now.year else 12
        for month in range(1, max_month + 1):
            tasks.append((year, month))

    all_data = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(crawl_month, year, month) for year, month in tasks]
        # Collect in task order. A month that raised is reported and skipped
        # so that it does not discard the records of every other month.
        for (year, month), future in zip(tasks, futures):
            try:
                all_data.extend(future.result())
            except Exception as e:
                print(f"Lỗi khi xử lý tháng {month}/{year}: {e}")

    df = pd.DataFrame(all_data)
    print("Đã thu thập xong dữ liệu.")
    print(f"Tổng số bản ghi: {len(df)}")
    print(df.head())

    # Missing values are written as the literal text "N/A".
    df.to_csv("raw_data.csv", index=False, na_rep="N/A")
    print("Đã lưu file CSV.")

Processing month 1/2020
Processing month 2/2020
Processing month 3/2020
Processing month 4/2020
Crawling 1/1/2020...
Crawling 2/1/2020...
Crawling 3/1/2020...
Crawling 1/2/2020...
Crawling 1/3/2020...
Crawling 4/1/2020...
Crawling 1/4/2020...
Crawling 2/2/2020...
Crawling 2/3/2020...
Crawling 5/1/2020...
Crawling 2/4/2020...
Crawling 3/2/2020...
Crawling 3/3/2020...
Crawling 6/1/2020...
Crawling 3/4/2020...
Crawling 4/2/2020...
Crawling 4/3/2020...
Crawling 7/1/2020...
Crawling 4/4/2020...
Crawling 5/2/2020...
Crawling 5/3/2020...
Crawling 8/1/2020...
Crawling 5/4/2020...
Crawling 6/2/2020...
Crawling 6/3/2020...
Crawling 9/1/2020...
Crawling 6/4/2020...
Crawling 7/2/2020...
Crawling 7/3/2020...
Crawling 10/1/2020...
Crawling 7/4/2020...
Crawling 8/2/2020...
Crawling 8/3/2020...
Crawling 11/1/2020...
Crawling 8/4/2020...
Crawling 9/2/2020...
Crawling 9/3/2020...
Crawling 12/1/2020...
Crawling 9/4/2020...
Crawling 10/2/2020...
Crawling 10/3/2020...
Crawling 13/1/2020...
Crawling 10/4/20

In [None]:
import time
import pandas as pd
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from concurrent.futures import ThreadPoolExecutor

In [None]:
def _parse_unit_value(text, unit):
    """Return the number that precedes ``unit`` in ``text``.

    Returns None when ``unit`` does not occur in ``text``. Raises ValueError
    when the unit is present but what remains is not a valid float, so the
    caller can decide whether to drop the row.
    """
    if unit not in text:
        return None
    return float(text.strip().replace(unit, "").strip())


def _parse_wind_speed(text):
    """Return the wind speed from a cell text, 0.0 for "No wind", None if unparsable."""
    text = text.strip()
    if "No wind" in text:
        return 0.0
    try:
        return float(text.replace("km/h", "").strip())
    except ValueError:
        # Best effort: an unreadable wind speed must not discard the whole row.
        return None


def _parse_wind_angle(cell):
    """Return the angle read from the ``title`` of the first <span> in ``cell``, or None.

    The number is taken from the part of the title before the first "°",
    after removing the "Wind blowing from " prefix.
    """
    try:
        span = cell.find("span")
        if span is None or "title" not in span.attrs:
            return None
        title = span["title"]
        return float(title.split("°")[0].replace("Wind blowing from ", "").strip())
    except (AttributeError, TypeError, ValueError):
        # Best effort: a malformed title yields a missing angle, not a lost row.
        return None


def crawl_day_data(driver, year, month, day):
    """Crawl weather data for a specific day using existing driver.

    Picks the day in the ``wt-his-select`` dropdown of the page currently
    loaded in ``driver`` and parses the ``wt-his`` table from the page source.

    Args:
        driver: Selenium driver; the dropdown is looked up on its current page.
        year: Year as an integer.
        month: Month number as an integer.
        day: Day of the month as an integer.

    Returns:
        A list with one dict per parsed table row, with the keys ``date``
        (``YYYY-MM-DD``), ``hour``, ``temperature``, ``weather``,
        ``wind_speed``, ``wind_angle``, ``humidity``, ``pressure`` and
        ``visibility``. Numeric fields are floats or None. An empty list is
        returned when the day is not offered in the dropdown, when the table
        is missing, or when any error occurs while handling the day (the
        error is printed, not raised).
    """
    print(f"Crawling {day}/{month}/{year}...")

    try:
        wait = WebDriverWait(driver, 10)
        select_element = wait.until(EC.presence_of_element_located((By.ID, "wt-his-select")))
        select = Select(select_element)

        day_value = f"{year}{month:02d}{day:02d}"
        available_values = {opt.get_attribute("value") for opt in select.options}
        if day_value not in available_values:
            print(f"Ngày {day}/{month}/{year} không có trong danh sách lựa chọn.")
            return []

        select.select_by_value(day_value)

        # NOTE(review): if a "wt-his" table is already present before the
        # selection, this wait returns immediately and only the fixed sleep
        # protects against parsing the previous day's rows -- confirm how the
        # page refreshes the table and consider waiting on that instead.
        wait.until(EC.presence_of_element_located((By.ID, "wt-his")))
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", {"id": "wt-his"})
        if table is None:
            print(f"Không có bảng dữ liệu cho ngày {day}/{month}/{year}")
            return []

        date_str = f"{year}-{month:02d}-{day:02d}"
        data = []
        for row in table.find_all("tr")[2:]:  # skip header rows
            cols = row.find_all(["th", "td"])
            if len(cols) < 9:
                continue

            # Explicit placeholder so the error message never shows the hour
            # of a previously processed row.
            hour = "unknown"
            try:
                hour = cols[0].text.strip().split('\n')[0].strip()
                record = {
                    "date": date_str,
                    "hour": hour,
                    "temperature": _parse_unit_value(cols[2].text, "°C"),
                    "weather": cols[3].text.strip() or None,
                    "wind_speed": _parse_wind_speed(cols[4].text),
                    "wind_angle": _parse_wind_angle(cols[5]),
                    "humidity": _parse_unit_value(cols[6].text, "%"),
                    "pressure": _parse_unit_value(cols[7].text, "mbar"),
                    "visibility": _parse_unit_value(cols[8].text, "km"),
                }
            except Exception as e:
                # A row whose required numbers cannot be parsed is reported and skipped.
                print(f"Lỗi dữ liệu {day}/{month}/{year} giờ {hour}: {e}")
            else:
                data.append(record)
        return data
    except Exception as e:
        print(f"Lỗi khi xử lý ngày {day}/{month}/{year}: {e}")
        return []


def get_days_in_month(year, month):
    """Return the number of days in the given month of the given year.

    Args:
        year: Year as an integer.
        month: Month number, 1 through 12.

    Returns:
        The day count of that month as an integer (leap years included).

    Raises:
        ValueError: If ``month`` is outside 1..12.
    """
    # Imported locally so this cell works on its own; calendar is stdlib.
    import calendar

    # monthrange returns (weekday of the first day, number of days). Unlike
    # "first day of next month minus one day" it also works for December of
    # the last supported year and rejects month 0 instead of returning 31.
    return calendar.monthrange(year, month)[1]

def crawl_month(year, month):
    """Crawl every day of one month with a dedicated headless Chrome driver.

    Opens the historic-weather page for the month, then calls
    ``crawl_day_data`` for each day, pausing one second between days. For the
    current month only the days up to today are requested.

    Args:
        year: Year as an integer.
        month: Month number as an integer.

    Returns:
        A list with the row dicts of all days, in day order.

    Raises:
        Whatever the driver start-up, page load or initial wait raises; the
        driver is always quit once it has been created.
    """
    print(f"Processing month {month}/{year}")
    days_in_month = get_days_in_month(year, month)

    # Read the clock once: separate datetime.now() calls could straddle a
    # midnight/month change and disagree with each other.
    now = datetime.now()
    if (year, month) == (now.year, now.month):
        days_in_month = min(days_in_month, now.day)

    options = Options()
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        url = f"https://www.timeanddate.com/weather/vietnam/da-nang/historic?month={month}&year={year}"
        driver.get(url)
        # Make sure the day selector exists before iterating over the days.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "wt-his-select"))
        )

        all_data = []
        for day in range(1, days_in_month + 1):
            all_data.extend(crawl_day_data(driver, year, month, day))
            time.sleep(1)
        return all_data
    finally:
        driver.quit()

In [None]:
if __name__ == "__main__":
    start_year = 2020
    end_year = 2025

    # Read the clock once so the year and month checks cannot disagree.
    now = datetime.now()

    # One task per (year, month); the current year stops at the current month.
    tasks = []
    for year in range(start_year, end_year + 1):
        max_month = now.month if year == now.year else 12
        for month in range(1, max_month + 1):
            tasks.append((year, month))

    all_data = []
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(crawl_month, year, month) for year, month in tasks]
        # Collect in task order. A month that raised is reported and skipped
        # so that it does not discard the records of every other month.
        for (year, month), future in zip(tasks, futures):
            try:
                all_data.extend(future.result())
            except Exception as e:
                print(f"Lỗi khi xử lý tháng {month}/{year}: {e}")

    df = pd.DataFrame(all_data)
    print("Đã thu thập xong dữ liệu.")
    print(f"Tổng số bản ghi: {len(df)}")
    print(df.head())

    # Missing values are written as the literal text "N/A".
    df.to_csv("raw_data.csv", index=False, na_rep="N/A")
    print("Đã lưu file CSV.")

Processing month 1/2020
Processing month 2/2020
Processing month 3/2020
Processing month 4/2020
Crawling 1/1/2020...
Crawling 2/1/2020...
Crawling 3/1/2020...
Crawling 1/2/2020...
Crawling 1/3/2020...
Crawling 4/1/2020...
Crawling 1/4/2020...
Crawling 2/2/2020...
Crawling 2/3/2020...
Crawling 5/1/2020...
Crawling 2/4/2020...
Crawling 3/2/2020...
Crawling 3/3/2020...
Crawling 6/1/2020...
Crawling 3/4/2020...
Crawling 4/2/2020...
Crawling 4/3/2020...
Crawling 7/1/2020...
Crawling 4/4/2020...
Crawling 5/2/2020...
Crawling 5/3/2020...
Crawling 8/1/2020...
Crawling 5/4/2020...
Crawling 6/2/2020...
Crawling 6/3/2020...
Crawling 9/1/2020...
Crawling 6/4/2020...
Crawling 7/2/2020...
Crawling 7/3/2020...
Crawling 10/1/2020...
Crawling 7/4/2020...
Crawling 8/2/2020...
Crawling 8/3/2020...
Crawling 11/1/2020...
Crawling 8/4/2020...
Crawling 9/2/2020...
Crawling 9/3/2020...
Crawling 12/1/2020...
Crawling 9/4/2020...
Crawling 10/2/2020...
Crawling 10/3/2020...
Crawling 13/1/2020...
Crawling 10/4/20