### Wind and Temperature testing data scraping using Selenium

#### read position files

In [15]:
import re  # regular expression

def split(text):
    """Extract the two coordinate tokens from the text of a position file.

    The text is cut at every single whitespace character (so consecutive
    whitespace produces empty pieces that still occupy an index), and the
    pieces at index 2 and index 5 are returned as strings, in that order.
    Callers in this notebook store them as latitude and longitude.

    Raises:
        IndexError: if the text yields fewer than six pieces.
    """
    tokens = re.split(r"\n|\s", text)
    latitude, longitude = tokens[2], tokens[5]
    return latitude, longitude
    
# One coordinate record per station, filled in once the position files are read.
# The generator builds a separate dict for every name, so the records are
# independent objects and updating one never affects another.
(
    position_bkk,
    position_cm,
    position_kk,
    position_ry,
    position_sb,
    position_sr,
) = ({'lat': None, 'long': None} for _ in range(6))


In [16]:
# Open every station's position.txt in text read mode. The handles are
# consumed by the following cell and closed explicitly further down.
# NOTE(review): the handles stay open across several cells; if a cell in
# between fails they are never closed. A `with open(...)` block that reads
# the text immediately would be safer.
f_bkk = open('./datasci_dataset_2022/BKK/position.txt', 'r')
f_cm = open('./datasci_dataset_2022/Chiangmai/position.txt', 'r')
f_kk = open('./datasci_dataset_2022/Khonkaen/position.txt', 'r')
f_ry = open('./datasci_dataset_2022/Rayong/position.txt', 'r')
f_sb = open('./datasci_dataset_2022/Saraburi/position.txt', 'r')
f_sr = open('./datasci_dataset_2022/Surat/position.txt', 'r')


In [17]:
# Parse each open position file and store its coordinates in the matching
# station record. Each file is read to the end, so this cell can only be
# run once per set of open handles.
for station_position, position_file in (
    (position_bkk, f_bkk),
    (position_cm, f_cm),
    (position_kk, f_kk),
    (position_ry, f_ry),
    (position_sb, f_sb),
    (position_sr, f_sr),
):
    station_position['lat'], station_position['long'] = split(position_file.read())

In [18]:
# Show the parsed coordinates of every station as a quick sanity check.
print(f"BKK: {position_bkk}\nChiangmai: {position_cm}\nKhonkaen: {position_kk}\nRayong: {position_ry}\nSaraburi: {position_sb}\nSurat: {position_sr}")

BKK: {'lat': '13.729984', 'long': '100.536443'}
Chiangmai: {'lat': '18.840633', 'long': '98.969661'}
Khonkaen: {'lat': '16.445329', 'long': '102.835251'}
Rayong: {'lat': '12.671521', 'long': '101.275875'}
Saraburi: {'lat': '14.685833', 'long': '100.871996'}
Surat: {'lat': '9.126057', 'long': '99.325355'}


In [19]:
# The coordinates are already stored in the position dicts, so release
# every position-file handle.
for position_file in (f_bkk, f_cm, f_kk, f_ry, f_sb, f_sr):
    position_file.close()

#### data scraping

In [20]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

In [21]:
# Let webdriver_manager download the Edge driver binary (or reuse its cached
# copy) and start an Edge browser session. `driver` is the module-level
# session that scrape_wind() uses; it must be closed with driver.quit().
service = Service(executable_path=EdgeChromiumDriverManager().install())
driver = webdriver.Edge(service=service)

[WDM] - Current edge version is 99.0.1150
[WDM] - Get LATEST edgedriver version for 99.0.1150 Edge
[WDM] - Trying to download new driver from https://msedgedriver.azureedge.net/99.0.1150.39/edgedriver_win64.zip
[WDM] - Driver has been saved in cache [C:\Users\FACT-PC\.wdm\drivers\edgedriver\win64\99.0.1150.39]


In [22]:
import pandas as pd

# Scraping window, written in local time (UTC+07:00) and converted to
# timezone-aware UTC timestamps.
start_date, end_date = pd.to_datetime(
    ['2020-07-01 01:00 +0700', '2021-07-01 22:00 +0700'],
    utc=True,
)
print(start_date, end_date)

2020-06-30 18:00:00+00:00 2021-07-01 15:00:00+00:00


In [23]:
import os

def scrape_wind(date, lat, long):
    """Read the wind label shown by classic.nullschool.net for one time and place.

    Args:
        date: URL fragment selecting the time; the caller passes it in the
            form '#YYYY/MM/DD/HHMMZ'.
        lat: latitude, inserted into the URL after the longitude.
        long: longitude, inserted into the URL before the latitude.

    Returns:
        The inner text of the '#location-wind' element split on '° @ '.
        The caller treats a two-item list as a successful read and uses
        item 0 as the direction and item 1 as the speed; any other length
        means the label was not populated when it was read.
    """
    driver.get(
        f'https://classic.nullschool.net/{date}/wind/isobaric/850hPa/orthographic/loc={long},{lat}'
    )
    # NOTE(review): presumably the reload is needed because consecutive URLs
    # differ only in the part after '#', which does not trigger a page load
    # on its own - confirm.
    driver.refresh()
    label_text = driver.find_element(
        by=By.CSS_SELECTOR, value='#location-wind'
    ).get_attribute('innerText')
    return label_text.split('° @ ')

def scrape_wind_data(start_date, end_date, position, f_dir):
    """Scrape one wind reading every 3 hours and append the rows to a station CSV.

    Rows go to
    ``./datasci_dataset_2022/<f_dir.capitalize()>/test/<f_dir>_wind_test.csv``
    with the columns ``date_time,wind speed,wind dir``. When the file already
    contains data rows, scraping resumes 3 hours after the timestamp of the
    last row, so an interrupted run can simply be started again.

    Args:
        start_date: timezone-aware pandas Timestamp of the first reading.
            It is formatted into the URL with a 'Z' suffix, so it is expected
            to be in UTC. Ignored when the CSV already holds data rows.
        end_date: timezone-aware pandas Timestamp of the last reading
            (inclusive).
        position: dict whose 'lat' and 'long' entries are passed to
            scrape_wind().
        f_dir: station name used for both the directory (capitalized) and
            the file-name prefix (as given).

    Returns:
        None. Every written row is also printed.
    """
    header = 'date_time,wind speed,wind dir'
    step = pd.Timedelta(hours=3)
    # NOTE(review): 'bkk'.capitalize() gives 'Bkk', while the position file is
    # read from a directory named 'BKK'. That only resolves to the same folder
    # on a case-insensitive file system - confirm the intended directory name.
    path = f'./datasci_dataset_2022/{f_dir.capitalize()}/test/{f_dir}_wind_test.csv'

    # Read what a previous run left behind. The handle is closed right away.
    rows = []
    ends_with_newline = True
    if os.path.exists(path) and os.stat(path).st_size > 0:
        with open(path, 'r') as f_r:
            content = f_r.read()
        ends_with_newline = content.endswith('\n')
        # Skip blank lines so a trailing empty line cannot break the resume.
        rows = [line for line in content.splitlines() if line.strip()]

    if rows:
        last_line = rows[-1]
        print(f"lastline: {last_line}")
        if last_line != header:
            # Saved timestamps are local time (UTC+07:00); the first 16
            # characters are 'YYYY-MM-DD HH:MM'.
            last_date = pd.to_datetime([last_line[:16] + " +0700"], utc=True)
            start_date = last_date[0] + step

    # `with` guarantees the file is closed even if scraping raises.
    with open(path, 'a') as f_w:
        if not ends_with_newline:
            # Never glue a new row onto an unterminated last line.
            f_w.write('\n')
        if not rows:
            f_w.write(header + '\n')
            f_w.flush()

        unsaved = 0
        while start_date <= end_date:
            scrape_date = start_date.strftime('#%Y/%m/%d/%H%MZ')
            save_date = start_date.tz_convert('Asia/Bangkok').strftime('%Y-%m-%d %H:%M:%S')
            # Retry until the page yields both direction and speed.
            # NOTE(review): there is no retry limit or delay, so a reading
            # that never loads keeps this loop running forever.
            while True:
                wind_info = scrape_wind(scrape_date, position['lat'], position['long'])
                if len(wind_info) == 2:
                    break
            row = f"{save_date},{wind_info[1]},{wind_info[0]}"
            f_w.write(row + "\n")
            unsaved += 1
            if unsaved == 8:
                # Eight 3-hour readings make one day: push them to disk so an
                # interruption loses at most a day of scraping.
                f_w.flush()
                unsaved = 0
            print(row)
            start_date += step

In [None]:
# Scrape wind data for BKK and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_bkk, 'bkk')

In [None]:
# Scrape wind data for Chiangmai and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_cm, 'chiangmai')

In [None]:
# Scrape wind data for Khonkaen and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_kk, 'khonkaen')

In [None]:
# Scrape wind data for Rayong and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_ry, 'rayong')

In [None]:
# Scrape wind data for Saraburi and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_sb, 'saraburi')

In [None]:
# Scrape wind data for Surat and append it to that station's test CSV
scrape_wind_data(start_date, end_date, position_sr, 'surat')

In [None]:
driver.quit()  # End the Selenium WebDriver session started in the setup cell