### Wind and Temperature testing data scraping using Selenium

#### read position files

In [2]:
import re  # regular expression

def split(text):
    """Return the latitude and longitude tokens found in a position file's text.

    The text is cut at every individual whitespace character, so consecutive
    whitespace characters produce empty tokens that still occupy an index.
    The tokens at index 2 and index 5 are returned, in that order, as strings.

    Raises:
        IndexError: if the text yields fewer than six tokens.
    """
    tokens = re.split(r"\n|\s", text)
    latitude = tokens[2]
    longitude = tokens[5]
    return latitude, longitude
    
# One coordinate record per station; both entries start as None and are
# filled in from the position files by a later cell.
position_bkk = dict.fromkeys(('lat', 'long'))
position_cm = dict.fromkeys(('lat', 'long'))
position_kk = dict.fromkeys(('lat', 'long'))
position_ry = dict.fromkeys(('lat', 'long'))
position_sb = dict.fromkeys(('lat', 'long'))
position_sr = dict.fromkeys(('lat', 'long'))


In [3]:
# Open each station's position file in text mode for reading.
# The handles are read and then explicitly closed in later cells, so they
# stay open until the closing cell has run.
f_bkk = open('./datasci_dataset_2022/BKK/position.txt', 'r')
f_cm = open('./datasci_dataset_2022/Chiangmai/position.txt', 'r')
f_kk = open('./datasci_dataset_2022/Khonkaen/position.txt', 'r')
f_ry = open('./datasci_dataset_2022/Rayong/position.txt', 'r')
f_sb = open('./datasci_dataset_2022/Saraburi/position.txt', 'r')
f_sr = open('./datasci_dataset_2022/Surat/position.txt', 'r')


In [4]:
# Read each position file once and store the pair returned by split()
# under the 'lat' and 'long' keys of the matching station record.
position_bkk.update(zip(('lat', 'long'), split(f_bkk.read())))
position_cm.update(zip(('lat', 'long'), split(f_cm.read())))
position_kk.update(zip(('lat', 'long'), split(f_kk.read())))
position_ry.update(zip(('lat', 'long'), split(f_ry.read())))
position_sb.update(zip(('lat', 'long'), split(f_sb.read())))
position_sr.update(zip(('lat', 'long'), split(f_sr.read())))

In [5]:
# Show the parsed coordinates of every station, one station per line.
print(
    f"BKK: {position_bkk}\n"
    f"Chiangmai: {position_cm}\n"
    f"Khonkaen: {position_kk}\n"
    f"Rayong: {position_ry}\n"
    f"Saraburi: {position_sb}\n"
    f"Surat: {position_sr}"
)

BKK: {'lat': '13.729984', 'long': '100.536443'}
Chiangmai: {'lat': '18.840633', 'long': '98.969661'}
Khonkaen: {'lat': '16.445329', 'long': '102.835251'}
Rayong: {'lat': '12.671521', 'long': '101.275875'}
Saraburi: {'lat': '14.685833', 'long': '100.871996'}
Surat: {'lat': '9.126057', 'long': '99.325355'}


In [6]:
# Release the position file handles opened in the earlier cell; their
# contents have already been copied into the position_* dictionaries.
f_bkk.close()
f_cm.close()
f_kk.close()
f_ry.close()
f_sb.close()
f_sr.close()

#### data scraping

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager

In [8]:
# webdriver_manager's install() result is passed to Selenium as the path of
# the Edge driver executable.
service = Service(executable_path=EdgeChromiumDriverManager().install())
# Start the Edge browser session. The scraping functions defined in later
# cells use this module-level `driver` directly.
driver = webdriver.Edge(service=service)

[WDM] - Current edge version is 99.0.1150
[WDM] - Get LATEST edgedriver version for 99.0.1150 Edge
[WDM] - Trying to download new driver from https://msedgedriver.azureedge.net/99.0.1150.39/edgedriver_win64.zip
[WDM] - Driver has been saved in cache [C:\Users\nine_\.wdm\drivers\edgedriver\win64\99.0.1150.39]


In [9]:
import pandas as pd

# Scraping window, written with a +07:00 offset and converted to
# timezone-aware UTC timestamps.
start_date = pd.to_datetime('2020-07-01 01:00 +0700', utc=True)
end_date = pd.to_datetime('2021-07-01 22:00 +0700', utc=True)
print(start_date, end_date)

2020-06-30 18:00:00+00:00 2021-07-01 15:00:00+00:00


##### wind scraping

In [23]:
import os

def scrape_wind(date, lat, long):
    """Load the 850 hPa wind page for one moment and location and read the wind readout.

    Args:
        date: date part of the URL; the caller in this notebook passes a
            string of the form '#YYYY/MM/DD/HHMMZ'.
        lat: latitude placed in the URL's ``loc=`` pair.
        long: longitude placed in the URL's ``loc=`` pair.

    Returns:
        The innerText of the ``#location-wind`` element split on '° @ '.
        The caller treats a two-item result as [direction, speed] and any
        other length as "not ready yet".
    """
    driver.get(
        f'https://classic.nullschool.net/{date}/wind/isobaric/850hPa/orthographic/loc={long},{lat}'
    )
    # Reload after navigating. NOTE(review): presumably needed because only the
    # '#...' fragment of the URL changes between calls - confirm.
    driver.refresh()
    readout = driver.find_element(by=By.CSS_SELECTOR, value='#location-wind').get_attribute('innerText')
    return readout.split('° @ ')

def scrape_wind_data(start_date, end_date, position, f_dir):
    """Scrape one wind reading every 3 hours and append the rows to the station's CSV.

    Rows go to
    ``./datasci_dataset_2022/<f_dir.capitalize()>/test/<f_dir>_wind_test.csv``.
    A missing or empty file first receives the header row. When the file
    already holds data rows, scraping resumes 3 hours after the timestamp on
    its last line (read as +07:00 time) and the ``start_date`` argument is
    ignored, so an interrupted run can simply be started again. A last line
    that lacks its trailing newline is terminated before new rows are added.

    Args:
        start_date: timezone-aware pandas Timestamp of the first reading.
        end_date: timezone-aware pandas Timestamp of the last reading (inclusive).
        position: dict whose 'lat' and 'long' entries are passed to scrape_wind.
        f_dir: station name used to build the output path.

    Note:
        Each reading is requested again, with no limit or delay, until
        scrape_wind returns exactly two parts.
    """
    # NOTE(review): capitalize() turns 'bkk' into 'Bkk', while the position
    # file cell reads from 'BKK/'. Those name the same directory only on a
    # case-insensitive filesystem - confirm the intended directory name.
    path = f'./datasci_dataset_2022/{f_dir.capitalize()}/test/{f_dir}_wind_test.csv'
    header = 'date_time,wind speed,wind dir'
    step = pd.Timedelta(hours=3)
    ends_with_newline = True

    if not os.path.exists(path) or os.stat(path).st_size == 0:
        with open(path, 'a') as f_w:
            f_w.write(header + '\n')
    else:
        # Context manager so the read handle is always released.
        with open(path, 'r') as f_r:
            tail = f_r.readlines()[-1]
        # Strip only a real newline: slicing off the last character would eat
        # data when the file does not end with one.
        ends_with_newline = tail.endswith('\n')
        last_line = tail.rstrip('\n')
        if last_line != header:
            # The first 16 characters are 'YYYY-MM-DD HH:MM' as written below.
            last_date = pd.to_datetime([last_line[:16] + " +0700"], utc=True)
            start_date = last_date[0] + step

    rows_since_flush = 0
    # The writer is closed (and its buffer written out) even if scraping raises.
    with open(path, 'a') as f_w:
        if not ends_with_newline:
            # Keep the next row from being glued onto the unterminated last line.
            f_w.write('\n')
        while start_date <= end_date:
            scrape_date = start_date.strftime('#%Y/%m/%d/%H%MZ')
            save_date = start_date.tz_convert('Asia/Bangkok').strftime('%Y-%m-%d %H:%M:%S')
            while True:
                wind_info = scrape_wind(scrape_date, position['lat'], position['long'])
                if len(wind_info) == 2:  # both parts present: the reading is usable
                    break
            start_date += step
            row = f"{save_date},{wind_info[1]},{wind_info[0]}"
            f_w.write(row + '\n')
            rows_since_flush += 1
            if rows_since_flush == 8:  # eight 3-hour rows = one day of data
                f_w.flush()
                rows_since_flush = 0
            print(row)

In [None]:
# Scrape wind data for BKK over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Bkk/test/bkk_wind_test.csv
# NOTE(review): the position file cell reads from 'BKK/', not 'Bkk/' - the two
# match only on a case-insensitive filesystem; confirm the intended directory name.
scrape_wind_data(start_date, end_date, position_bkk, 'bkk')

In [None]:
# Scrape wind data for Chiangmai over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Chiangmai/test/chiangmai_wind_test.csv
scrape_wind_data(start_date, end_date, position_cm, 'chiangmai')

In [None]:
# Scrape wind data for Khonkaen over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Khonkaen/test/khonkaen_wind_test.csv
scrape_wind_data(start_date, end_date, position_kk, 'khonkaen')

In [None]:
# Scrape wind data for Rayong over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Rayong/test/rayong_wind_test.csv
scrape_wind_data(start_date, end_date, position_ry, 'rayong')

In [None]:
# Scrape wind data for Saraburi over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Saraburi/test/saraburi_wind_test.csv
scrape_wind_data(start_date, end_date, position_sb, 'saraburi')

In [None]:
# Scrape wind data for Surat over start_date..end_date.
# Output path built by scrape_wind_data: ./datasci_dataset_2022/Surat/test/surat_wind_test.csv
scrape_wind_data(start_date, end_date, position_sr, 'surat')

In [42]:
# Close the Selenium browser session.
# NOTE(review): the temp-scraping cells further down call the same module-level
# `driver`; when the notebook is run top to bottom this quit() happens before
# them - confirm whether this cell is meant to run only when stopping after the
# wind section.
driver.quit()

##### temp scraping

In [11]:
import os

def scrape_temp(date, lat, long):
    """Load the surface temperature overlay page for one moment and location and read its value.

    Args:
        date: date part of the URL; the caller in this notebook passes a
            string of the form '#YYYY/MM/DD/HHMMZ'.
        lat: latitude placed in the URL's ``loc=`` pair.
        long: longitude placed in the URL's ``loc=`` pair.

    Returns:
        The innerText of the ``#location-value`` element. The caller treats a
        falsy result (such as an empty string) as "not ready yet".
    """
    page = (
        f'https://classic.nullschool.net/{date}/wind/surface/level/overlay=temp/orthographic/loc={long},{lat}'
    )
    driver.get(page)
    # Reload after navigating. NOTE(review): presumably needed because only the
    # '#...' fragment of the URL changes between calls - confirm.
    driver.refresh()
    value_el = driver.find_element(by=By.CSS_SELECTOR, value='#location-value')
    return value_el.get_attribute('innerText')

def scrape_temp_data(start_date, end_date, position, f_dir):
    """Scrape one temperature reading every 3 hours and append the rows to the station's CSV.

    Rows go to
    ``./datasci_dataset_2022/<f_dir.capitalize()>/test/<f_dir>_temp_test.csv``.
    A missing or empty file first receives the header row. When the file
    already holds data rows, scraping resumes 3 hours after the timestamp on
    its last line (read as +07:00 time) and the ``start_date`` argument is
    ignored, so an interrupted run can simply be started again. A last line
    that lacks its trailing newline is terminated before new rows are added.

    Args:
        start_date: timezone-aware pandas Timestamp of the first reading.
        end_date: timezone-aware pandas Timestamp of the last reading (inclusive).
        position: dict whose 'lat' and 'long' entries are passed to scrape_temp.
        f_dir: station name used to build the output path.

    Note:
        Each reading is requested again, with no limit or delay, until
        scrape_temp returns a truthy value.
    """
    # NOTE(review): capitalize() turns 'bkk' into 'Bkk', while the position
    # file cell reads from 'BKK/'. Those name the same directory only on a
    # case-insensitive filesystem - confirm the intended directory name.
    path = f'./datasci_dataset_2022/{f_dir.capitalize()}/test/{f_dir}_temp_test.csv'
    header = 'date_time,temp'
    step = pd.Timedelta(hours=3)
    ends_with_newline = True

    if not os.path.exists(path) or os.stat(path).st_size == 0:
        with open(path, 'a') as f_w:
            f_w.write(header + '\n')
    else:
        # Context manager so the read handle is always released.
        with open(path, 'r') as f_r:
            tail = f_r.readlines()[-1]
        # Strip only a real newline: slicing off the last character would eat
        # data when the file does not end with one.
        ends_with_newline = tail.endswith('\n')
        last_line = tail.rstrip('\n')
        if last_line != header:
            # The first 16 characters are 'YYYY-MM-DD HH:MM' as written below.
            last_date = pd.to_datetime([last_line[:16] + " +0700"], utc=True)
            start_date = last_date[0] + step

    rows_since_flush = 0
    # The writer is closed (and its buffer written out) even if scraping raises.
    with open(path, 'a') as f_w:
        if not ends_with_newline:
            # Keep the next row from being glued onto the unterminated last line.
            f_w.write('\n')
        while start_date <= end_date:
            scrape_date = start_date.strftime('#%Y/%m/%d/%H%MZ')
            save_date = start_date.tz_convert('Asia/Bangkok').strftime('%Y-%m-%d %H:%M:%S')
            while True:
                temp_info = scrape_temp(scrape_date, position['lat'], position['long'])
                if temp_info:  # a non-empty readout means the value is available
                    break
            start_date += step
            row = f"{save_date},{temp_info}"
            f_w.write(row + '\n')
            rows_since_flush += 1
            if rows_since_flush == 8:  # eight 3-hour rows = one day of data
                f_w.flush()
                rows_since_flush = 0
            print(row)

In [11]:
# Scrape temp data for BKK over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Bkk/test/bkk_temp_test.csv
# NOTE(review): the position file cell reads from 'BKK/', not 'Bkk/' - the two
# match only on a case-insensitive filesystem; confirm the intended directory name.
scrape_temp_data(start_date, end_date, position_bkk, 'bkk')

2020-07-01 01:00:00,28.5
2020-07-01 04:00:00,28.3
2020-07-01 07:00:00,28.7
2020-07-01 10:00:00,29.7
2020-07-01 13:00:00,30.8
2020-07-01 16:00:00,29.5
2020-07-01 19:00:00,29.2
2020-07-01 22:00:00,28.7
2020-07-02 01:00:00,28.5
2020-07-02 04:00:00,28.4
2020-07-02 07:00:00,28.3
2020-07-02 10:00:00,30.8
2020-07-02 13:00:00,32.7
2020-07-02 16:00:00,31.6
2020-07-02 19:00:00,28.4
2020-07-02 22:00:00,29.1
2020-07-03 01:00:00,28.7
2020-07-03 04:00:00,28.3
2020-07-03 07:00:00,28.8
2020-07-03 10:00:00,29.2
2020-07-03 13:00:00,30.2
2020-07-03 16:00:00,29.7
2020-07-03 19:00:00,28.0
2020-07-03 22:00:00,27.2
2020-07-04 01:00:00,27.5
2020-07-04 04:00:00,27.4
2020-07-04 07:00:00,27.9
2020-07-04 10:00:00,29.8
2020-07-04 13:00:00,29.9
2020-07-04 16:00:00,26.5
2020-07-04 19:00:00,27.8
2020-07-04 22:00:00,27.9
2020-07-05 01:00:00,27.6
2020-07-05 04:00:00,27.2
2020-07-05 07:00:00,27.5
2020-07-05 10:00:00,28.5
2020-07-05 13:00:00,29.7
2020-07-05 16:00:00,30.6
2020-07-05 19:00:00,29.1
2020-07-05 22:00:00,28.1


In [13]:
# Scrape temp data for Chiangmai over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Chiangmai/test/chiangmai_temp_test.csv
scrape_temp_data(start_date, end_date, position_cm, 'chiangmai')

2020-10-27 01:00:00,21.0
2020-10-27 04:00:00,20.8
2020-10-27 07:00:00,20.9
2020-10-27 10:00:00,23.4
2020-10-27 13:00:00,28.1
2020-10-27 16:00:00,27.7
2020-10-27 19:00:00,22.5
2020-10-27 22:00:00,21.4
2020-10-28 01:00:00,20.5
2020-10-28 04:00:00,19.9
2020-10-28 07:00:00,20.6
2020-10-28 10:00:00,27.2
2020-10-28 13:00:00,30.8
2020-10-28 16:00:00,29.6
2020-10-28 19:00:00,23.4
2020-10-28 22:00:00,21.8
2020-10-29 01:00:00,20.9
2020-10-29 04:00:00,20.3
2020-10-29 07:00:00,21.3
2020-10-29 10:00:00,24.8
2020-10-29 13:00:00,24.7
2020-10-29 16:00:00,24.9
2020-10-29 19:00:00,22.3
2020-10-29 22:00:00,20.9
2020-10-30 01:00:00,20.9
2020-10-30 04:00:00,20.7
2020-10-30 07:00:00,20.7
2020-10-30 10:00:00,20.9
2020-10-30 13:00:00,21.3
2020-10-30 16:00:00,21.3
2020-10-30 19:00:00,20.8
2020-10-30 22:00:00,20.3
2020-10-31 01:00:00,20.0
2020-10-31 04:00:00,20.1
2020-10-31 07:00:00,20.9
2020-10-31 10:00:00,23.3
2020-10-31 13:00:00,27.4
2020-10-31 16:00:00,26.2
2020-10-31 19:00:00,22.2
2020-10-31 22:00:00,21.1


In [None]:
# Scrape temp data for Khonkaen over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Khonkaen/test/khonkaen_temp_test.csv
scrape_temp_data(start_date, end_date, position_kk, 'khonkaen')

In [None]:
# Scrape temp data for Rayong over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Rayong/test/rayong_temp_test.csv
scrape_temp_data(start_date, end_date, position_ry, 'rayong')

In [None]:
# Scrape temp data for Saraburi over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Saraburi/test/saraburi_temp_test.csv
scrape_temp_data(start_date, end_date, position_sb, 'saraburi')

In [None]:
# Scrape temp data for Surat over start_date..end_date.
# Output path built by scrape_temp_data: ./datasci_dataset_2022/Surat/test/surat_temp_test.csv
scrape_temp_data(start_date, end_date, position_sr, 'surat')

In [12]:
driver.quit()  # Close the Selenium browser session once temp scraping is finished