<h3> Load Libraries </h3>

In [1]:
import numpy as np
import pandas as pd
import os
from selenium.webdriver.support import ui
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import time
import requests
import urllib.request
from bs4 import BeautifulSoup
import datetime
import schedule
from apscheduler.schedulers.background import BackgroundScheduler
from pytz import utc
import logging
import random
from selenium.webdriver.chrome.options import Options

<h3> Global Variables </h3>

In [2]:
# Directory that holds the input city list and receives the exported CSV files.
# Set the WEATHER_DATA_DIR environment variable to override the author's local default.
base_path = os.environ.get("WEATHER_DATA_DIR", "/Users/saraawad/Desktop/Datasets/Google/")

<h3> Helpers </h3>

In [3]:
def remove_symbol(string, symbol):
    """Return ``string`` with every occurrence of ``symbol`` deleted."""
    stripped = string.replace(symbol, "")
    return stripped

def remove_symbols(string):
    """Strip the unit markers used in the scraped values from ``string``.

    The markers are removed one after another in the order listed below and
    the remaining text is returned unchanged otherwise (surrounding spaces
    are kept).
    """
    # Order matters: "km" is stripped before "km/h" is tried, so the "km/h"
    # entry never matches in practice; the leftover "/h" is removed last.
    unit_markers = ("°", "%", "mm", "cm", "km", "km/h", "/h")
    for marker in unit_markers:
        string = string.replace(marker, "")
    return string

In [4]:
def get_driver(latitude, longitude,
               driver_path='/Users/saraawad/Downloads/opt/WebDriver/bin/chromedriver',
               timeout=10):
    """Open Chrome on the AccuWeather hourly-forecast page for a coordinate pair.

    Parameters
    ----------
    latitude, longitude : str
        Coordinates already formatted as strings; they are concatenated into
        the location-search URL as-is.
    driver_path : str, optional
        Path to the chromedriver executable. Defaults to the original
        hard-coded location.
    timeout : int or float, optional
        Maximum number of seconds to wait for the hourly-forecast link to
        become clickable.

    Returns
    -------
    selenium.webdriver.Chrome
        The driver after the hourly-forecast link has been clicked. The
        caller owns it and is responsible for calling ``quit()``.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the hourly-forecast link is not clickable within ``timeout``
        seconds. The browser is closed before the exception propagates.
    """
    # 1. Build the location-search URL from the coordinates.
    url = "https://www.accuweather.com/en/search-locations?query="+latitude+"%2C+"+longitude
    chrome_options = Options()
    # Add the "--headless" argument here to run without a visible window.
    chrome_options.add_argument("--window-size=1920x1080")
    driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path)

    try:
        driver.get(url)
        print("Current URL:", driver.current_url)

        # 2. Actually wait for the hourly link instead of clicking blindly;
        #    previously the WebDriverWait object was created but never used.
        wait = WebDriverWait(driver, timeout)
        hourly_link = wait.until(
            EC.element_to_be_clickable((By.XPATH, "/html/body/div/div[4]/div[1]/div[3]/a[3]"))
        )

        # 3. Navigate to the hourly url.
        hourly_link.click()
        print("Current URL:", driver.current_url)
    except Exception:
        # Do not leave an orphaned browser process behind when navigation fails.
        driver.quit()
        raise

    return driver

In [5]:
def get_page_content(driver):
    """Parse the driver's current page source with the 'html.parser' backend."""
    return BeautifulSoup(driver.page_source, 'html.parser')

def get_temp_per(soup):
    """Build the per-hour header table from the parsed forecast page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page; only ``soup.select`` and ``get_text`` on the matched
        nodes are used.

    Returns
    -------
    pandas.DataFrame
        One row per hour with the columns "Time", "Date", "Year",
        "Temperature" and "Percipitation". "Year" is the current year at
        call time. All scraped values are strings.

    Raises
    ------
    ValueError
        Raised by pandas when the scraped lists do not have equal lengths.
    """
    # "div.date p" matches the time and the date paragraph of every entry;
    # the times sit at the even positions.
    times = [tag.get_text().strip() for tag in soup.select("div.date p")[::2]]

    # Dates
    dates = [tag.get_text().strip() for tag in soup.select("div.date p.sub")]

    # Temperatures in Celsius, without the degree sign
    temperatures = [
        remove_symbol(tag.get_text().strip(), "°")
        for tag in soup.select("div.temp.metric")
    ]

    # Precipitation: drop the percent sign and the "Precip" label prefix
    preci = [
        remove_symbol(tag.get_text().strip(), "%").replace("Precip\n\t\t\t", "")
        for tag in soup.select("div.precip")
    ]

    header_df = pd.DataFrame({
        "Time": times,
        "Date": dates,
        "Year": datetime.datetime.now().year,
        "Temperature": temperatures,
        # Use the full per-hour list. The previous code passed the loop
        # variable, which broadcast the last hour's value to every row.
        # The (misspelled) column name is kept so existing CSV consumers still work.
        "Percipitation": preci
    })

    return header_df

In [6]:
def get_detailed_info(soup):
    """Build the per-hour detail table from the forecast cards' panel text.

    Every ``<p>`` under ``div.hourly-forecast-card-content div.panel`` is
    expected to read ``"<label>: <value>"``. The values are cleaned with
    ``remove_symbols`` and consumed in groups of 12; the first 11 values of
    each complete group become one row and the 12th is discarded. A trailing
    incomplete group is ignored.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed page; only ``soup.select`` and ``get_text`` on the matched
        nodes are used.

    Returns
    -------
    pandas.DataFrame
        Columns "RealFeel", "Wind", "Gusts", "Humidity", "Dew Point",
        "Max UV Index", "Cloud Cover", "Rain", "Snow", "Ice", "Visibility"
        and "Wind Unit". The frame is empty (but has these columns) when the
        page holds no complete group.

    Raises
    ------
    IndexError
        If a matched paragraph contains no ":" separator.
    """
    value_columns = ["RealFeel", "Wind", "Gusts", "Humidity", "Dew Point", "Max UV Index",
                     "Cloud Cover", "Rain", "Snow", "Ice", "Visibility"]
    items_per_card = 12  # paragraphs consumed per row; only len(value_columns) of them are kept
    wind_position = value_columns.index("Wind")

    panel = soup.select("div.hourly-forecast-card-content div.panel p")
    details = [remove_symbols(pt.get_text().strip().split(":")[1].strip()) for pt in panel]

    rows = []
    for start in range(0, len(details) - items_per_card + 1, items_per_card):
        row = details[start:start + len(value_columns)]

        # After unit stripping the wind text splits into three tokens on " ":
        # the first is stored as "Wind", the third is appended to the unit.
        wind_parts = str(row[wind_position]).split(" ")
        row[wind_position] = wind_parts[0]
        # Tolerate a missing third token instead of raising IndexError.
        third_token = wind_parts[2] if len(wind_parts) > 2 else ""
        row.append("km/h " + third_token)

        rows.append(row)

    # Passing the column names to the constructor also yields a well-formed
    # (empty) frame when no rows were scraped, instead of a ValueError.
    content_df = pd.DataFrame(rows, columns=value_columns + ["Wind Unit"])

    return content_df

In [7]:
def generate_weather_data(driver):
    """Scrape the driver's current page into a single weather table.

    The header table from ``get_temp_per`` and the detail table from
    ``get_detailed_info`` are placed side by side (column-wise concat).
    """
    page = get_page_content(driver)
    frames = [get_temp_per(page), get_detailed_info(page)]
    return pd.concat(frames, axis=1)

In [8]:
def save_data(latitude, longitude, city_name):
    """Scrape the hourly forecast for one location and export it as CSV.

    Parameters
    ----------
    latitude, longitude : str
        Coordinates passed through to ``get_driver``.
    city_name : str
        Label used as the suffix of the exported file name.

    The file is written to ``base_path`` and named
    ``"<year>-<month>-<day> <HH><AM|PM>_<city_name>.csv"`` using the local
    time at which scraping finished (month and day are not zero-padded).
    """
    driver = get_driver(latitude, longitude)
    try:
        weather_df = generate_weather_data(driver)
    finally:
        # Always release the browser, even when scraping raises.
        driver.quit()

    date = datetime.datetime.now()
    time_str = date.strftime("%I%p")
    date_str = str(date.year) + "-" + str(date.month) + "-" + str(date.day) + " " + time_str
    file_name = date_str + "_" + city_name + ".csv"
    print("file name:", file_name)
    export_path = os.path.join(base_path, file_name)
    weather_df.to_csv(export_path, index=None, header=True)

In [10]:
# Load the pre-defined city list from base_path.
# weather_forcast reads the rows positionally:
# column 0 = city name, column 1 = latitude, column 2 = longitude.
full_path = os.path.join(base_path, "cities_locations.xlsx")
df = pd.read_excel(full_path)

def weather_forcast():
    """Scrape and save the hourly forecast for every city in the module-level ``df``.

    Each row of ``df`` is read positionally as (city name, latitude,
    longitude) and converted to strings before being passed to ``save_data``.
    A random pause of 0-9 seconds is inserted between consecutive cities.

    A failure for one city is logged and the remaining cities are still
    processed, so a single bad page does not abort the whole batch.

    Raises
    ------
    RuntimeError
        After all cities were attempted, if at least one of them failed.
        Raising keeps the failure visible to the scheduler running this job.
    """
    failed_cities = []
    city_rows = list(df.itertuples(index=False, name=None))
    last_position = len(city_rows) - 1

    for position, row in enumerate(city_rows):
        city_name = str(row[0])
        latitude = str(row[1])
        longitude = str(row[2])

        try:
            save_data(latitude, longitude, city_name)
        except Exception:
            logging.exception("Failed to save weather data for %s", city_name)
            failed_cities.append(city_name)

        # Add random delay between each request (not needed after the last one)
        if position < last_position:
            time.sleep(random.randrange(0, 10))

    if failed_cities:
        raise RuntimeError("Weather scraping failed for: " + ", ".join(failed_cities))
        

In [10]:
# Configure logging first so scheduler messages and any start-up failure are visible.
logging.basicConfig()
logging.getLogger('apscheduler').setLevel(logging.DEBUG)
# The except branch below previously used an undefined `logger`, which turned
# any scheduler error into a NameError.
logger = logging.getLogger(__name__)

try:
    scheduler = BackgroundScheduler()
    scheduler.configure()

    # NOTE(review): `date` is not used in this cell; kept in case other cells read it.
    date = datetime.datetime.now()

    # One cron job per daily run: 07:00, 11:00 and 16:00, every day of the week.
    for run_hour in (7, 11, 16):
        scheduler.add_job(weather_forcast, 'cron', day_of_week="mon-sun", hour=run_hour, minute=0)

    scheduler.start()
except Exception as e:
    logger.exception(e)
    

INFO:apscheduler.scheduler:Added job "weather_forcast" to job store "default"
INFO:apscheduler.scheduler:Added job "weather_forcast" to job store "default"
INFO:apscheduler.scheduler:Added job "weather_forcast" to job store "default"
INFO:apscheduler.scheduler:Scheduler started
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-21 16:00:00+02:00 (in 15432.103950 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
INFO:apscheduler.executors.default:Running job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='16', minute='0'], next run at: 2019-12-21 16:00:00 EET)" (scheduled at 2019-12-21 16:00:00+02:00)
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-22 07:00:00+02:00 (in 53999.341216 seconds)


Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/weather-forecast/226717
Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/hourly-weather-forecast/226717
file name: 2019-12-21 04PM_Akkar.csv
Current URL: https://www.accuweather.com/en/lb/adlun/228606/weather-forecast/228606
Current URL: https://www.accuweather.com/en/lb/adlun/228606/hourly-weather-forecast/228606
file name: 2019-12-21 04PM_Adloun.csv
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/weather-forecast/229249
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/hourly-weather-forecast/229249
file name: 2019-12-21 04PM_Damour.csv
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/weather-forecast/227230
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/hourly-weather-forecast/227230
file name: 2019-12-21 04PM_AlQaa.csv
Current URL: https://www.accuweather.com/en/lb/jubb-jannin/227350/weather-forecast/227350
Current URL: https://www.accuwea

INFO:apscheduler.executors.default:Job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='16', minute='0'], next run at: 2019-12-22 16:00:00 EET)" executed successfully
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-22 11:00:00+02:00 (in 14398.946523 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
INFO:apscheduler.executors.default:Running job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='11', minute='0'], next run at: 2019-12-22 11:00:00 EET)" (scheduled at 2019-12-22 11:00:00+02:00)
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-22 16:00:00+02:00 (in 17999.737901 seconds)


Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/weather-forecast/226717
Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/hourly-weather-forecast/226717
file name: 2019-12-22 11AM_Akkar.csv
Current URL: https://www.accuweather.com/en/lb/adlun/228606/weather-forecast/228606
Current URL: https://www.accuweather.com/en/lb/adlun/228606/hourly-weather-forecast/228606
file name: 2019-12-22 11AM_Adloun.csv
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/weather-forecast/229249
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/hourly-weather-forecast/229249
file name: 2019-12-22 11AM_Damour.csv
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/weather-forecast/227230
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/hourly-weather-forecast/227230
file name: 2019-12-22 11AM_AlQaa.csv
Current URL: https://www.accuweather.com/en/lb/jubb-jannin/227350/weather-forecast/227350
Current URL: https://www.accuwea

INFO:apscheduler.executors.default:Job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='11', minute='0'], next run at: 2019-12-23 11:00:00 EET)" executed successfully
DEBUG:apscheduler.scheduler:Looking for jobs to run
INFO:apscheduler.executors.default:Running job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='16', minute='0'], next run at: 2019-12-22 16:00:00 EET)" (scheduled at 2019-12-22 16:00:00+02:00)
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-23 07:00:00+02:00 (in 53999.663577 seconds)


Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/weather-forecast/226717
Current URL: https://www.accuweather.com/en/lb/akkar-al-atiqah/226717/hourly-weather-forecast/226717
file name: 2019-12-22 04PM_Akkar.csv
Current URL: https://www.accuweather.com/en/lb/adlun/228606/weather-forecast/228606
Current URL: https://www.accuweather.com/en/lb/adlun/228606/hourly-weather-forecast/228606
file name: 2019-12-22 04PM_Adloun.csv
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/weather-forecast/229249
Current URL: https://www.accuweather.com/en/lb/ed-damour/229249/hourly-weather-forecast/229249
file name: 2019-12-22 04PM_Damour.csv
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/weather-forecast/227230
Current URL: https://www.accuweather.com/en/lb/al-qa/227230/hourly-weather-forecast/227230
file name: 2019-12-22 04PM_AlQaa.csv
Current URL: https://www.accuweather.com/en/lb/jubb-jannin/227350/weather-forecast/227350
Current URL: https://www.accuwea

INFO:apscheduler.executors.default:Job "weather_forcast (trigger: cron[day_of_week='mon-sun', hour='16', minute='0'], next run at: 2019-12-23 16:00:00 EET)" executed successfully
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-23 16:00:00+02:00 (in 15717.567340 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-24 07:00:00+02:00 (in 51948.360738 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-24 11:00:00+02:00 (in 1989.742522 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-24 16:00:00+02:00 (in 17843.102770 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
DEBUG:apscheduler.scheduler:Next wakeup is due at 2019-12-25 07:00:00+02:00 (in 35542.585809 seconds)
DEBUG:apscheduler.scheduler:Looking for jobs to run
