# Scraping weather data
This notebook scrapes weather data for Washington, DC from the website https://www.wunderground.com/history/daily/us/dc/washington/KDCA for the year 2017.


In [None]:
# Web-Scraping packages
from bs4 import BeautifulSoup
from selenium import webdriver

# Essential packages
import numpy as np
import pandas as pd
import time
import datetime
from datetimerange import DateTimeRange



In [None]:
# Opens a page in Chrome, waits 5 seconds (so the needed data are loaded), then returns the rendered HTML.
def render_page(url):
    """Return the rendered HTML source of ``url``.

    A new Chrome WebDriver session is started for every call. After
    navigating to the page the function sleeps for 5 seconds so that the
    needed data have time to load before the page source is read.

    Args:
        url: Address of the page to open.

    Returns:
        The page source reported by the driver after the wait.

    Raises:
        Whatever the driver raises while starting or navigating; the browser
        is still closed in that case.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Fixed pause: give the page time to load the needed data.
        time.sleep(5)
        return driver.page_source
    finally:
        # Always close the browser, even when navigation fails, so repeated
        # calls do not leave Chrome sessions behind.
        driver.quit()

In [None]:
# Main scraping function: renders one page per date and combines the tables.
def scraper(page, dates):
    """Scrape the observation table for every date in ``dates``.

    For each date the page ``page + date`` is rendered with ``render_page``,
    the ``tbody`` inside the ``lib-city-history-observation`` element is
    located, and every ``tr``/``td`` carrying the ``ng-star-inserted`` class
    is read into one table row. The date is prefixed to the first cell
    (which only holds the time of day on the page).

    Args:
        page: Base URL; each date is appended to it to form the address.
        dates: Iterable of date strings such as ``"2017-01-01"``.

    Returns:
        A tuple ``(output, fails)``. ``output`` is a DataFrame holding the
        rows of all successfully scraped dates (empty, but with the column
        names, when nothing was scraped). ``fails`` is the list of dates for
        which rendering or parsing raised an exception, so the caller can
        retry them.
    """
    columns = ["Date", "Temp", "Dew point", "Humidity", "Wind", "Wind speed",
               "Wind Gust", "Pressure", "Precip.", "Condition"]
    frames = []
    fails = []

    for d in dates:
        try:
            html = render_page(f"{page}{d}")

            # Find needed data
            soup = BeautifulSoup(html, "html.parser")
            container = soup.find('lib-city-history-observation')
            # If the container is missing, this raises AttributeError and the
            # date is recorded as failed below.
            body = container.find('tbody')

            # One record per table row; rows without any matching cell are
            # skipped. Building from lists avoids a fixed-size holder frame,
            # so the number of rows per day is not capped.
            records = []
            for tr in body.find_all('tr', class_='ng-star-inserted'):
                # strip() removes all surrounding whitespace, including
                # non-breaking spaces.
                cells = [td.text.strip()
                         for td in tr.find_all('td', class_='ng-star-inserted')]
                if cells:
                    records.append(dict(zip(columns, cells)))

            if records:
                data = pd.DataFrame(records, columns=columns)
                # The "Date" cell only has the time in it, so prefix the date.
                data["Date"] = [f"{d} {t}" for t in data["Date"]]
                frames.append(data)

        except Exception:
            # Deliberately broad: any rendering/parsing problem marks the date
            # for a retry. Unlike a bare ``except`` this still lets
            # KeyboardInterrupt stop a long scraping run.
            fails.append(d)

    # Concatenate once at the end instead of growing the frame inside the loop.
    if frames:
        output = pd.concat(frames, ignore_index=True)
    else:
        output = pd.DataFrame(columns=columns)

    return output, fails

In [None]:
# Creating a list of dates from 2017-01-01 to 2017-12-31 (both inclusive),
# one "YYYY-MM-DD" string per day.

dates = pd.date_range("2017-01-01", "2017-12-31", freq="D").strftime("%Y-%m-%d").tolist()

dates

['2017-01-01',
 '2017-01-02',
 '2017-01-03',
 '2017-01-04',
 '2017-01-05',
 '2017-01-06',
 '2017-01-07',
 '2017-01-08',
 '2017-01-09',
 '2017-01-10',
 '2017-01-11',
 '2017-01-12',
 '2017-01-13',
 '2017-01-14',
 '2017-01-15',
 '2017-01-16',
 '2017-01-17',
 '2017-01-18',
 '2017-01-19',
 '2017-01-20',
 '2017-01-21',
 '2017-01-22',
 '2017-01-23',
 '2017-01-24',
 '2017-01-25',
 '2017-01-26',
 '2017-01-27',
 '2017-01-28',
 '2017-01-29',
 '2017-01-30',
 '2017-01-31',
 '2017-02-01',
 '2017-02-02',
 '2017-02-03',
 '2017-02-04',
 '2017-02-05',
 '2017-02-06',
 '2017-02-07',
 '2017-02-08',
 '2017-02-09',
 '2017-02-10',
 '2017-02-11',
 '2017-02-12',
 '2017-02-13',
 '2017-02-14',
 '2017-02-15',
 '2017-02-16',
 '2017-02-17',
 '2017-02-18',
 '2017-02-19',
 '2017-02-20',
 '2017-02-21',
 '2017-02-22',
 '2017-02-23',
 '2017-02-24',
 '2017-02-25',
 '2017-02-26',
 '2017-02-27',
 '2017-02-28',
 '2017-03-01',
 '2017-03-02',
 '2017-03-03',
 '2017-03-04',
 '2017-03-05',
 '2017-03-06',
 '2017-03-07',
 '2017-03-

In [None]:
# Scraping round 1: try every date; dates that raised are collected in `fails`
weather, fails = scraper(
    page="https://www.wunderground.com/history/daily/us/va/arlington/KDCA/date/",
    dates=dates,
)
weather

Unnamed: 0,Date,Temp,Dew point,Humidity,Wind,Wind speed,Wind Gust,Pressure,Precip.,Condetion
0,2017-01-01 12:52 AM,46 °F,22 °F,39 °%,SW,9 °mph,0 °mph,29.97 °in,0.0 °in,Mostly Cloudy
1,2017-01-01 1:52 AM,44 °F,22 °F,42 °%,SSW,9 °mph,0 °mph,29.98 °in,0.0 °in,Partly Cloudy
2,2017-01-01 2:52 AM,43 °F,23 °F,45 °%,SSW,13 °mph,0 °mph,30.00 °in,0.0 °in,Partly Cloudy
3,2017-01-01 3:52 AM,41 °F,23 °F,49 °%,SW,5 °mph,0 °mph,30.02 °in,0.0 °in,Partly Cloudy
4,2017-01-01 4:52 AM,40 °F,23 °F,51 °%,SSW,5 °mph,0 °mph,30.04 °in,0.0 °in,Partly Cloudy
...,...,...,...,...,...,...,...,...,...,...
10361,2017-12-31 7:52 PM,19 °F,4 °F,52 °%,NW,10 °mph,0 °mph,30.40 °in,0.0 °in,Fair
10362,2017-12-31 8:52 PM,18 °F,4 °F,54 °%,NNW,8 °mph,0 °mph,30.41 °in,0.0 °in,Fair
10363,2017-12-31 9:52 PM,17 °F,2 °F,52 °%,NNW,10 °mph,0 °mph,30.41 °in,0.0 °in,Fair
10364,2017-12-31 10:52 PM,17 °F,2 °F,52 °%,NNW,8 °mph,0 °mph,30.42 °in,0.0 °in,Fair


In [None]:
# Number of dates that failed in round 1
len(fails)

18

In [None]:
# Scraping round 2: retry only the dates that failed in round 1
w1, f1 = scraper(
    page="https://www.wunderground.com/history/daily/us/va/arlington/KDCA/date/",
    dates=fails,
)
w1, len(f1)

(                    Date   Temp Dew point Humidity  Wind Wind speed Wind Gust  \
 0    2017-02-21 12:52 AM  50 °F     25 °F    38 °%   ENE     5 °mph    0 °mph   
 1     2017-02-21 1:52 AM  48 °F     24 °F    39 °%   ENE     8 °mph    0 °mph   
 2     2017-02-21 2:52 AM  46 °F     22 °F    39 °%     E     8 °mph    0 °mph   
 3     2017-02-21 3:52 AM  45 °F     25 °F    46 °%     E     6 °mph    0 °mph   
 4     2017-02-21 4:52 AM  43 °F     24 °F    47 °%  CALM     0 °mph    0 °mph   
 ..                   ...    ...       ...      ...   ...        ...       ...   
 580   2017-12-20 7:52 PM  44 °F     23 °F    43 °%     N     6 °mph    0 °mph   
 581   2017-12-20 8:52 PM  41 °F     28 °F    60 °%   SSE     3 °mph    0 °mph   
 582   2017-12-20 9:52 PM  40 °F     21 °F    47 °%     N     8 °mph    0 °mph   
 583  2017-12-20 10:52 PM  40 °F     25 °F    55 °%     E     3 °mph    0 °mph   
 584  2017-12-20 11:52 PM  39 °F     24 °F    55 °%     E     3 °mph    0 °mph   
 
       Pressur

In [None]:
# Final scraping result: round-1 rows followed by the rows recovered in round 2
weather = pd.concat((weather, w1), ignore_index=True)
weather.head()

Unnamed: 0,Date,Temp,Dew point,Humidity,Wind,Wind speed,Wind Gust,Pressure,Precip.,Condetion
0,2017-01-01 12:52 AM,46 °F,22 °F,39 °%,SW,9 °mph,0 °mph,29.97 °in,0.0 °in,Mostly Cloudy
1,2017-01-01 1:52 AM,44 °F,22 °F,42 °%,SSW,9 °mph,0 °mph,29.98 °in,0.0 °in,Partly Cloudy
2,2017-01-01 2:52 AM,43 °F,23 °F,45 °%,SSW,13 °mph,0 °mph,30.00 °in,0.0 °in,Partly Cloudy
3,2017-01-01 3:52 AM,41 °F,23 °F,49 °%,SW,5 °mph,0 °mph,30.02 °in,0.0 °in,Partly Cloudy
4,2017-01-01 4:52 AM,40 °F,23 °F,51 °%,SSW,5 °mph,0 °mph,30.04 °in,0.0 °in,Partly Cloudy


In [None]:
# Creating a weather time series: parse "Date" into timestamps and keep the
# time of day in a separate "Time" column
weather["Date"] = pd.to_datetime(weather["Date"])
weather["Time"] = weather["Date"].dt.time


weather.head()

Unnamed: 0,Date,Temp,Dew point,Humidity,Wind,Wind speed,Wind Gust,Pressure,Precip.,Condetion,Time
0,2017-01-01 00:52:00,46 °F,22 °F,39 °%,SW,9 °mph,0 °mph,29.97 °in,0.0 °in,Mostly Cloudy,00:52:00
1,2017-01-01 01:52:00,44 °F,22 °F,42 °%,SSW,9 °mph,0 °mph,29.98 °in,0.0 °in,Partly Cloudy,01:52:00
2,2017-01-01 02:52:00,43 °F,23 °F,45 °%,SSW,13 °mph,0 °mph,30.00 °in,0.0 °in,Partly Cloudy,02:52:00
3,2017-01-01 03:52:00,41 °F,23 °F,49 °%,SW,5 °mph,0 °mph,30.02 °in,0.0 °in,Partly Cloudy,03:52:00
4,2017-01-01 04:52:00,40 °F,23 °F,51 °%,SSW,5 °mph,0 °mph,30.04 °in,0.0 °in,Partly Cloudy,04:52:00


In [None]:
# Exporting the resulting data to a CSV file
weather.to_csv(path_or_buf="data/scraped_weather.csv")