In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as ddf
from pandas import Series, DataFrame
import requests
from bs4 import BeautifulSoup
from bs4 import Tag
import re
import csv
import json
import datetime

## Step 1: Request climate data from NOAA (National Oceanic and Atmospheric Administration) directly

### Data Source:

The climate data can be requested from https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/ 

Python packages needed: `requests` and `BeautifulSoup` (bs4)

#### Function 1: Parse url to BeautifulSoup Object

In [2]:
def generate_soup(url, timeout=60):
    """Download a web page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the HTML page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in "html.parser".

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page as if it were the listing.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

In [3]:
# Root URL of the NOAA Global Summary of the Day archive; the year is appended to it below.
basicUrl = 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/'

In [5]:
# Year to download; kept as a string because it is concatenated into URLs and the output file name.
year = '1929'

In [10]:
# URL of the page that lists every station file for the selected year.
# Example : 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/'
# NOTE(review): findFileNames builds this same URL itself; year_url is not referenced
# again in the visible cells.
year_url = basicUrl + year + '/'

#### Function 2: Loop over the links on a year's download page and build the list of file names for that year

In [11]:
def findFileNames(basicUrl, year):
    """List the file names linked from one year's download page.

    Parameters
    ----------
    basicUrl : str
        Root URL of the archive, ending with '/'.
    year : str or int
        Year sub-directory to scan; converted with str() before concatenation.

    Returns
    -------
    list of str
        Text of the first <a> element found in each <td> cell (first cell
        excluded), in page order. Cells without a link are ignored.
    """
    url_year = basicUrl + str(year) + '/'
    soup = generate_soup(url_year)
    fileNames = []
    # The first <td> is skipped on purpose -- presumably it holds the
    # "Parent Directory" entry of the listing; TODO confirm against the page.
    for cell in soup.find_all('td')[1:]:
        link = cell.find('a')  # look the link up once and reuse it
        if link is not None:
            fileNames.append(link.text)
    return fileNames

In [12]:
# Scrape the file names listed on the selected year's page and preview the first ten.
fileList = findFileNames(basicUrl, year)
fileList[:10]

['03005099999.csv',
 '03075099999.csv',
 '03091099999.csv',
 '03159099999.csv',
 '03262099999.csv',
 '03311099999.csv',
 '03379099999.csv',
 '03396099999.csv',
 '03497099999.csv',
 '03601099999.csv']

#### Function 3: Once we have the list of file names from the previous function, we can build the list of URLs so that each file can be downloaded individually.
Example CSV URL: 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03091099999.csv'

In [14]:
def createUrls(fileList, year,
               baseUrl='https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/'):
    """Build the download URL of every file of one year.

    Parameters
    ----------
    fileList : iterable of str
        File names such as '03005099999.csv'.
    year : str or int
        Year sub-directory; converted with str() so an int also works.
    baseUrl : str, optional
        Root URL of the archive. Defaults to the NOAA Global Summary of the
        Day location; a trailing '/' is optional.

    Returns
    -------
    list of str
        One URL per entry of fileList, in the same order. An empty input
        gives an empty list.
    """
    # Build the common prefix once instead of on every iteration.
    prefix = baseUrl.rstrip('/') + '/' + str(year) + '/'
    return [prefix + fileName for fileName in fileList]

In [15]:
# Turn the scraped file names into full download URLs and preview the first ten.
urls = createUrls(fileList, year)
urls[:10]

['https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03005099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03075099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03091099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03159099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03262099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03311099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03379099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03396099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03497099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03601099999.csv']

#### Function 4: Create a dataframe for each CSV file. Here we use a temporary CSV file (`temp.csv`) to hold the data downloaded from the URL

In [21]:
def findDataFrame(url, timeout=60):
    """Download one CSV file and load it into a DataFrame.

    The response body is written to 'temp.csv' in the current working
    directory (overwriting any previous content) and read back with
    pd.read_csv using its default options.

    Note: a later cell defines findDataFrame again with cleaning steps added;
    once that cell has run, it replaces this definition.

    Parameters
    ----------
    url : str
        Address of the CSV file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    pandas.DataFrame
        The raw content of the CSV file.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    r = requests.get(url, timeout=timeout)
    # Do not store an HTML error page as if it were CSV data.
    r.raise_for_status()
    # The context manager closes (and flushes) the file before it is read back.
    with open('temp.csv', 'wb') as tempFile:
        tempFile.write(r.content)
    df = pd.read_csv('temp.csv')
    return df

In [22]:
# A single station file from 1929, used to try out the download functions.
example_url = 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03005099999.csv'

In [23]:
# Download the example file and show its first rows as read from the CSV.
example_df = findDataFrame(example_url)
example_df.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,MXSPD,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT
0,3005099999,1929-10-01,60.133333,-1.183333,84.0,"LERWICK, UK",45.3,4,40.0,4,...,8.9,999.9,51.1,,44.1,*,0.0,I,999.9,0
1,3005099999,1929-10-02,60.133333,-1.183333,84.0,"LERWICK, UK",49.5,4,45.2,4,...,29.9,999.9,53.1,*,44.1,,99.99,,999.9,10000
2,3005099999,1929-10-03,60.133333,-1.183333,84.0,"LERWICK, UK",49.0,4,41.7,4,...,23.9,999.9,53.1,,46.0,,99.99,,999.9,10000
3,3005099999,1929-10-04,60.133333,-1.183333,84.0,"LERWICK, UK",45.7,4,38.5,4,...,36.9,999.9,53.1,,44.1,,99.99,,999.9,10000
4,3005099999,1929-10-05,60.133333,-1.183333,84.0,"LERWICK, UK",46.5,4,41.5,4,...,13.0,999.9,48.0,*,43.0,,99.99,,999.9,10000


#### Replace the placeholder values that mark missing data with NaN. The placeholder used by each column has to be checked against the data description

In [24]:
def changeToNone(value):
    """Turn the missing-data markers 999.9 and 9999.9 into NaN.

    Any other value is handed back unchanged.
    """
    isMissing = value == 999.9 or value == 9999.9
    return np.nan if isMissing else value
    
### 99.99 may be a real reading in other columns, so the PRCP column gets its own function
def changeToNone2(value):
    """Turn the missing-data marker 99.99 into NaN; return anything else as is."""
    return np.nan if value == 99.99 else value

In [25]:
def changMissingData(df):
    """Replace the missing-data markers of the measurement columns with NaN.

    For TEMP, DEWP, SLP, STP, VISIB, WDSP, MXSPD, GUST, MAX, MIN and SNDP the
    values 999.9 and 9999.9 are both treated as missing. For PRCP only 99.99
    is treated as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all twelve columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its columns are overwritten in
        place.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    standardColumns = ['TEMP', 'DEWP', 'SLP', 'STP', 'VISIB', 'WDSP',
                       'MXSPD', 'GUST', 'MAX', 'MIN', 'SNDP']
    standardMarkers = [999.9, 9999.9]
    for column in standardColumns:
        # Vectorised replacement: rows matching a marker become NaN.
        df[column] = df[column].mask(df[column].isin(standardMarkers))
    # PRCP has its own marker; 999.9 / 9999.9 are left untouched there.
    df['PRCP'] = df['PRCP'].mask(df['PRCP'].isin([99.99]))
    return df

In [26]:
# Derive calendar information (currently the year) from the DATE column
def addDateInfor(df):
    """Parse DATE into datetimes and add a YEAR column.

    The frame passed in is modified in place and also returned.
    """
    parsedDates = pd.to_datetime(df['DATE'])  # strings -> datetime64 values
    df['DATE'] = parsedDates
    df['YEAR'] = parsedDates.dt.year          # calendar year of every row
    return df

In [27]:
### Download a CSV file, replace its missing-data markers with NaN and add the year column in one step
def findDataFrame(url, timeout=60):
    """Download one CSV file and return it as a cleaned DataFrame.

    The response body is written to 'temp.csv' in the current working
    directory (overwriting any previous content), read back with pd.read_csv,
    then passed through changMissingData and addDateInfor.

    Parameters
    ----------
    url : str
        Address of the CSV file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    pandas.DataFrame
        The file content after changMissingData and addDateInfor were applied.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    r = requests.get(url, timeout=timeout)
    # Do not store an HTML error page as if it were CSV data.
    r.raise_for_status()
    # The context manager closes (and flushes) the file before it is read back.
    with open('temp.csv', 'wb') as tempFile:
        tempFile.write(r.content)
    df = pd.read_csv('temp.csv')
    df = changMissingData(df)
    df = addDateInfor(df)
    return df

In [28]:
# Reload the example file with the redefined findDataFrame and show its first rows.
example_df = findDataFrame(example_url)
example_df.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT,YEAR
0,3005099999,1929-10-01,60.133333,-1.183333,84.0,"LERWICK, UK",45.3,4,40.0,4,...,,51.1,,44.1,*,0.0,I,,0,1929
1,3005099999,1929-10-02,60.133333,-1.183333,84.0,"LERWICK, UK",49.5,4,45.2,4,...,,53.1,*,44.1,,,,,10000,1929
2,3005099999,1929-10-03,60.133333,-1.183333,84.0,"LERWICK, UK",49.0,4,41.7,4,...,,53.1,,46.0,,,,,10000,1929
3,3005099999,1929-10-04,60.133333,-1.183333,84.0,"LERWICK, UK",45.7,4,38.5,4,...,,53.1,,44.1,,,,,10000,1929
4,3005099999,1929-10-05,60.133333,-1.183333,84.0,"LERWICK, UK",46.5,4,41.5,4,...,,48.0,*,43.0,,,,,10000,1929


#### Function 5: Combine the records of every weather station for a given year into a single dataframe

In [30]:
# Reminder of the input: the first ten station URLs built earlier.
urls[:10]

['https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03005099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03075099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03091099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03159099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03262099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03311099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03379099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03396099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03497099999.csv',
 'https://www.ncei.noaa.gov/data/global-summary-of-the-day/access/1929/03601099999.csv']

In [34]:
def combineYearData(urls):
    """Download every URL and stack the resulting frames into one DataFrame.

    Each URL is loaded with findDataFrame. A URL whose download or parsing
    fails is reported on stdout and skipped, so a single bad file does not
    abort the whole year.

    Parameters
    ----------
    urls : iterable of str
        Addresses of the CSV files to combine.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (axis=0) of all frames that loaded
        successfully; each frame keeps its own index values. An empty
        DataFrame is returned when nothing could be loaded.
    """
    frames = []
    for url in urls:
        try:
            frames.append(findDataFrame(url))
        except Exception as error:
            # Catch Exception, not everything: Ctrl-C must still interrupt the loop.
            print('Skipping ' + str(url) + ': ' + repr(error))
    if not frames:
        # pd.concat raises on an empty list; keep the old "empty frame" result.
        return pd.DataFrame()
    # One concat at the end avoids re-copying the growing frame on every iteration.
    return pd.concat(frames, axis=0)

In [35]:
# Download and combine every station file of the selected year (one HTTP request per URL).
df = combineYearData(urls)

In [37]:
# Display the combined frame.
df

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,GUST,MAX,MAX_ATTRIBUTES,MIN,MIN_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,SNDP,FRSHTT,YEAR
0,3005099999,1929-10-01,60.133333,-1.183333,84.0,"LERWICK, UK",45.3,4,40.0,4,...,,51.1,,44.1,*,0.0,I,,0,1929
1,3005099999,1929-10-02,60.133333,-1.183333,84.0,"LERWICK, UK",49.5,4,45.2,4,...,,53.1,*,44.1,,,,,10000,1929
2,3005099999,1929-10-03,60.133333,-1.183333,84.0,"LERWICK, UK",49.0,4,41.7,4,...,,53.1,,46.0,,,,,10000,1929
3,3005099999,1929-10-04,60.133333,-1.183333,84.0,"LERWICK, UK",45.7,4,38.5,4,...,,53.1,,44.1,,,,,10000,1929
4,3005099999,1929-10-05,60.133333,-1.183333,84.0,"LERWICK, UK",46.5,4,41.5,4,...,,48.0,*,43.0,,,,,10000,1929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,99006199999,1929-12-26,0.100000,147.000000,3.0,BUOY 52079 ARGOS 3781,65.7,24,44.9,24,...,,70.3,*,63.3,*,0.0,I,,0,1929
40,99006199999,1929-12-27,0.100000,147.000000,3.0,BUOY 52079 ARGOS 3781,69.1,24,57.2,24,...,,73.4,*,65.3,*,0.0,I,,0,1929
41,99006199999,1929-12-29,0.100000,147.000000,3.0,BUOY 52079 ARGOS 3781,65.1,24,60.5,24,...,,69.3,*,62.4,*,0.0,I,,0,1929
42,99006199999,1929-12-30,0.100000,147.000000,3.0,BUOY 52079 ARGOS 3781,62.3,24,58.1,24,...,,64.4,*,60.3,*,0.0,I,,0,1929


In [40]:
# Sanity check: list the distinct station identifiers present in the combined data.
df['STATION'].unique()   # the result is the same as the list on the 1929 file list page

array([ 3005099999,  3075099999,  3091099999,  3159099999,  3262099999,
        3311099999,  3379099999,  3396099999,  3497099999,  3601099999,
        3777099999,  3795099999,  3804099999,  3811099999,  3856099999,
        3864099999,  3894099999,  3953099999,  3973099999,  3980099999,
       99006199999])

#### Save the dataframe to csv file

In [42]:
# Write the combined data to Daily_Climate/DailyTemp_full_<year>.csv.
# NOTE(review): the row index is written too (to_csv default), and the
# 'Daily_Climate' directory is assumed to exist already -- confirm before a fresh run.
df.to_csv('Daily_Climate/DailyTemp_full_' + year + '.csv')