In [12]:
import urllib
import csv
from lxml import html
from lxml import etree
import re
from dateutil import parser
import pandas
import os
import datetime
import schedule
import time
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers={'User-Agent':user_agent,} 

In [13]:
def telco_data():
    pageurl = 'http://ref.data.gov.sg/common/search.aspx?r=1&s=default&o=a&a=ASTA%2cIDA%2cIPOS%2cDOS%2cNRF%2cPDPC&theme=09&f=CSV%2cTXT%2cOTHERS%2cXLS%2cXML&count=10&page=1'
    request = urllib.request.Request(pageurl,None,headers)
    response = urllib.request.urlopen(request)
    htmldata = response.read()
    tree = html.fromstring(htmldata)

    names = ['3G Mobile Subscriptions','Nation-wide Service Coverage', 'Average Drop Call Rate', 'Average Success Rate Across All Cells']
    type_field = {}
    type_field['3G Mobile Subscriptions'] = ['Month','SubscriptionCount_thousands']
    type_field['Nation-wide Service Coverage'] = ['Month','Percentage']
    type_field['Average Drop Call Rate'] = ['Month','Percentage']
    type_field['Average Success Rate Across All Cells'] = ['Month','Percentage']

    for n in names:
        csv_url = tree.xpath("//tr[td/strong/a[contains(text(),'%s')]]//input[contains(@name,'hdnFileURL')]/@value" % n)[0] + '&filetype=csv'
        csv_request = urllib.request.Request(csv_url,None,headers)
        csv_response = urllib.request.urlopen(csv_request)
        csv_data = csv_response.read()
        cleaned_data = str(csv_data).split("\\r\\n")
        final_data = [t.replace('M','-').split("^^") for t in cleaned_data if t.startswith('20')]
        with open(n.replace(' ','_')+'.csv', 'w', newline='') as wf:
            cwriter = csv.writer(wf, delimiter='|', quoting=csv.QUOTE_NONE)
            cwriter.writerow(type_field[n])
            for r in final_data:
                cwriter.writerow(r)

In [14]:
zones = {
    'C':'Central Region',
    'N':'North Region',
    'E':'East Region',
    'W':'West Region',
    'S':'South Region'
}

icons = {
    'FD':'Fair (Day)',
    'FN':'Fair (Night)',
    'PC':'Partly Cloudy',
    'CD':'Cloudy',
    'HZ':'Hazy',
    'WD':'Windy',
    'RA':'Rain',
    'PS':'Passing Showers',
    'SH':'Showers',
    'TS':'Thundery Showers'
}

regional_code = {
    'NRS':'National Reporting Stations',
    'rNO':'North Region',
    'rSO':'South Region',
    'rCE':'Central Region',
    'rWE':'West Region',
    'rEA':'East Region'
}

In [15]:
def weatherForecast_3hr():
    
    api_url = "http://www.nea.gov.sg/api/WebAPI?dataset=nowcast&keyref=781CF461BB6606AD28A78E343E0E4176E9DB7CA9E909F6EC"
    api_request = urllib.request.Request(api_url,None,headers)
    api_response = urllib.request.urlopen(api_request)
    api_data = api_response.read()
    
    root = etree.fromstring(api_data)
    matchObj = re.match(r'FROM (.*) TO \d?\d:\d\d [AP]M(.*?)<br>', root.xpath('//issue_datentime')[0].text)
    issue_datetime = str(parser.parse(matchObj.group(2) + ' ' + matchObj.group(1), dayfirst = True))
    
    filename = 'weatherForecast_3hr.csv'
    file_exists = os.path.isfile(filename)
    max_date = ''
    if file_exists:
        df = pandas.read_csv(filename, delimiter="|")
        df['issue_datetime'] = pandas.to_datetime(df['issue_datetime'])
        max_date = str(df['issue_datetime'].max())
    if not file_exists or (parser.parse(max_date) < parser.parse(issue_datetime)):
        fields = ['issue_datetime','name','zone','lon','lat','icon','forecast']
        with open(filename, 'a', newline='') as wf:
            writer = csv.DictWriter(wf, fieldnames=fields, delimiter='|', quoting=csv.QUOTE_NONE)
            if not file_exists:
                writer.writeheader()
            for r in root.xpath('//weatherForecast/area'):
                data = {}
                for f in fields:
                    if f == 'issue_datetime':
                        data[f] = issue_datetime
                    elif f == 'zone':
                        data[f] = zones[r.get(f).strip()]
                    else:
                        data[f] = r.get(f).strip()
                writer.writerow(data)

In [16]:
def weatherForecast_12hr():
    api_url = "http://www.nea.gov.sg/api/WebAPI?dataset=12hrs_forecast&keyref=781CF461BB6606AD28A78E343E0E4176E9DB7CA9E909F6EC"
    api_request = urllib.request.Request(api_url,None,headers)
    api_response = urllib.request.urlopen(api_request)
    api_data = api_response.read()

    root = etree.fromstring(api_data)

    data = {}
    for r in root.xpath('//item')[0]:
        forecastDates = ['forecastIssue','forecastValidity','forecastValidityFrom','forecastValidityTill']
        regions = ['wxmain','wxeast','wxwest','wxnorth','wxsouth','wxcentral']
        tempHum = ['temperature','relativeHumidity']
        tides = ['highTide','lowTide']

        if r.tag in forecastDates:
            data[r.tag] = str(parser.parse(r.get('date').strip() + ' ' + r.get('time').strip(), dayfirst = True))
        elif r.tag in regions:
            data[r.tag] = icons[r.text.strip()]
        elif r.tag in tempHum:
            data[r.tag + 'Low'] = r.get('low').strip()
            data[r.tag + 'High'] = r.get('high').strip()
        elif r.tag in tides:
            data[r.tag + 'Time'] = r.get('time').strip()
            data[r.tag + 'Height'] = r.get('height').strip()
        elif r.tag == 'forecast':
            data[r.tag] = r.text.strip()

    filename = 'weatherForecast_12hr.csv'
    file_exists = os.path.isfile(filename)
    max_date = ''
    if file_exists:
        df = pandas.read_csv(filename, delimiter="|")
        df['forecastIssue'] = pandas.to_datetime(df['forecastIssue'])
        max_date = str(df['forecastIssue'].max())
    if not file_exists or (parser.parse(max_date) < parser.parse(data['forecastIssue'])):
        fields = ['forecastIssue','forecastValidity','forecastValidityFrom','forecastValidityTill','wxmain','wxeast','wxwest','wxnorth',
             'wxsouth','wxcentral','forecast','temperatureLow','temperatureHigh','relativeHumidityLow','relativeHumidityHigh',
                  'highTideTime','highTideHeight','lowTideTime','lowTideHeight']
        with open(filename, 'a', newline='') as wf:
            writer = csv.DictWriter(wf, fieldnames=fields, delimiter='|', quoting=csv.QUOTE_NONE)
            if not file_exists:
                writer.writeheader()
            writer.writerow(data)

In [17]:
def weatherForecast_3days():
    api_url = "http://www.nea.gov.sg/api/WebAPI?dataset=3days_outlook&keyref=781CF461BB6606AD28A78E343E0E4176E9DB7CA9E909F6EC"
    api_request = urllib.request.Request(api_url,None,headers)
    api_response = urllib.request.urlopen(api_request)
    api_data = api_response.read()

    root = etree.fromstring(api_data)

    data = {}
    dateObj = parser.parse(root.xpath('//item/issueDate')[0].text.strip(), dayfirst = True)
    data['issueDate'] = str(dateObj)

    filename = 'weatherForecast_3days.csv'
    file_exists = os.path.isfile(filename)
    max_date = ''
    if file_exists:
        df = pandas.read_csv(filename, delimiter="|")
        df['issueDate'] = pandas.to_datetime(df['issueDate'])
        max_date = str(df['issueDate'].max())
    if not file_exists or (parser.parse(max_date) < dateObj):
        fields = ['issueDate','predictionDate','day','temperatureLow','temperatureHigh','icon','forecast']
        with open(filename, 'a', newline='') as wf:
            writer = csv.DictWriter(wf, fieldnames=fields, delimiter='|', quoting=csv.QUOTE_NONE)
            if not file_exists:
                writer.writeheader()
            for item in root.xpath('//item')[0]:
                if item.tag == 'weatherForecast':
                    increment = 0
                    for r in item:
                        if r.tag == 'day':
                            increment += 1
                            data[r.tag] = r.text.strip()
                            data['predictionDate'] = str(dateObj + datetime.timedelta(days=increment))
                        elif r.tag == 'forecast':
                            data[r.tag] = r.text.strip()
                        elif r.tag == 'icon':
                            data[r.tag] = icons[r.text.strip()]
                        elif r.tag == 'temperature':
                            data[r.tag + 'Low'] = r.get('low').strip()
                            data[r.tag + 'High'] = r.get('high').strip()
                            writer.writerow(data)

In [18]:
def psi_update():
    api_url = "http://www.nea.gov.sg/api/WebAPI?dataset=psi_update&keyref=781CF461BB6606AD28A78E343E0E4176E9DB7CA9E909F6EC"
    api_request = urllib.request.Request(api_url,None,headers)
    api_response = urllib.request.urlopen(api_request)
    api_data = api_response.read()

    root = etree.fromstring(api_data)

    li = []
    for regions in root.xpath('//item/region'):
        data = {}
        for item in regions:
            if item.tag == 'id':
                data['region'] = regional_code[item.text.strip()]
            if item.tag in ['latitude','longitude']:
                data[item.tag] = item.text.strip()
            if item.tag == 'record':
                data['timestamp'] = str(parser.parse(item.get('timestamp').strip()))
                for r in item:
                    data[r.get('type').strip()] = r.get('value').strip()
        li.append(data)

    filename = 'psi_update.csv'
    file_exists = os.path.isfile(filename)
    max_date = ''
    if file_exists:
        df = pandas.read_csv(filename, delimiter="|")
        df['timestamp'] = pandas.to_datetime(df['timestamp'])
        max_date = str(df['timestamp'].max())
    if not file_exists or (parser.parse(max_date) < parser.parse(li[0]['timestamp'])):
        fields = ['timestamp','region','latitude','longitude',
                  'NPSI','NPSI_PM25_3HR','NO2_1HR_MAX','PM10_24HR','PM25_24HR',
                  'SO2_24HR','CO_8HR_MAX','O3_8HR_MAX',
                  'NPSI_CO','NPSI_O3','NPSI_PM10','NPSI_PM25','NPSI_SO2']
        with open(filename, 'a', newline='') as wf:
            writer = csv.DictWriter(wf, fieldnames=fields, delimiter='|', quoting=csv.QUOTE_NONE)
            if not file_exists:
                writer.writeheader()
            for data in li:
                writer.writerow(data)

In [19]:
def pm25_update():
    api_url = "http://www.nea.gov.sg/api/WebAPI?dataset=pm2.5_update&keyref=781CF461BB6606AD28A78E343E0E4176E9DB7CA9E909F6EC"
    api_request = urllib.request.Request(api_url,None,headers)
    api_response = urllib.request.urlopen(api_request)
    api_data = api_response.read()

    root = etree.fromstring(api_data)
    li = []
    for regions in root.xpath('//item/region'):
        data = {}
        for item in regions:
            if item.tag == 'id':
                data['region'] = regional_code[item.text.strip()]
            if item.tag in ['latitude','longitude']:
                data[item.tag] = item.text.strip()
            if item.tag == 'record':
                data['timestamp'] = str(parser.parse(item.get('timestamp').strip()))
                for r in item:
                    data[r.get('type').strip()] = r.get('value').strip()
        li.append(data)


    filename = 'pm2.5_update.csv'
    file_exists = os.path.isfile(filename)
    max_date = ''
    if file_exists:
        df = pandas.read_csv(filename, delimiter="|")
        df['timestamp'] = pandas.to_datetime(df['timestamp'])
        max_date = str(df['timestamp'].max())
    if not file_exists or (parser.parse(max_date) < parser.parse(li[0]['timestamp'])):
        fields = ['timestamp','region','latitude','longitude','PM25_RGN_1HR']
        with open(filename, 'a', newline='') as wf:
            writer = csv.DictWriter(wf, fieldnames=fields, delimiter='|', quoting=csv.QUOTE_NONE)
            if not file_exists:
                writer.writeheader()
            for data in li:
                writer.writerow(data)

In [20]:
def job():
    print("Pulling Data...")
    try:
        weatherForecast_3hr()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'weatherForecast_3hr')
    try:
        weatherForecast_12hr()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'weatherForecast_12hr')
    try:
        weatherForecast_3days()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'weatherForecast_3days')
    try:
        psi_update()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'psi_update')
    try:
        pm25_update()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'pm25_update')
    try:
        telco_data()
        time.sleep(1)
    except:
        print("Failed to pull data. Skipping [%s]" % 'telco_data')
    print("Data pull done.")

In [21]:
job()

Pulling Data...
Data pull done.


In [10]:
schedule.every(30).minutes.do(job)
while True:
    schedule.run_pending()
    time.sleep(1)

KeyboardInterrupt: 