# Preprocessing notebook

### Import and format data for CNN model training

In [1]:
import numpy as np
from constants import Constants
from grid_definition import GridDefinition
from osm_reader import OSMReader
import pandas as pd
from data_download import DataDownloader
import math
import csv
import datetime
import os

In [2]:
# Shared state for the rest of the notebook: project helpers, grid geometry and
# the list of collection dates. The functions defined further down read these
# names as module-level globals.
constants = Constants()
grid_definition = GridDefinition()
grid_definition.init()
# NOTE(review): osm_reader is initialised here but not referenced again in the
# visible part of this notebook — confirm whether it is still needed.
osm_reader = OSMReader()
osm_reader.init()

# Number of cells along each side of the square label grid.
GRID_SIZE = constants.getGridSize()

# Bounding box of the gridded area, in degrees of latitude / longitude.
topLat = grid_definition.getTopLat()
bottomLat = grid_definition.getBottomLat()
leftLon = grid_definition.getLeftLon()
rightLon = grid_definition.getRightLon()

# Latitude span of one grid row and longitude span of one grid column.
heightInterval = (topLat - bottomLat) / GRID_SIZE
widthInterval = (rightLon - leftLon) / GRID_SIZE

# Collection dates to process; each entry is passed as start_date to the
# preprocessing functions.
data_collected = constants.getSelectCollectedDates()

In [3]:
# NOTE: formatInputData and pullCollectedData are defined in a later cell of
# this notebook. That cell has to be executed before this one; on a fresh
# kernel run top-to-bottom this loop fails with
# "NameError: name 'formatInputData' is not defined".
for start_date in data_collected:
#     pullCollectedData(start_date) #pull airspeck P data (disabled; re-enable to regenerate the label CSVs)
    print(start_date)
    formatInputData(start_date) # pull and format airspeck S data

20180629


NameError: name 'formatInputData' is not defined

In [4]:
##Create CSV labels from AirSpeckP data
def pullCollectedData(start_date):
    """Grid one day of AirSpeck-P readings and write them out as label CSVs.

    The readings returned by ``DataDownloader.readAirSpeckPCSV`` are binned
    into a GRID_SIZE x GRID_SIZE grid using the module-level ``topLat``,
    ``leftLon``, ``heightInterval`` and ``widthInterval``. Each cell holds the
    mean of the PM readings that fell inside it, or -1 when it received none.

    One CSV per sensor id and PM type is written to
    ``.../data/label/<start_date>/`` (the directory ``formatInputData`` reads
    from). Each file starts with three header lines - first timestamp, last
    timestamp, and the hour-rounded timestamp of the sample just before the
    midpoint - followed by the grid rows.

    Args:
        start_date: collection date; passed to ``getEndDate`` and embedded in
            the directory and file names.
    """
    end_date = getEndDate(start_date)
    sids = ['XXM007', 'XXM008']

    data_dir = "/Users/ryanegan/Documents/diss/projectZoe/data/raw/personal/"+str(start_date)+"-"+str(end_date)+"/"
    os.makedirs(data_dir, exist_ok=True)

    # The "/" after "label" matters: without it the files end up in
    # ".../data/label<date>/" and formatInputData, which lists
    # ".../data/label/<date>/", never finds them.
    label_dir = "/Users/ryanegan/Documents/diss/projectZoe/data/label/"+str(start_date)+"/"
    os.makedirs(label_dir, exist_ok=True)

    data_downloader = DataDownloader()
    # Only needs to be done once for each date of collected data:
    # data_downloader.loadAirSpeckP(start_date, end_date, sids, data_dir)
    # Assumes pdata[i] is the table of readings for sids[i] - TODO confirm
    # against DataDownloader.readAirSpeckPCSV.
    pdata = data_downloader.readAirSpeckPCSV(start_date, end_date, data_dir)

    PMstrs = ['PM1', 'PM2.5', 'PM10']

    for i in range(len(sids)):
        if len(pdata[i]) == 0:
            continue  # no readings for this sensor: nothing to write

        # Positional access (.iloc) so this does not depend on the table
        # carrying a default 0..n-1 index.
        timestamps = pdata[i]['Timestamp']
        startTime = timestamps.iloc[0]
        endTime = timestamps.iloc[-1]
        rounded = pd.to_datetime(timestamps).dt.round('h')
        # Clamp at 0 so a single-row table does not produce index -1.
        medianTime = rounded.iloc[max(len(rounded) // 2 - 1, 0)]

        labels_sum = np.zeros((len(PMstrs), GRID_SIZE, GRID_SIZE))
        labels_count = np.zeros((len(PMstrs), GRID_SIZE, GRID_SIZE))

        samples = zip(pdata[i]["latitude"], pdata[i]["longitude"],
                      *(pdata[i][name] for name in PMstrs))
        for lat, lon, *readings in samples:
            row = math.floor((topLat - lat) / heightInterval)
            col = math.floor((lon - leftLon) / widthInterval)

            # Because we may have walked outside the area
            if 0 <= row < GRID_SIZE and 0 <= col < GRID_SIZE:
                labels_sum[:, row, col] += readings
                labels_count[:, row, col] += 1

        # Take average of all readings within a grid cell. Cells without any
        # reading are 0/0 -> NaN, which is expected here and replaced by -1.
        with np.errstate(divide='ignore', invalid='ignore'):
            tempLabels = labels_sum / labels_count
        tempLabels[np.isnan(tempLabels)] = -1

        # Write PM labels to csv file
        for PM in range(len(PMstrs)):
            filename = label_dir + sids[i] + "_" + str(start_date) + "_" + PMstrs[PM] + "_grid" + str(GRID_SIZE) + ".csv"
            with open(filename, 'w') as csvfile:
                csvfile.write(str(startTime) + "\n")
                csvfile.write(str(endTime) + "\n")
                csvfile.write(str(medianTime) + "\n")
                writer = csv.writer(csvfile, lineterminator='\n')
                writer.writerows(tempLabels[PM])


## Create an input CSV from AirSpeckS data
def formatInputData(start_date):
    """Summarise one day of static AirSpeck-S readings into a training CSV.

    The time window is taken from the header of a label CSV in
    ``.../data/label/<start_date>/`` (first line: start, second: end, third:
    median timestamp). Readings strictly inside that window are averaged per
    sensor for PM1, PM2.5 and PM10, and temperature and humidity are averaged
    over all sensors together.

    The result is written to
    ``.../data/train/<start_date>/<start_date>_grid<GRID_SIZE>.csv``:
    median timestamp, average temperature, average humidity, then three rows
    (PM1, PM2.5, PM10) with one column per sensor.

    Args:
        start_date: collection date; passed to ``getEndDate`` and embedded in
            the directory and file names.

    Raises:
        FileNotFoundError: if the label directory is missing or contains no
            label CSV for this date.
    """
    uuids = ["02E5F77764B873DA",
        "200A7CED9D597407",
        "E5FD8C55EAA37555",
        "AA0E63CF5118F98F",
        "B61241EF668DBC2C",
        "E786F1568F65C296" ]

    end_date = getEndDate(start_date)
    data_dir = "/Users/ryanegan/Documents/diss/projectZoe/data/raw/static/"+str(start_date)+"-"+str(end_date)+"/"
    os.makedirs(data_dir, exist_ok=True)
    label_dir = "/Users/ryanegan/Documents/diss/projectZoe/data/label/"+str(start_date)+"/"
    train_dir = "/Users/ryanegan/Documents/diss/projectZoe/data/train/"+str(start_date)+"/"
    os.makedirs(train_dir, exist_ok=True)

    data_downloader = DataDownloader()
    # Only needs to be done once for each date of collected data:
    # data_downloader.loadAirSpeckS(start_date, end_date, data_dir)
    # Assumes sdata[i] is the table of readings for uuids[i] - TODO confirm
    # against DataDownloader.readAirSpeckSCSV.
    sdata = data_downloader.readAirSpeckSCSV(start_date, end_date, data_dir)

    # os.listdir order is arbitrary and the directory may hold non-label files
    # (e.g. .DS_Store), so pick the first CSV in sorted order.
    label_files = sorted(name for name in os.listdir(label_dir) if name.endswith(".csv"))
    if not label_files:
        raise FileNotFoundError(
            "No label CSV found in " + label_dir
            + "; run pullCollectedData(" + str(start_date) + ") first")

    filename = label_dir + label_files[0]
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        startTime = pd.to_datetime(next(reader)[0])
        endTime = pd.to_datetime(next(reader)[0])
        medianTime = pd.to_datetime(next(reader)[0])

    PMstrs = ['PM1', 'PM2.5', 'PM10']
    # One row per PM type, one column per static sensor.
    PM = np.zeros((len(PMstrs), len(uuids)))
    temperatureSum = 0
    temperatureCount = 0
    humiditySum = 0
    humidityCount = 0

    for i in range(len(uuids)):
        # Keep only readings strictly inside the label window, without
        # modifying the table returned by the downloader.
        timestamps = pd.to_datetime(sdata[i]['Timestamp'])
        window = sdata[i][(timestamps > startTime) & (timestamps < endTime)]

        for pm_row, pm_name in enumerate(PMstrs):
            PM[pm_row][i] = np.average(window[pm_name])

        temperatureSum += np.sum(window['temperature'])
        temperatureCount += len(window['temperature'])

        humiditySum += np.sum(window['humidity'])
        humidityCount += len(window['humidity'])

    averageTemp = temperatureSum / temperatureCount
    averageHum = humiditySum / humidityCount

    filename = train_dir + str(start_date) + "_grid" + str(GRID_SIZE) + ".csv"
    with open(filename, 'w') as csvfile:
        csvfile.write(str(medianTime) + "\n")
        csvfile.write(str(averageTemp) + "\n")
        csvfile.write(str(averageHum) + "\n")
        writer = csv.writer(csvfile, lineterminator='\n')
        writer.writerows(PM)
             
def getEndDate(start_date):
    """Return the calendar day after ``start_date`` as a YYYYMMDD integer.

    Plain ``start_date + 1`` is only right in the middle of a month: it turns
    20180630 into 20180631 and 20181231 into 20181232. Going through a real
    date rolls over month and year boundaries (and leap days) correctly.

    Args:
        start_date: date encoded as an integer YYYYMMDD, e.g. 20180629.

    Returns:
        The following day, encoded the same way.

    Raises:
        ValueError: if ``start_date`` is not a valid YYYYMMDD date.
    """
    day = datetime.datetime.strptime(str(int(start_date)), "%Y%m%d")
    next_day = day + datetime.timedelta(days=1)
    return int(next_day.strftime("%Y%m%d"))