# Preprocessing of data

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

# Directory holding the dynamic traffic data files.
# A raw string is used because "\ " is not a valid escape sequence in a normal
# string literal; the value (including the literal backslash) is unchanged.
# NOTE(review): the other paths in this notebook spell the directory with a
# plain space and no backslash -- confirm which form the consumers of this
# variable expect (shell-style escaping vs. a plain Python path).
pathDynamicData = r"Mobility/dynamische\ Verkehrsdaten/FFM_DZG_180701"

## Dynamic count rates
As a first step, preprocess the dynamic data: load an additional XML file to look up the location (latitude/longitude) of each detector and attach it to the count data.

In [11]:
def prepareDynData(pathAndFilename,
                   pathToXML="./Mobility/dynamische Verkehrsdaten/Statische Detektordaten.xml"):
    """
    Read and preprocess data from dynamic count rates.

    The tab-separated count file is loaded, missing values are replaced by 0,
    rows whose "Number" entry is the placeholder "########" are dropped and
    only rows whose "ElemUID" occurs in the XML site records are kept. The
    coordinates of the matching site are appended as "Lat" and "Lon" columns.

    Parameters
    ----------
    pathAndFilename : str
        Path to the tab-separated, latin1-encoded count file. It has to
        provide the columns "Number" and "ElemUID".
    pathToXML : str, optional
        Path to the XML file (namespace http://datex2.eu/schema/2/2_0) whose
        measurementSiteRecord elements carry the site ID and coordinates.

    Returns
    -------
    pandas.DataFrame
        The filtered count data with the additional float columns "Lat"
        and "Lon".
    """
    # Prefix -> namespace mapping so the lookups below stay readable
    ns = {"d2": "http://datex2.eu/schema/2/2_0"}

    # Open data as data frame
    df = pd.read_csv(pathAndFilename, encoding='latin1', low_memory=False, sep="\t")
    df = df.fillna(0)  # NaN corresponds to 0 count rates
    df = df[df["Number"] != "########"]

    # Open xml file
    root = ET.parse(pathToXML).getroot()

    # Map the identification ID of every site record to its location
    lat = {}
    lon = {}

    for record in root.findall(".//d2:measurementSiteRecord", ns):
        identification = record.findall(".//d2:measurementSiteIdentification", ns)
        if not identification:
            # Without an ID the record cannot be matched to any count row
            print("No ID for site. Skip record.")
            continue
        if len(identification) != 1:
            print("More IDs per site. Take first one.")

        coordinates = record.find(".//d2:pointCoordinates", ns)
        if coordinates is None:
            print("No coordinates for site. Skip record.")
            continue
        latNode = coordinates.find(".//d2:latitude", ns)
        lonNode = coordinates.find(".//d2:longitude", ns)
        if latNode is None or lonNode is None:
            print("No coordinates for site. Skip record.")
            continue

        # The ID text may carry a "[...]" suffix; only the part before it is the ID
        siteID = int(identification[0].text.split("[")[0])
        lat[siteID] = float(latNode.text)
        lon[siteID] = float(lonNode.text)

    # Only consider those detectors for which location information is available.
    # The copy avoids assigning new columns to a view of the unfiltered frame.
    df = df[df["ElemUID"].isin(list(lat))].copy()

    # Append location info
    df["Lat"] = df["ElemUID"].map(lat)
    df["Lon"] = df["ElemUID"].map(lon)

    return df

## Run the preprocessing and save data (TODO)

In [13]:
# Preprocess the dynamic count file and print the resulting data frame.
file = "Mobility/dynamische Verkehrsdaten/FFM_DZG_180701/FFM_DEZ_180701.csv"
dynamicCounts = prepareDynData(file)
print(dynamicCounts)

          ElemUID      ElemName Kind    Number          DaySecFrom(UTC)  \
0        15424002            D2  DEZ    120.00  2018-07-01 23:58:00.000   
1        17790006     D6 (KFZ4)  DEZ    120.00  2018-07-01 23:58:00.000   
2        17392004            D4  DEZ         0  2018-07-01 23:58:00.000   
3        18301005        D5 (2)  DEZ         0  2018-07-01 23:58:00.000   
4        18297002        D2 (1)  DEZ     60.00  2018-07-01 23:58:00.000   
5        18301003        D3 (3)  DEZ     60.00  2018-07-01 23:58:00.000   
6        15428004            D4  DEZ         0  2018-07-01 23:58:00.000   
7        17399010           D10  DEZ         0  2018-07-01 23:58:00.000   
8        17399006            D6  DEZ         0  2018-07-01 23:58:00.000   
9        18983002            D2  DEZ         0  2018-07-01 23:58:00.000   
10       17405003        D3 (2)  DEZ         0  2018-07-01 23:58:00.000   
11       17403004     D4 (KFZ2)  DEZ         0  2018-07-01 23:58:00.000   
12       18391001     D1 