Author: Patrick Pickard
Date: August 6, 2020
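Description: Loads and cleans the 2018 Calgary traffic datasets (speed limits, traffic volumes, camera locations, signals, signs and incidents) together with the 2018 daily weather data, reducing each to the columns needed for analysis.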

In [135]:
import pandas as pd
import numpy as np

In [154]:
# Load the speed-limit segments and keep only rows that have a posted speed.
speed_limit_df = pd.read_csv("Speed_Limits.csv").dropna(subset=["SPEED"])

# cleaning the speed_limit data
# Taking the multilinestring coordinates and creating a latitude and longitude
# column in the data frame. We will assume that the first point is an accurate
# representation of the location for the line segments.
#
# The first "<longitude> <latitude>" pair is pulled out with an explicit regular
# expression, so the result does not depend on the `regex` default of
# `str.replace` (which differs between pandas versions). Rows whose geometry is
# missing or unparseable get NaN coordinates instead of raising.
first_point = (
    speed_limit_df["multiline"]
    .str.replace("−", "-", regex=False)    # normalise a Unicode minus sign to ASCII
    .str.extract(r"(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)")
    .astype(float)
)

# Group 0 is the first number of the pair (longitude), group 1 the second (latitude).
speed_limit_df = (
    speed_limit_df
    .assign(latitude=first_point[1], longitude=first_point[0])
    .drop(columns=['BOUND', 'CREATED_DT', 'DISTANCE', 'multiline'])
    .sort_values('SPEED', ascending=False)
)
speed_limit_df

Unnamed: 0,SPEED,STREET_NAME,latitude,longitude
319,110,TRANS CANADA HI W,51.088641,-114.259021
153,110,TRANS CANADA HI W,51.088552,-114.292310
874,110,DEERFOOT TR NE RAMP,51.169156,-114.000749
639,110,MACLEOD TR SE,50.848539,-114.023095
590,110,DEERFOOT TR NE,51.136862,-114.039942
...,...,...,...,...
760,30,CROWCHILD TR SW RAMP,51.002836,-114.118039
908,30,CROWCHILD TR SW,50.990472,-114.118005
997,30,HARVEST HILLS BV N,51.172578,-114.068950
397,30,GLENMORE TR SW RAMP,51.000458,-114.116900


In [155]:
# Load the 2018 traffic-volume segments and keep only rows that have a volume.
traffic_volume_df = pd.read_csv("Traffic_Volumes_for_2018.csv").dropna(subset=["VOLUME"])

# cleaning the traffic volume data
# Taking the multilinestring coordinates and creating a latitude and longitude
# column in the data frame. We will assume that the first point is an accurate
# representation of the location for the line segments.
#
# The first "<longitude> <latitude>" pair is pulled out with an explicit regular
# expression, so the result does not depend on the `regex` default of
# `str.replace` (which differs between pandas versions). Rows whose geometry is
# missing or unparseable get NaN coordinates instead of raising.
first_point = (
    traffic_volume_df["multilinestring"]
    .str.replace("−", "-", regex=False)    # normalise a Unicode minus sign to ASCII
    .str.extract(r"(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)")
    .astype(float)
)

# Group 0 is the first number of the pair (longitude), group 1 the second (latitude).
traffic_volume_df = (
    traffic_volume_df
    .assign(latitude=first_point[1], longitude=first_point[0])
    .drop(columns=['YEAR', 'Shape_Leng', 'multilinestring'])
    .sort_values('VOLUME', ascending=False)
)
traffic_volume_df

Unnamed: 0,SECNAME,VOLUME,latitude,longitude
961,DEERFOOT7,185000,51.047955,-114.017118
962,DEERFOOT8,179000,51.037096,-114.004674
959,DEERFOOT6,177000,51.067447,-114.026747
956,DEERFOOT3,174000,51.110524,-114.046882
957,DEERFOOT4,171000,51.096076,-114.040608
...,...,...,...,...
671,8AVS2_A,0,51.045297,-114.053192
283,210AVSE1,0,50.863099,-114.070895
594,68STSE13,0,50.989498,-113.935167
124,144AVNW7,0,51.183317,-114.094587


In [156]:
# The camera data already looks clean; only some columns are removed for ease
# of viewing.
traffic_cameras_df = (
    pd.read_csv("Traffic_Camera_Locations.csv")
    .drop(columns=['Quadrant', 'Camera URL'])
)
traffic_cameras_df

Unnamed: 0,Camera Location,longitude,latitude
0,Stoney Trail / Deerfoot Trail SE,-113.976606,50.900726
1,Memorial Drive / 52 Street E,-113.955818,51.053253
2,Crowchild Trail / Shaganappi Trail NW,-114.149379,51.098849
3,Crowchild Trail / Sarcee Trail NW,-114.178204,51.111255
4,Airport Trail / Barlow Trail NE,-114.001451,51.139352
...,...,...,...
121,Memorial Drive / Edmonton Trail NE,-114.050136,51.050802
122,Glenmore Trail / Barlow Trail SE,-113.981495,50.979446
123,Glenmore Trail / Stoney Trail SE,-113.929263,50.979635
124,5 Avenue / 5 Street SW,-114.073644,51.048677


In [157]:
# Load the traffic signals, remove the columns not needed for viewing, then
# remove any remaining column that is entirely NaN.
#
# `dropna` returns a new frame, so its result must be kept; calling it as a bare
# statement has no effect. It runs after the explicit drop so that a named
# column which happens to be all-NaN is still present when `drop` looks for it.
traffic_signals_df = (
    pd.read_csv("Traffic_Signals.csv")
    .drop(columns=['INSTDATE', 'INT_TYPE', 'PEDBUTTONS', 'PED_TIMER', 'Point', 'Count', 'ACCESSIBLE PEDESTRIAN SIGNAL'])
    .dropna(axis=1, how='all')
)
traffic_signals_df

Unnamed: 0,FIRSTROAD,SECONDROAD,QUADRANT,latitude,longitude
0,KENSINGTON ROAD,12 STREET,NW,51.052514,-114.091740
1,COUNTRY HILLS BOULEVARD,COVENTRY HILLS BOULEVARD,NE,51.154211,-114.052475
2,McCALL WAY,PEGASUS ROAD,NE,51.094081,-114.011478
3,72 AVENUE,CENTRE STREET,N,51.117992,-114.069588
4,9 AVENUE,15 STREET/17 AVENUE,SE,51.037739,-114.025122
...,...,...,...,...,...
1533,COUNTRY HILLS BOULEVARD,ROCKY RIDGE RECREATIONAL FACILITY,NW,51.154147,-114.228956
1534,BRIDLERIDGE WAY,BRIDLEWOOD ROAD,SW,50.902069,-114.109199
1535,10 STREET,23 AVENUE,NW,51.073213,-114.084589
1536,5 AVENUE,1 STREET,SE,51.048341,-114.060387


In [158]:
traffic_signs_df = pd.read_csv("Traffic_Signs.csv")

# Taking the POINT coordinates and creating a latitude and longitude column in
# the data frame. The first "<longitude> <latitude>" pair found in the POINT
# text is used.
#
# The pair is pulled out with an explicit regular expression, so the result does
# not depend on the `regex` default of `str.replace` (which differs between
# pandas versions). Rows whose POINT is missing or unparseable get NaN
# coordinates instead of raising.
point_xy = (
    traffic_signs_df["POINT"]
    .str.replace("−", "-", regex=False)    # normalise a Unicode minus sign to ASCII
    .str.extract(r"(-?\d+(?:\.\d+)?)\s+(-?\d+(?:\.\d+)?)")
    .astype(float)
)

# Remove the columns not needed for viewing, then any remaining column that is
# entirely NaN. `dropna` returns a new frame, so its result must be kept. The
# coordinates are attached last so they can never be removed by that clean-up.
traffic_signs_df = (
    traffic_signs_df
    .drop(columns=['BLADE_TYPE','COMMENTS','FACING_CD','FLD_SRC_TXT','INSTDATE','MATERIAL','PL_TY','REUSE','SGN_COUNT_NO','SGN_STA_CD','SUPPORTTYPE','UNITID','TE_SIGNLOCATION_UNITID','POINT','Ward Boundaries','City Quadrants','Calgary Communities','Ward Boundaries 2013-2017','STA_CD','SIZE_CD'])
    .dropna(axis=1, how='all')
    # Group 0 is the first number of the pair (longitude), group 1 the second (latitude).
    .assign(latitude=point_xy[1], longitude=point_xy[0])
)
traffic_signs_df

Unnamed: 0,SIGN_TXT,latitude,longitude
0,,51.068567,-114.210957
1,,50.911166,-113.935804
2,"3HR, 0900-1800, SAT, AL",51.042028,-114.054231
3,"3 HR, 0900-1800, M-F, PAY REQD, AR",51.041351,-114.053597
4,DELINEATOR,50.911341,-113.935926
...,...,...,...
207515,,50.914135,-114.089567
207516,RESERVED BIKE LANE BEGINS (ADL),51.050466,-114.073475
207517,,51.052801,-114.090109
207518,,50.974158,-114.053874


In [159]:
traffic_accidents_df = pd.read_csv("Traffic_Incidents.csv")

# Keep only the 2018 incidents. Selecting "/2018 " directly, rather than
# excluding a fixed list of other years, also filters out any year that list
# does not mention. `na=False` treats a missing START_DT as "not 2018" instead
# of failing on the boolean mask.
# NOTE(review): this relies on START_DT containing the year as "/YYYY " (the
# same format the previous per-year filters assumed) - confirm against the CSV.
is_2018 = traffic_accidents_df["START_DT"].str.contains("/2018 ", regex=False, na=False)

# Remove the columns not needed for viewing, then any remaining column that is
# entirely NaN. `dropna` returns a new frame, so its result must be kept.
traffic_accidents_df = (
    traffic_accidents_df[is_2018]
    .drop(columns=['MODIFIED_DT', 'START_DT', 'location', 'Count', 'id'])
    .dropna(axis=1, how='all')
)
traffic_accidents_df

Unnamed: 0,INCIDENT INFO,DESCRIPTION,QUADRANT,Longitude,Latitude
875,Southbound Deerfoot Trail and 16 Avenue NE,Multi-vehicle incident.,NE,-114.028193,51.065699
2169,36 Avenue and Burnsland Road SE,Two vehicle incident.,SE,-114.059276,51.021205
3262,Falconridge Boulevard at Castleridge Boulevard NE,Two vehicle incident.,NE,-113.958764,51.099956
4559,Southbound Deerfoot Trail and 16 Avenue NE,Multi-vehicle incident.,NE,-114.028183,51.065739
4695,Spruce Meadows Trail and Macleod Trail SE,Two vehicle incident.,SE,-114.061310,50.892629
...,...,...,...,...,...
17407,Westbound 16 Avenue approaching Deerfoot Trail NE,Two vehicle incident. Blocking the left lane.,NE,-114.020057,51.067053
17408,Southbound Crowchild Trail at Kensington Road NW,Two vehicle incident. Blocking multiple lanes,NW,-114.118501,51.052492
17409,52 Street and 5 Avenue SE,Multi-vehicle incident.,SE,-113.956571,51.049133
17410,Eastbound Memorial Drive at 8 Street NW,Two vehicle incident.,NW,-114.079493,51.054765


In [169]:
# Load the 2018 daily weather data indexed by parsed date, remove the columns
# not needed for viewing, then remove any remaining column that is entirely NaN.
#
# `dropna` returns a new frame, so its result must be kept; calling it as a bare
# statement has no effect. It runs after the explicit drop so that a named
# column which happens to be all-NaN is still present when `drop` looks for it.
weather_df = (
    pd.read_csv("2018_weather_data.csv", index_col='Date/Time', parse_dates=True)
    .drop(columns=['Station Name','Climate ID','Year','Month','Day','Min Temp Flag','Max Temp Flag','Mean Temp Flag','Heat Deg Days Flag','Cool Deg Days Flag','Total Rain Flag','Total Snow Flag','Total Precip Flag','Snow on Grnd Flag','Dir of Max Gust Flag','Spd of Max Gust Flag','Data Quality','Dir of Max Gust (10s deg)','Heat Deg Days (°C)','Cool Deg Days (°C)','Total Rain (mm)','Total Snow (cm)','Longitude (x)','Latitude (y)'])
    .dropna(axis=1, how='all')
)
weather_df

Unnamed: 0_level_0,Max Temp (°C),Min Temp (°C),Mean Temp (°C),Total Precip (mm),Snow on Grnd (cm),Spd of Max Gust (km/h)
Date/Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-01,-3.6,-32.2,-17.9,0.0,11.0,<31
2018-01-02,0.8,-11.6,-5.4,0.2,10.0,<31
2018-01-03,6.4,-10.1,-1.9,0.0,10.0,<31
2018-01-04,4.0,-12.0,-4.0,0.8,10.0,<31
2018-01-05,9.6,-12.4,-1.4,0.2,10.0,44
...,...,...,...,...,...,...
2018-12-27,-5.8,-9.8,-7.8,0.4,2.0,
2018-12-28,1.7,-13.8,-6.0,0.0,2.0,46
2018-12-29,8.7,-4.2,2.3,0.0,3.0,41
2018-12-30,3.5,-13.8,-5.1,0.2,2.0,60
