In [1]:
# Exploratory analysis of the snow-plow JSON files: parse them, clean them, and aggregate the cleaned streets per neighbourhood.

In [2]:
# Standard-library and third-party imports
import os
import json
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pylab as plt
import geopandas as gpd

# Configure Notebook
#for plots to be inline
%matplotlib inline 
#for auto_complete 
%config Completer.use_jedi = False 

plt.style.use('fivethirtyeight')
sns.set_context("notebook")

In [3]:
def file_parser():
    """Collect every usable plow JSON file under 'data_in_path' into one frame.

    Sub-folders of 'data_in_path' whose name contains '164' are visited in
    ascending name order. A file is used only when its JSON has more than one
    entry under 'features' and the first feature's properties contain 'status'.
    The notebook running this function must be in the root directory, because
    every path is resolved against the current working directory.

    The 'data_out_path' directory is created when missing, but writing the
    combined frame to disk is currently disabled (see the note at the end).

    Returns:
        The concatenated GeoDataFrame, or an empty pandas DataFrame when no
        file qualified.
    """
    print('parsing the files now')
    data_in_path = r'src/Snow Feb 17'
    data_out_path = r'src/17_parsed_plow_df'
    in_dir = os.path.join(os.getcwd(), data_in_path)

    # makedirs creates missing parents too and tolerates an existing directory
    os.makedirs(os.path.join(os.getcwd(), data_out_path), exist_ok=True)

    # listing all snapshot folders, ordered ascending
    folders = sorted(f for f in os.listdir(in_dir) if '164' in f)

    # frames are collected and concatenated once at the end; concatenating
    # inside the loop copies the accumulated data on every file (quadratic)
    frames = []
    for i, folder in enumerate(folders, start=1):
        # this print will keep re-writing itself
        print('Folder {}, file {} of {} - - - - {:.2f}% completed'.format(folder, i, len(folders), 100*i/len(folders)), end="\r")
        folder_path = os.path.join(in_dir, folder)
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            # context manager so the handle is closed even if parsing fails
            with open(file_path) as handle:
                json_file = json.load(handle)

            # NOTE(review): '> 1' skips files holding exactly one feature - confirm this is intended
            if 'features' in json_file and len(json_file['features']) > 1:
                if 'status' in json_file['features'][0]['properties']:
                    frames.append(gpd.read_file(file_path))

    if not frames:
        return pd.DataFrame()

    # the CRS of the last file read is applied to the combined frame
    full_df = gpd.GeoDataFrame(pd.concat(frames), crs=frames[-1].crs)

    # Saving is left disabled, as before. To persist the result, e.g.:
    # full_df.to_file(os.path.join(os.getcwd(), data_out_path, r'streets_complete.shp'))
    return full_df

In [4]:
#alternative function to file_parser. This parser will drop columns and remove duplicates each iteration, keeping the final DF smaller.
def file_parser_cleaner():
    """Parse the plow JSON files, trim them, and save one de-duplicated GeoJSON.

    Sub-folders of 'data_in_path' whose name contains '164' are visited in
    ascending name order. A file is used only when its JSON has more than one
    entry under 'features' and the first feature's properties contain 'status'.
    Each usable file is sorted by 'completetime', reduced to the rows whose
    'status' equals '1', and stripped of the columns listed in 'unused_columns'.
    The surviving rows are merged, exact duplicate rows are dropped, and the
    result is written to 'streets_complete.geojson' inside 'data_out_path'.
    The notebook running this function must be in the root directory, because
    every path is resolved against the current working directory.

    Raises:
        ValueError: if no file under 'data_in_path' qualified, so there is
            nothing to write.
    """
    print('parsing the files now')
    data_in_path = r'src/Snow Feb 17'
    data_out_path = r'src/17_parsed_plow_df'
    unused_columns = ['district', 'OBJECTID', 'routeno', 'unqueid', 'line', 'receivedtime']
    in_dir = os.path.join(os.getcwd(), data_in_path)
    out_dir = os.path.join(os.getcwd(), data_out_path)

    # makedirs creates missing parents too and tolerates an existing directory
    os.makedirs(out_dir, exist_ok=True)

    # listing all snapshot folders, ordered ascending
    folders = sorted(f for f in os.listdir(in_dir) if '164' in f)

    full_df = None
    last_crs = None
    for i, folder in enumerate(folders, start=1):
        # this print will keep re-writing itself
        print('Folder {}, file {} of {} - - - - {:.2f}% completed'.format(folder, i, len(folders), 100*i/len(folders)), end="\r")
        folder_path = os.path.join(in_dir, folder)
        folder_frames = []
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            # context manager so the handle is closed even if parsing fails
            with open(file_path) as handle:
                json_file = json.load(handle)

            # NOTE(review): '> 1' skips files holding exactly one feature - confirm this is intended
            if 'features' in json_file and len(json_file['features']) > 1:
                if 'status' in json_file['features'][0]['properties']:
                    temp = gpd.read_file(file_path)
                    last_crs = temp.crs
                    temp = temp.sort_values(by='completetime', ascending=True)
                    # status '1': per the original author, streets cleaned in the last 4 hours
                    temp = temp[temp['status'] == '1']
                    folder_frames.append(temp.drop(columns=unused_columns))

        if folder_frames:
            # Merging and de-duplicating once per folder keeps the running frame
            # small without re-copying it for every single file. Because
            # drop_duplicates keeps first occurrences, the result equals
            # de-duplicating after each file.
            previous = [] if full_df is None else [full_df]
            full_df = gpd.GeoDataFrame(pd.concat(previous + folder_frames), crs=last_crs)
            full_df = full_df.drop_duplicates()

    if full_df is None:
        raise ValueError('no usable plow files found under {!r}; nothing to save'.format(data_in_path))

    full_df.to_file(os.path.join(out_dir, r'streets_complete.geojson'), driver='GeoJSON')

In [5]:
#cleaning up the DF
def data_cleaner():
    """Clean the parsed plow GeoJSON and save it for the spatial join.

    Reads 'streets_complete.geojson' from 'df_path', converts the millisecond
    'completetime' values to datetimes, sorts by them and uses them as the
    index (named 'completed_time'). It keeps only rows whose 'status' equals
    '1', drops duplicate rows, renames 'Shape__Length' to 'length' and
    'name_neigh' to 'neighbourhood' where present, and writes the result to
    'file_out_name' inside 'data_out_path', overwriting any previous output.
    Paths are resolved against the current working directory.
    """
    print('cleaning the plow data', end="\n")
    df_path = r'src/17_parsed_plow_df'
    file_name = r'streets_complete.geojson'
    data_out_path = r'src/streets_cleaned'
    file_out_name = r'17_streets_cleaned.geojson'  # overwritten on every run

    # renaming columns of interest
    renaming = {'Shape__Length': 'length', 'name_neigh': 'neighbourhood'}

    df = gpd.read_file(os.path.join(os.getcwd(), df_path, file_name))
    # a separate datetime column is needed so the 'unit' argument can be given
    df['completed_time'] = pd.to_datetime(df['completetime'], unit='ms')
    df = df.sort_values(by='completed_time', ascending=True)
    df.index = pd.DatetimeIndex(df['completed_time'])
    # status '1': per the original author, streets cleaned in the last 4 hours
    df = df[df['status'] == '1']

    # the completion time now lives in the index, so both time columns can go
    df = df.drop(columns=['completetime', 'completed_time'])

    # Declare the CRS with an authority string; the {'init': ...} dict form is
    # deprecated. allow_override keeps the previous "force this CRS" semantics.
    df = df.set_crs('EPSG:4326', allow_override=True)

    # many lines will be duplicated, so dropping them here
    # NOTE(review): drop_duplicates compares columns only, not the datetime
    # index, so rows differing only in completion time collapse - confirm intended
    df = df.drop_duplicates()

    df = df.rename(columns=renaming)

    # makedirs creates missing parents too and tolerates an existing directory
    out_dir = os.path.join(os.getcwd(), data_out_path)
    os.makedirs(out_dir, exist_ok=True)

    # Saving the cleaned streets. index=True writes the 'completed_time' index
    # as a column explicitly, because locate_streets() reads that column back.
    df.to_file(os.path.join(out_dir, file_out_name), driver='GeoJSON', index=True)


In [6]:
#imports neighbourhoods shapefile and cleans it up. saves just the necessary columns. no calculations.
def neighbourhoods_prep():
    """Reduce the neighbourhoods shapefile to the columns needed downstream.

    Reads 'Neighbourhoods.shp' from 'data_path' (described by the original
    author as the Toronto Open Data dataset), keeps only the columns in
    'columns_keep' and renames them to 'name_neigh', 'id', 'area' and
    'geometry'. It removes every digit, '+', '(' and ')' character from the
    names and saves the result as 'file_out_name' inside 'data_out_path'.
    Paths are resolved against the current working directory.
    """
    print('preparing the neighbourhood data')
    data_path = r'src/Neighbourhoods'
    file_name = r'Neighbourhoods.shp'
    data_out_path = r'src/17_Neighbourhoods_cleaned'
    file_out_name = r'Neighbourhoods_cleaned.shp'

    # selecting columns to keep, and what to rename them
    columns_keep = ['FIELD_7', 'FIELD_2', 'FIELD_14', 'geometry']
    rename = ['name_neigh', 'id', 'area', 'geometry']
    renaming = dict(zip(columns_keep, rename))

    # Importing the neighbourhoods shapefile
    print('reading file')
    neighbourhoods = gpd.read_file(os.path.join(os.getcwd(), data_path, file_name))
    neighbourhoods = neighbourhoods[columns_keep].rename(columns=renaming)

    # removing the ID number from the neighbourhood name column
    print('renaming columns')
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # NOTE(review): this is a character class, so it strips every digit, '+',
    # '(' and ')' anywhere in the name and leaves surrounding whitespace - confirm intended
    neighbourhoods['name_neigh'] = neighbourhoods['name_neigh'].str.replace(r'[\d+()]', '', regex=True)

    # makedirs creates missing parents too and tolerates an existing directory
    out_dir = os.path.join(os.getcwd(), data_out_path)
    os.makedirs(out_dir, exist_ok=True)

    # Saving the cleaned neighbourhoods DF
    print('saving file')
    neighbourhoods.to_file(os.path.join(out_dir, file_out_name))

In [7]:
#locates each cleaned street into a neighbourhood, using centroid and a spatial join
#it outputs a multiindex df with the total length of streets cleaned per hourly bin, neighbourhood and route.
def locate_streets():
    """Assign each cleaned street to a neighbourhood and aggregate by hour.

    Reads the cleaned streets GeoJSON and the cleaned neighbourhoods shapefile,
    places every street in the neighbourhood polygon that contains the
    street's centroid, and then groups by hourly bin of the completion time,
    neighbourhood and 'route_name'. Each group keeps the first 'routetype',
    the summed 'length' and the neighbourhood polygon as its geometry. The
    result is saved as 'data_out_file' inside 'data_out_path'. Paths are
    resolved against the current working directory.
    """
    from shapely.geometry import LineString
    print('locating cleaned streets inside each neighbourhood')

    data_path = r'src/streets_cleaned'
    file_name = r'17_streets_cleaned.geojson'

    neigh_data_path = r'src/17_Neighbourhoods_cleaned'
    neigh_file_name = r'Neighbourhoods_cleaned.shp'
    data_out_path = r'src/17_joined_streets'
    data_out_file = r'joined_streets.geojson'

    # loading geojson and turning the completion time into the index
    print('reading streets')
    streets_cleaned = gpd.read_file(os.path.join(os.getcwd(), data_path, file_name))
    print('converting to datetime and dropping columns')
    streets_cleaned['completed_time'] = pd.to_datetime(streets_cleaned['completed_time'])
    streets_cleaned.index = pd.DatetimeIndex(streets_cleaned['completed_time'])
    streets_cleaned = streets_cleaned.drop(columns=['completed_time'])

    # Making sure that geometry displays linestring
    # NOTE(review): presumably every geometry is a single-part line here - confirm
    print('applying geometry')
    streets_cleaned['geometry'] = streets_cleaned.geometry.apply(LineString)

    # Creating a centroid column and making it the active geometry. Previously
    # the centroid was computed but never used, so the join tested the whole
    # line and silently dropped streets that cross a neighbourhood boundary.
    print('locating based on centroid')
    streets_cleaned['centroid'] = streets_cleaned['geometry'].centroid
    street_points = streets_cleaned.set_geometry('centroid')

    # loading the cleaned neighbourhoods file, and saving the geometry which will be used later
    print('loading neighbourhoods file')
    cleaned_neighbourhoods = gpd.read_file(os.path.join(os.getcwd(), neigh_data_path, neigh_file_name))
    cleaned_neighbourhoods['savedgeom'] = cleaned_neighbourhoods.geometry

    # localizing each cleaned street inside a neighbourhood by spatial join
    print('performing spatial join')
    try:
        joined_df = gpd.sjoin(left_df=street_points,
                              right_df=cleaned_neighbourhoods,
                              predicate='within')
    except TypeError:
        # older geopandas releases only know the since-removed 'op' keyword
        joined_df = gpd.sjoin(left_df=street_points,
                              right_df=cleaned_neighbourhoods,
                              op='within')
    joined_df = joined_df.drop(columns=['index_right', 'id', 'area'])
    joined_df = joined_df.rename(columns={'name_neigh': 'neighbourhood'})

    # aggregating into groups, and having multiindex
    # NOTE(review): recent pandas prefers the lowercase alias 'h' over 'H'
    print('aggregating into multiindex')
    grouped_streets = joined_df.groupby([pd.Grouper(freq='H'), 'neighbourhood', 'route_name']).agg(
        routetype=('routetype', 'first'),
        length=('length', 'sum'),
        geometry=('savedgeom', 'first'))

    # after grouping, it is a simple dataframe. converting into a geodataframe now:
    print('converting into gpd')
    grouped_streets = gpd.GeoDataFrame(grouped_streets, geometry='geometry')
    # Authority string instead of the deprecated {'init': ...} dict;
    # allow_override keeps the previous "force this CRS" semantics.
    grouped_streets = grouped_streets.set_crs('EPSG:4326', allow_override=True)

    # saving df
    print('saving df')
    out_dir = os.path.join(os.getcwd(), data_out_path)
    # makedirs creates missing parents too and tolerates an existing directory
    os.makedirs(out_dir, exist_ok=True)
    grouped_streets.to_file(os.path.join(out_dir, data_out_file), driver='GeoJSON')

In [8]:
def main():
    """Run the whole pipeline, one stage after the other.

    Order: parse and trim the raw plow files, clean the parsed streets,
    prepare the neighbourhoods shapefile, then locate the streets inside the
    neighbourhoods.
    """
    print('starting main script')
    pipeline = (
        file_parser_cleaner,
        data_cleaner,
        neighbourhoods_prep,
        locate_streets,
    )
    for stage in pipeline:
        stage()

In [9]:
# Run the full pipeline: parse, clean, prepare neighbourhoods, then locate streets.
main()

starting main script
parsing the files now
cleaning the plow datae 3 of 3 - - - - 100.00% completed


  return _prepare_from_string(" ".join(pjargs))


preparing the neighbourhood data
reading file
renaming columns
saving file
locating cleaned streets inside each neighbourhood
reading streets
converting to datetime and dropping columns
applying geometry
locating based on centroid





loading neighbourhoods file
performing spatial join
aggregating into multiindex
converting into gpd


  return _prepare_from_string(" ".join(pjargs))


saving df
