In [3]:
import os
import json
import shutil
import codecs
import requests
import distutils
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
DATA_PATH = './data'
TORONTO_OPEN_DATA_URL = 'https://ckan0.cf.opendata.inter.prod-toronto.ca'

In [5]:
def download_file(url, path, filename):
    response = requests.get(url)
    
    if response.status_code == 200:
        with open(f'{path}/{filename}', 'wb') as f:
            f.write(response.content)
    else:
        raise Exception(f"Failed to download file. Status code: {response.status_code}")

In [4]:
def scrap_toronto_open_data(start_idx=None):
    
    datasets_url = TORONTO_OPEN_DATA_URL + '/api/3/action/package_list'
    datasets = requests.get(datasets_url).json()['result']
    
    os.makedirs(f'{DATA_PATH}/excerpts', exist_ok=True)
    
    for dataset_id in tqdm(datasets if start_idx is None else datasets[start_idx:]):
        
        dataset_url = TORONTO_OPEN_DATA_URL + '/api/3/action/package_show'
        params = {"id": dataset_id}
        
        dataset_package = requests.get(dataset_url, params=params).json()['result']
        
        if (
            distutils.util.strtobool(dataset_package.get('is_retired', 'false')) or
            'CSV' not in dataset_package['formats']
        ): 
            continue
        
        os.makedirs(f'{DATA_PATH}/datasets/{dataset_id}/resources')
        
        resources = list(filter(lambda res: res['format'] == 'CSV', dataset_package['resources']))
        resources = [res for res in resources if ' - 2945' not in res['name'] and ' - 2952' not in res['name']]
        
        names_stripped = [(res['name'][:-4] if res['name'].endswith('.csv') else res['name']) for res in resources]
        _, unique_indices = np.unique(names_stripped, return_index=True)
        resources = list(np.array(resources)[unique_indices])

        n_failures = 0  
        for resource in resources:
            try:

                resource_name = resource['name']
                if not resource_name.endswith('.csv'):
                    resource_name = resource['name'] + '.csv'
                if ' - 4326' in resource_name:
                    resource_name.replace(' - 4326', '')
                
                download_file(
                    resource['url'], 
                    f'{DATA_PATH}/datasets/{dataset_id}/resources', 
                    resource_name
                )
            except:
                n_failures += 1
                print(f'Failed to download {resource["name"]}.csv in {dataset_id}')
        
        if n_failures == len(resources) or len(resources)==0:
            shutil.rmtree(f'{DATA_PATH}/datasets/{dataset_id}')
            continue
        
        with open(f'{DATA_PATH}/datasets/{dataset_id}/description.json', 'w') as f:
            json.dump(dataset_package, f)
            
        with open(f'{DATA_PATH}/excerpts/{dataset_id}.txt', 'w') as f:
            f.writelines(dataset_package['notes'])
            

In [9]:
scrap_toronto_open_data()

  0%|          | 2/489 [00:00<00:24, 19.58it/s]

  4%|▍         | 19/489 [00:05<02:21,  3.32it/s]

Failed to download Disease Sex and Age groups 2018.csv in annual-summary-of-reportable-communicable-diseases


 16%|█▌        | 76/489 [00:33<02:17,  3.01it/s]

Failed to download Civic Issues Readme.csv in civic-issue-questionnaire


 17%|█▋        | 81/489 [00:42<16:04,  2.36s/it]

Failed to download Committee of Adjustments Applications since 2017.csv.csv in committee-of-adjustment-applications


 32%|███▏      | 156/489 [01:35<01:04,  5.18it/s]

In [6]:
scrap_toronto_open_data(start_idx=170)

  0%|          | 0/319 [00:00<?, ?it/s]

 11%|█         | 35/319 [00:24<06:45,  1.43s/it]

Failed to download Civic Issues.csv in metadata-catalog
Failed to download Dataset Category.csv in metadata-catalog
Failed to download Formats.csv in metadata-catalog
Failed to download Owner Division.csv in metadata-catalog
Failed to download Refresh Rate.csv in metadata-catalog
Failed to download Topics.csv in metadata-catalog


100%|██████████| 319/319 [02:40<00:00,  1.99it/s]


In [6]:
def preprocess_datasets(start_idx=None):
    datasets = os.listdir(f'{DATA_PATH}/datasets')
    for dataset in tqdm(datasets if start_idx is None else datasets[start_idx:]):
        for resource in os.listdir(f'{DATA_PATH}/datasets/{dataset}/resources'):
            with codecs.open(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}', 'r', encoding='utf-8', errors='ignore') as file:
                text = file.read()

            with codecs.open(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}', 'w', encoding='utf-8') as file:
                file.write(text)

            df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')

            if 'geometry' in df.columns:
                df = df.drop('geometry', axis=1)
            
            df.to_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')

In [20]:
preprocess_datasets(start_idx=98)

  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
 18%|█▊        | 23/129 [00:57<04:24,  2.49s/it]


ParserError: Error tokenizing data. C error: Expected 5 fields in line 9, saw 10


In [27]:
preprocess_datasets(start_idx=98)

  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')


In [7]:
preprocess_datasets(start_idx=107)

  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
  df = pd.read_csv(f'{DATA_PATH}/datasets/{dataset}/resources/{resource}')
100%|██████████| 120/120 [02:50<00:00,  1.42s/it]
