In [13]:
import os, sys
import random
import shutil

import pandas as pd
import pymongo

In [50]:
# Directory that is scanned for the downloaded CSV files.
DATA_DIR = "../data/files"
# Directory that is wiped, recreated and then filled with the sampled CSV files.
OUTPUT_DIR = "../data/samples"
# Number of dataset documents to draw from the matching metadata.
SAMPLE_SIZE = 300
# Prefix of the "Quick Preview" links; a resource id plus ".csv" is appended to it.
VIEWER_PREFIX = "https://docs.google.com/viewer?url=https://student.cs.uwaterloo.ca/~c589liu/samples/"
# Prefix of the dataset "URL" column; the dataset id is appended to it.
URL_PREFIX = "https://open.canada.ca/data/en/dataset/"

In [3]:
# Keep only ".csv" files whose size is strictly greater than 1 KiB and at
# most 100 MiB; everything else in DATA_DIR is ignored.
MIN_CSV_BYTES = 1024
MAX_CSV_BYTES = 100 * 1024 ** 2  # plain int: the Python 2 "100L" literal is a SyntaxError on Python 3

csv_files = []
for file_name in os.listdir(DATA_DIR):
    # Match on ".csv" (with the dot) so the names agree with the
    # `id + ".csv"` paths that later cells look up.
    if not file_name.endswith(".csv"):
        continue
    # Stat each candidate once instead of once per bound.
    size_in_bytes = os.path.getsize(os.path.join(DATA_DIR, file_name))
    if MIN_CSV_BYTES < size_in_bytes <= MAX_CSV_BYTES:
        csv_files.append(file_name)

print("There are", len(csv_files), "files after filtering")

There are 13371 files after filtering


In [4]:
# Drop the extension from every retained file name; the remaining stem is
# what gets matched against resource ids in the metadata query.
file_uuids = []
for csv_name in csv_files:
    stem, _extension = os.path.splitext(csv_name)
    file_uuids.append(stem)

In [5]:
# Connect to the MongoDB server on localhost and pick the collection that
# stores the dataset metadata documents.
mongo_client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
db = mongo_client.get_database('opencanada')
metadata_collection = db.get_collection('metadata')

In [6]:
# Load every metadata document that has at least one resource whose id is
# among the retained CSV file stems.
resource_filter = {"resources.id": {"$in": file_uuids}}
matching_documents = metadata_collection.find(resource_filter)
metadata = list(matching_documents)

In [7]:
# Draw the sample WITHOUT replacement: random.choices() picks with
# replacement, so the same dataset could be selected several times and show
# up as duplicated rows in the table. The size is capped at the number of
# available documents because random.sample() raises ValueError when asked
# for more items than the population holds.
samples = random.sample(metadata, k=min(SAMPLE_SIZE, len(metadata)))

In [8]:
# For every resource of every sampled dataset, build the path its CSV would
# have inside DATA_DIR and keep the paths that actually exist on disk.
candidate_paths = (
    os.path.join(DATA_DIR, resource["id"] + ".csv")
    for dataset in samples
    for resource in dataset["resources"]
)
files = [csv_path for csv_path in candidate_paths if os.path.exists(csv_path)]

In [9]:
# Start from an empty output directory: remove whatever a previous run left
# behind, then recreate it. Standard-library calls replace the interpolated
# `rm -rf` / `mkdir` shell strings, so paths need no shell quoting, failures
# raise instead of being hidden in an exit status, and missing parent
# directories are created.
if os.path.isdir(OUTPUT_DIR):
    shutil.rmtree(OUTPUT_DIR)
os.makedirs(OUTPUT_DIR)

0

In [10]:
# Copy each selected CSV into OUTPUT_DIR under its original file name.
# shutil.copy() avoids building a `cp` shell command from the path, which
# breaks on spaces or shell metacharacters and fails silently.
for source_path in files:
    shutil.copy(source_path, OUTPUT_DIR)

In [66]:
# Flatten the sampled datasets into rows: one "+" row per dataset followed by
# one "-" row per resource of that dataset. Cells that do not apply to a row
# kind are left as empty strings.
table = []
for dataset in samples:
    dataset_row = [
        "+",
        dataset["title"],
        ", ".join(dataset["subject"]),
        dataset["date_published"],
        "",
        "",
        dataset["id"],
        URL_PREFIX + dataset["id"],
        "",
    ]
    table.append(dataset_row)

    for resource in dataset["resources"]:
        csv_name = resource["id"] + ".csv"
        # Only resources whose CSV exists in DATA_DIR get a preview link.
        if os.path.exists(os.path.join(DATA_DIR, csv_name)):
            preview_url = VIEWER_PREFIX + csv_name
        else:
            preview_url = ""
        resource_row = [
            "-",
            resource["name"],
            "",
            "",
            ", ".join(resource["language"]),
            resource["format"],
            resource["id"],
            resource["url"],
            preview_url,
        ]
        table.append(resource_row)

In [11]:
# Close the MongoDB client opened for the metadata query.
mongo_client.close()

In [67]:
# Column headers, listed in the same order as the fields of each row in
# `table`; the first column (the "+"/"-" row marker) has an empty header.
column_names = ['', 'Name', 'Keywords', 'Date', 'Language', 'Format', 'ID', 'URL', 'Quick Preview']
df = pd.DataFrame(data=table, columns=column_names)

In [68]:
# Render the assembled table as this cell's output.
df

Unnamed: 0,Unnamed: 1,Name,Keywords,Date,Language,Format,ID,URL,Quick Preview
0,+,Thawing area,"form_descriptors, nature_and_environment, scie...",2017-01-31 00:00:00,,,028d9f45-17ee-44d2-995f-035ef7184633,https://open.canada.ca/data/en/dataset/028d9f4...,
1,-,Thawing area,,,fr,CSV,b02a4d90-166e-4c2d-a346-a52f1a104861,https://ws.mapserver.transports.gouv.qc.ca/swt...,https://docs.google.com/viewer?url=https://stu...
2,-,Thawing area,,,fr,GEOJSON,5be02c19-dbf1-4c6f-9b7d-c5d99c784b66,https://ws.mapserver.transports.gouv.qc.ca/swt...,
3,-,Thawing area,,,fr,GPKG,bddf7132-e16c-473d-8692-10e1e5f791d7,https://ws.mapserver.transports.gouv.qc.ca/swt...,
4,-,Thawing area,,,fr,HTML,469c744e-e9ff-4909-9e0b-1ea0e9b44a31,https://geoegl.msp.gouv.qc.ca/igo2/apercu-qc/?...,
...,...,...,...,...,...,...,...,...,...
2467,-,"Population and dwelling counts, for Canada and...",,,fr,CSV,b32ed8e5-d91a-4014-83ba-37b87ce74614,http://www12.statcan.gc.ca/census-recensement/...,https://docs.google.com/viewer?url=https://stu...
2468,-,"Population and dwelling counts, for Canada and...",,,en,other,9951f98f-30d6-47e3-9ab2-598e2d755d4b,http://www12.statcan.gc.ca/census-recensement/...,
2469,-,"Population and dwelling counts, for Canada and...",,,fr,other,f1ffce13-7003-4d6d-9e95-ebba33a8bf30,http://www12.statcan.gc.ca/census-recensement/...,
2470,-,Supporting Document,,,en,HTML,ca0cdcac-b45e-495a-81e3-fc51b8212cb1,http://www12.statcan.gc.ca/census-recensement/...,


In [69]:
# Write the table to a CSV file, omitting the DataFrame's row index.
# NOTE(review): this path climbs two directories ("../../data") while
# DATA_DIR and OUTPUT_DIR use "../data" — confirm the destination is intended.
df.to_csv("../../data/samples.csv", index=False)