# Download metadata for top 5 datasets from Socrata portals, get number of datasets

After manually creating a list of state or city portals, this notebook uses Socrata's Discovery API to download metadata for the top 5 datasets (by page views) from each portal. It downloads a select list of metadata elements:
 'id','name','description','category','downloadCount','viewCount','licenseId','publicationDate','rowsUpdateAt','provenance','domain'

Note: `rowsUpdateAt` is listed above but is not among the columns currently written to the CSV file.

In [1]:
#import the required libraries
import csv
import requests
import json as json
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

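## Do you want state or city portals?

Run only one of the following two cells. Both assign `universe` and `stp`, so whichever cell runs last determines which portals are queried.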

In [2]:
# State-level Socrata portal domains to iterate through.
# `universe` labels this selection and is used as the prefix of the output
# CSV file names.
universe = 'State'

stp = [
    "opendata.utah.gov",
    "opendata.maryland.gov",
    "data.wa.gov",
    "data.vermont.gov",
    "data.texas.gov",
    "data.pa.gov",
    "data.oregon.gov",
    "data.ny.gov",
    "data.nj.gov",
    "data.mo.gov",
    "data.michigan.gov",
    "data.iowa.gov",
    "data.hawaii.gov",
    "data.delaware.gov",
    "data.ct.gov",
    "data.colorado.gov",
]



In [3]:
# City-level Socrata portal domains to iterate through.
# Running this cell replaces any earlier `universe` / `stp` selection;
# `universe` is used as the prefix of the output CSV file names.
universe = "City"

stp = [
    "pip.nashville.gov",
    "opendata.cityofmesquite.com",
    "opendata.cityofhenderson.com",
    "opendata.ci.richmond.ca.us",
    "hartstat.hartford.gov",
    "data.urbanaillinois.us",
    "data.tuscaloosa.com",
    "data.topeka.org",
    "data.somervillema.gov",
    "data.seattle.gov",
    "data.roseville.ca.us",
    "data.richmondgov.com",
    "data.redmond.gov",
    "data.readingpa.gov",
    "data.providenceri.gov",
    "data.norfolk.gov",
    "data.nashville.gov",
    "data.montgomeryal.gov",
    "data.littlerock.gov",
    "data.lacity.org",
    "data.honolulu.gov",
    "data.hartford.gov",
    "data.cityoftacoma.org",
    "data.cityoforlando.net",
    "data.cityofnewyork.us",
    "data.cityofgp.com",
    "data.cityofgainesville.org",
    "data.cityofevanston.org",
    "data.cityofchicago.org",
    "data.cityofberkeley.info",
    "data.cincinnati-oh.gov",
]


### Download json data and select metadata elements

Creates a CSV file named `<universe>-portals-top5-METADATA.csv` in the current working directory and writes one row of metadata per dataset to it.

In [5]:
# Download metadata for up to 5 datasets from every portal in `stp` and write
# it to '<universe>-portals-top5-METADATA.csv', one row per dataset.
#
# The file is opened as UTF-8 text and every value is written as str. Calling
# .encode("utf-8") on a value first would make csv.writer emit the bytes repr
# (b'...') under Python 3 instead of the text itself.
# The `with` block guarantees the file is closed even if a request fails.
with open(universe+'-portals-top5-METADATA.csv', 'w', newline='', encoding='utf-8') as metadata:
    writer = csv.writer(metadata)  # one writer reused for every row
    # NOTE(review): the 'publicationDate' column is filled from the resource's
    # 'createdAt' field below -- confirm that this is the intended value.
    writer.writerow(['id','name','description','category','downloadCount','viewCount','licenseId','publicationDate','provenance','domain'])

    for s in stp:
        # Build the query according to Socrata's discovery API convention.
        # NOTE(review): no explicit sort order is requested, so "top 5" relies
        # on the API's default ranking -- confirm it matches the intent.
        response = requests.get(
            'https://api.us.socrata.com/api/catalog/v1',
            params={'domains': s, 'search_context': s, 'limit': 5, 'only': 'datasets'},
            timeout=30,
        )
        # Surface HTTP failures here rather than as an opaque KeyError below.
        response.raise_for_status()
        catalog = response.json()

        for r in catalog['results']:
            resource = r['resource']
            writer.writerow([
                resource['id'],
                resource['name'],
                resource.get('description', 'N/A'),
                r['classification'].get('domain_category', 'N/A'),
                resource['download_count'],
                resource['page_views']['page_views_total'],
                r['metadata'].get('license', 'N/A'),
                resource['createdAt'],
                resource['provenance'],
                r['metadata']['domain'],
            ])

### The following cell gives the number of datasets for the list of portals

In [3]:
# Write the number of datasets reported for each portal in `stp` to
# '<universe>-portals-datasetnum.csv' (columns: domain, number of datasets).
# The `with` block guarantees the file is closed even if a request fails.
with open(universe+'-portals-datasetnum.csv', 'w', newline='', encoding='utf-8') as metadata:
    writer = csv.writer(metadata)  # one writer reused for every row
    writer.writerow(['domain','number of datasets'])

    for s in stp:
        # Query the discovery API for this portal's datasets; only the
        # 'resultSetSize' field of the response is used.
        response = requests.get(
            'https://api.us.socrata.com/api/catalog/v1',
            params={'domains': s, 'only': 'datasets'},
            timeout=30,
        )
        # Surface HTTP failures here rather than as an opaque KeyError below.
        response.raise_for_status()
        writer.writerow([s, response.json()['resultSetSize']])