# Masterdata

**openclean** provides easy access to several different datasets that are available online for download. Below are a few examples.

In [1]:
# All datasets can be downloaded and accessed via a data repository.
# Print the repository names and some example datasets.

from itertools import islice

from openclean.data.source.registry import repositories

# The list of datasets for the Socrata Open Data API is very long, so
# only the first MAX_DATASETS entries of each catalog are printed.
MAX_DATASETS = 10

for repo in repositories():
    print('{} ({})'.format(repo.name, repo.identifier))
    # islice stops consuming the catalog once the limit is reached, so no
    # manual counter or break is needed.
    for ds in islice(repo.catalog(), MAX_DATASETS):
        print('\t{} ({})'.format(ds.name, ds.identifier))

Encyclopaedia Britannica (britannica)
	United States Cities (us_cities)
restcountries.eu (restcountries)
	Countries of the World (countries)
Socrata Open Data API (socrata)
	Permitting Dashboard Full Dataset (mcm3-xbid)
	Time Dataset (pt25-zdgd)
	Test Socrata Push 2 (saxa-vwk9)
	test 1 (k8qb-prdi)
	Test Socrata Push (i6cn-zc28)
	Project Full Dataset - Copy 1 (dx3v-2pvx)
	Accountability Scorecard Time to Complete Data (v38y-irda)
	Project Location Testing 1 (5xty-5v4b)
	Crimes (mjmi-qtn7)
	Licenses (m8b2-td2c)


## restcountries.eu

Dataset of countries in the world that is available from the [restcountries.eu project](http://restcountries.eu/).

In [2]:
# By default, downloaded data is stored in the $HOME/.openclean/masterdata
# folder. Setting the environment variable 'OPENCLEAN_MASTERDATA_DIR'
# changes that default; here it is pointed at a local folder.

import os

from openclean.config import ENV_MASTERDATA_DIR

MASTERDATA_DIR = './data'
os.environ[ENV_MASTERDATA_DIR] = MASTERDATA_DIR

In [3]:
# Load the current listing of country names, preview it, and commit it to
# a 'restcountries' masterdata archive keyed on the 'alpha3Code' column.

from openclean.data.source.restcountries import RestcountriesRepository, COUNTRIES

import openclean.data.masterdata as masterdata

countries_dataset = RestcountriesRepository().dataset(COUNTRIES)
countries = countries_dataset.load()

print(countries.head())

archive = masterdata.create(
    'restcountries',
    primary_key=['alpha3Code'],
    replace=True
)
archive.commit(countries)

             name alpha2Code alpha3Code    capital   region        subregion
0     Afghanistan         AF        AFG      Kabul     Asia    Southern Asia
1   Åland Islands         AX        ALA  Mariehamn   Europe  Northern Europe
2         Albania         AL        ALB     Tirana   Europe  Southern Europe
3         Algeria         DZ        DZA    Algiers   Africa  Northern Africa
4  American Samoa         AS        ASM  Pago Pago  Oceania        Polynesia


<Snapshot (version=0 description='' at=2021-01-11 16:07:13.119980-05:00)>

In [4]:
# Select the row for Venezuela using its long-form name from the download.
is_venezuela = countries['name'] == 'Venezuela (Bolivarian Republic of)'
countries.loc[is_venezuela]

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
243,Venezuela (Bolivarian Republic of),VE,VEN,Caracas,Americas,South America


### Modified Masterdata Copies

The user has the option to modify the downloaded data and store the updated dataset version in the local repository.

In [5]:
from openclean.operator.transform.update import update

# Rename 'Venezuela (Bolivarian Republic of)' to 'Venezuela' in the 'name'
# column, then show the renamed row.
name_mapping = {'Venezuela (Bolivarian Republic of)': 'Venezuela'}
countries = update(countries, 'name', name_mapping)
countries.loc[countries['name'] == 'Venezuela']

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
243,Venezuela,VE,VEN,Caracas,Americas,South America


In [6]:
# Preview the first five rows of the modified dataset.
countries.head(n=5)

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
0,Afghanistan,AF,AFG,Kabul,Asia,Southern Asia
1,Åland Islands,AX,ALA,Mariehamn,Europe,Northern Europe
2,Albania,AL,ALB,Tirana,Europe,Southern Europe
3,Algeria,DZ,DZA,Algiers,Africa,Northern Africa
4,American Samoa,AS,ASM,Pago Pago,Oceania,Polynesia


In [7]:
# Commit the modified dataset to the existing 'restcountries' archive and
# display the snapshot returned by the commit.
archive = masterdata.get('restcountries')
snapshot = archive.commit(countries)
snapshot

<Snapshot (version=1 description='' at=2021-01-11 16:07:13.335825-05:00)>

In [8]:
# Check out the dataset from the archive and look up the renamed entry.
countries = masterdata.get('restcountries').checkout()
is_renamed = countries['name'] == 'Venezuela'
countries.loc[is_renamed]

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
239,Venezuela,VE,VEN,Caracas,Americas,South America


In [9]:
# Print every snapshot recorded in the 'restcountries' archive.
restcountries_archive = masterdata.get('restcountries')
snapshots = restcountries_archive.snapshots()
for snapshot in snapshots:
    print(snapshot)

<Snapshot (version=0 description='' at=2021-01-11 16:07:13.119980-05:00)>
<Snapshot (version=1 description='' at=2021-01-11 16:07:13.335825-05:00)>


In [10]:
# Compute the difference between snapshot versions 0 and 1 and print a
# summary of the changes.
restcountries_archive = masterdata.get('restcountries')
prov = restcountries_archive.diff(0, 1)
prov.describe()

Schema Changes
Inserted Columns : 0
Deleted Columns  : 0
Moved Columns    : 0
Renamed Columns  : 0

Data Changes
Inserted Rows    : 0
Deleted Rows     : 0
Moved Rows       : 0
Updated Rows     : 1
Updated Values   : 1


In [11]:
# Take the first updated row from the diff and unpack the old and new
# value of its cell at key 0.
# NOTE(review): cells[0] presumably corresponds to the 'name' column — confirm.
row = prov.rows().update()[0]
changed_cell = row.cells[0]
old_val, new_val = changed_cell.values()

print("Country name updated from '{}' to '{}'".format(old_val, new_val))

Country name updated from 'Venezuela (Bolivarian Republic of)' to 'Venezuela'


## Encyclopaedia Britannica

Datasets that are extracted from Web pages of the Encyclopaedia Britannica.

In [12]:
# Load the dataset of US city names and preview the first rows.

from openclean.data.source.britannica import EncyclopaediaBritannica, US_CITIES

us_cities_dataset = EncyclopaediaBritannica().dataset(US_CITIES)
us_cities = us_cities_dataset.load()
us_cities.head()

Unnamed: 0,city,state
0,Birmingham,Alabama
1,Bessemer,Alabama
2,Scottsboro,Alabama
3,Florence,Alabama
4,Athens,Alabama
