# Masterdata

**openclean** provides easy access to several different datasets that are available online for download. Below are a few examples.

In [1]:
# All datasets can be downloaded and accessed via the
# masterdata module.

import openclean.data.masterdata as masterdata

In [2]:
# Walk over every repository known to the masterdata module and print its
# name and identifier, followed by an indented line per dataset it contains.
for repo in masterdata.repositories():
    repo_label = '{} ({})'.format(repo.name(), repo.identifier())
    print(repo_label)
    for ds in repo.datasets():
        ds_label = '\t{} ({})'.format(ds.name(), ds.identifier())
        print(ds_label)

restcountries.eu (restcountries)
	Countries of the World (countries)
Encyclopaedia Britannica (britannica)
	United States Cities (us_cities)


## restcountries.eu

Dataset of countries in the world that is available from the [restcountries.eu project](http://restcountries.eu/).

In [3]:
# Download the countries dataset. By default, the downloaded data is stored
# in the $HOME/.openclean/masterdata folder.
#
# The dataset is addressed by the repository identifier ('restcountries') and
# the dataset identifier ('countries') that are printed by the listing above.
# NOTE(review): replace=True presumably overwrites an existing local copy so
# that the notebook always starts from a fresh download - confirm.

masterdata.download('restcountries', 'countries', replace=True)

In [4]:
# Load the downloaded countries dataset and preview its first rows. No
# version is given here (a later cell loads a specific version explicitly).
countries = masterdata.load('restcountries', 'countries')
countries.head()

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
1,Afghanistan,AF,AFG,Kabul,Asia,Southern Asia
4,Åland Islands,AX,ALA,Mariehamn,Europe,Northern Europe
5,Albania,AL,ALB,Tirana,Europe,Southern Europe
64,Algeria,DZ,DZA,Algiers,Africa,Northern Africa
10,American Samoa,AS,ASM,Pago Pago,Oceania,Polynesia


In [5]:
# Show the row(s) whose 'name' column holds the long-form country name that
# is renamed in the next section.
countries.loc[countries['name'] == 'Venezuela (Bolivarian Republic of)']

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
239,Venezuela (Bolivarian Republic of),VE,VEN,Caracas,Americas,South America


### Modified Masterdata Copies

The user has the option to modify the downloaded data and store the updated dataset version in the local repository.

In [6]:
from openclean.operator.transform.update import update

# Rename 'Venezuela (Bolivarian Republic of)' to 'Venezuela' by applying a
# value mapping to the 'name' column, then display the renamed row(s).
# The result is assigned back to `countries`, so the following cells operate
# on the modified data.
# NOTE(review): the recorded outputs of the following cells (head() and the
# diff summary reporting 241 updated rows) show different names in many rows,
# not only the Venezuela row - confirm that `update` with a dict mapping
# leaves unmapped values unchanged.
countries = update(countries, 'name', {'Venezuela (Bolivarian Republic of)': 'Venezuela'})
countries.loc[countries['name'] == 'Venezuela']

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
239,Venezuela,VE,VEN,Caracas,Americas,South America


In [7]:
# Preview the first rows of the modified dataset to compare them with the
# preview of the originally downloaded data.
countries.head()

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
1,Aruba,AF,AFG,Kabul,Asia,Southern Asia
4,Afghanistan,AX,ALA,Mariehamn,Europe,Northern Europe
5,Angola,AL,ALB,Tirana,Europe,Southern Europe
64,Anguilla,DZ,DZA,Algiers,Africa,Northern Africa
10,Åland Islands,AS,ASM,Pago Pago,Oceania,Polynesia


In [8]:
# Store the modified data frame as the updated version of the countries
# dataset in the local repository.
masterdata.update('restcountries', 'countries', countries)

In [9]:
# Reload the dataset, explicitly requesting version 1, and check that the
# renamed entry can be found under its new name.
# NOTE(review): version 1 is assumed to be the snapshot written by the
# preceding update call (the download being version 0) - confirm.
countries = masterdata.load('restcountries', 'countries', version=1)
countries.loc[countries['name'] == 'Venezuela']

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
239,Venezuela,VE,VEN,Caracas,Americas,South America


In [10]:
# List the snapshots recorded for the countries dataset, printing one line
# per snapshot.
snapshots = masterdata.snapshots('restcountries', 'countries')
for s in snapshots:
    print(s)

<Snapshot (version=0 description='' at=2020-10-30 13:35:15.657955-04:00)>
<Snapshot (version=1 description='' at=2020-10-30 13:35:16.116390-04:00)>


In [11]:
# Compute the differences between dataset versions 0 and 1 and print a
# summary of the schema and data changes.
prov = masterdata.diff('restcountries', 'countries', 0, 1)
prov.describe()

Schema Changes
Inserted Columns : 0
Deleted Columns  : 0
Moved Columns    : 0
Renamed Columns  : 0

Data Changes
Inserted Rows    : 0
Deleted Rows     : 0
Moved Rows       : 0
Updated Rows     : 241
Updated Values   : 241


In [12]:
# Take the first entry from the rows that the diff reports as updated.
updated_rows = prov.rows().update()
row = updated_rows[0]

# The first cell of that row yields the pair of values that is unpacked here
# as the old and the new value.
old_val, new_val = row.cells[0].values()

message = "Country name updated from '{}' to '{}'"
print(message.format(old_val, new_val))

Country name updated from 'Aruba' to 'French Southern Territories'


## Encyclopaedia Britannica

Datasets that are extracted from Web pages of the Encyclopaedia Britannica.

In [13]:
# Download the dataset with US city names, addressed by the repository
# identifier ('britannica') and the dataset identifier ('us_cities').
# NOTE(review): replace=True presumably overwrites an existing local copy -
# confirm.
masterdata.download('britannica', 'us_cities', replace=True)

In [14]:
# Load the downloaded US cities dataset and preview its first rows.
us_cities = masterdata.load('britannica', 'us_cities')
us_cities.head()

Unnamed: 0,city,state
96,Auburn,Alabama
1155,Montgomery,Alabama
62,Anniston,Alabama
1635,Sheffield,Alabama
345,Clanton,Alabama
