In [1]:
import os

# Both the dataset directory and the master-data directory are set to the
# same relative folder. These variables are presumably read by openclean when
# it resolves files for load() / masterdata -- TODO confirm.
os.environ.update({
    'OPENCLEAN_DATA_DIR': '../data/',
    'OPENCLEAN_MASTERDATA_DIR': '../data/',
})

In [2]:
from openclean.data import load

# Load the dataset file '3bxy-wfk9.tsv.gz' into `df`.
# NOTE(review): the file is presumably looked up under OPENCLEAN_DATA_DIR -- confirm.
df = load('3bxy-wfk9.tsv.gz')

In [3]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
0,2015,KOR,Korea (Rep.),,rank,1
1,2015,DNK,Denmark,,rank,2
2,2015,ISL,Iceland,,rank,3
3,2015,GBR,United Kingdom,,rank,4
4,2015,SWE,Sweden,,rank,5


In [4]:
from openclean.data import masterdata

# Fetch the 'restcountries.eu' master data set, then load it as the reference
# table of countries. `load` is the function imported from openclean.data in
# an earlier cell.
# NOTE(review): the download runs on every execution of this cell -- confirm
# whether openclean skips it when a local copy already exists.
masterdata.download('restcountries.eu')
countries_groundtruth = load('restcountries.eu')
# Preview the reference table.
countries_groundtruth.head()

Unnamed: 0,name,alpha2Code,alpha3Code,capital,region,subregion
0,Afghanistan,AF,AFG,Kabul,Asia,Southern Asia
1,Åland Islands,AX,ALA,Mariehamn,Europe,Northern Europe
2,Albania,AL,ALB,Tirana,Europe,Southern Europe
3,Algeria,DZ,DZA,Algiers,Africa,Northern Africa
4,American Samoa,AS,ASM,Pago Pago,Oceania,Polynesia


In [5]:
from openclean.function.value.comp import lt
from openclean.function.value.normalize import divide_by_total
from openclean.profiling.anomalies.frequency import frequency_outliers
from openclean.profiling.distinct import distinct

# Distinct 'region' values of the reference table, with their counts
# normalized by the total so each region is reported as a share of all rows.
region_counts = distinct(countries_groundtruth, 'region')
top_regions = region_counts.normalize(divide_by_total)
print(top_regions)

# Regions selected by the lt(0.1) condition; judging by the recorded output
# these are the ones whose share is below 0.1.
rare_region = frequency_outliers(countries_groundtruth, 'region', lt(0.1))
print(rare_region)

Feature({'Africa': 0.24, 'Americas': 0.228, 'Europe': 0.212, 'Asia': 0.2, 'Oceania': 0.108, '': 0.008, 'Polar': 0.004})
['', 'Polar']


In [6]:
# Reference domains taken from the country master data: the 'name' column and
# the three-letter 'alpha3Code' column.
country_names = countries_groundtruth['name']
country_codes = countries_groundtruth['alpha3Code']

In [7]:
# Count the entries in the reference list of country names.
country_names.count()

250

In [8]:
from openclean.profiling.anomalies import domain_outliers
# Collect the values of df's 'country_name' column that are checked against the
# reference names and flagged; the recorded output lists names that are
# spelled differently from the reference table.
unknown_countries = domain_outliers(df, 'country_name', country_names)

In [9]:
# Display the country names flagged as outside the reference domain.
unknown_countries

['Congo (Dem. Rep.)',
 'Hong Kong, China',
 'Congo (Rep.)',
 'Cape Verde',
 'TFYR Macedonia',
 'St. Lucia',
 'United Kingdom',
 'Dominican Rep.',
 'St. Vincent and the Grenadines',
 'Iran (I.R.)',
 'Korea (Rep.)',
 'Moldova',
 'Bolivia',
 'Tanzania',
 'Antigua & Barbuda',
 'Macao, China',
 'Lao P.D.R.',
 'Syria',
 'St. Kitts and Nevis',
 'Venezuela',
 'United States',
 "C™te d'Ivoire",
 'Trinidad & Tobago']

In [10]:
# Same domain check as for the names, applied to the 'country_id' column
# against the reference three-letter codes.
unknown_codes = domain_outliers(df, 'country_id', country_codes)

In [11]:
# Display the country codes flagged as outside the reference domain.
unknown_codes

[]

In [12]:
from openclean.function.eval.predicate.domain import IsNotIn
# NOTE(review): this import rebinds the name `filter`, hiding Python's builtin
# of the same name for the rest of the notebook.
from openclean.operator.transform.filter import filter

# Restrict df to the rows selected by the IsNotIn predicate on 'country_name',
# i.e. rows whose name is not among the reference country names.
name_not_in_reference = IsNotIn('country_name', country_names)
df1 = filter(df, name_not_in_reference)

In [13]:
# Preview the rows whose country name was not found in the reference list.
df1.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
0,2015,KOR,Korea (Rep.),,rank,1
3,2015,GBR,United Kingdom,,rank,4
8,2015,HKG,"Hong Kong, China",,rank,9
14,2015,USA,United States,,rank,15
23,2015,MAC,"Macao, China",,rank,24


In [14]:
from openclean.data.transform import to_lookup
# Build a lookup from the reference table's 'alpha3Code' column to its 'name'
# column (displayed as a dict in the following cell).
code_to_name = to_lookup(countries_groundtruth, 'alpha3Code', 'name')

In [15]:
# Display the code-to-name lookup.
code_to_name

{'AFG': 'Afghanistan',
 'ALA': 'Åland Islands',
 'ALB': 'Albania',
 'DZA': 'Algeria',
 'ASM': 'American Samoa',
 'AND': 'Andorra',
 'AGO': 'Angola',
 'AIA': 'Anguilla',
 'ATA': 'Antarctica',
 'ATG': 'Antigua and Barbuda',
 'ARG': 'Argentina',
 'ARM': 'Armenia',
 'ABW': 'Aruba',
 'AUS': 'Australia',
 'AUT': 'Austria',
 'AZE': 'Azerbaijan',
 'BHS': 'Bahamas',
 'BHR': 'Bahrain',
 'BGD': 'Bangladesh',
 'BRB': 'Barbados',
 'BLR': 'Belarus',
 'BEL': 'Belgium',
 'BLZ': 'Belize',
 'BEN': 'Benin',
 'BMU': 'Bermuda',
 'BTN': 'Bhutan',
 'BOL': 'Bolivia (Plurinational State of)',
 'BES': 'Bonaire, Sint Eustatius and Saba',
 'BIH': 'Bosnia and Herzegovina',
 'BWA': 'Botswana',
 'BVT': 'Bouvet Island',
 'BRA': 'Brazil',
 'IOT': 'British Indian Ocean Territory',
 'UMI': 'United States Minor Outlying Islands',
 'VGB': 'Virgin Islands (British)',
 'VIR': 'Virgin Islands (U.S.)',
 'BRN': 'Brunei Darussalam',
 'BGR': 'Bulgaria',
 'BFA': 'Burkina Faso',
 'BDI': 'Burundi',
 'KHM': 'Cambodia',
 'CMR': 'Came

In [16]:
from openclean.function.replace import Lookup
from openclean.operator.transform.update import update

# Rewrite 'country_name' in df1 using a lookup that is keyed on each row's
# 'country_id' and resolved through code_to_name.
reference_name_for_code = Lookup('country_id', code_to_name)
df2 = update(df1, 'country_name', reference_name_for_code)

In [17]:
# Preview the rows after replacing names via the country-code lookup.
df2.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
0,2015,KOR,Korea (Republic of),,rank,1
3,2015,GBR,United Kingdom of Great Britain and Northern I...,,rank,4
8,2015,HKG,Hong Kong,,rank,9
14,2015,USA,United States of America,,rank,15
23,2015,MAC,Macao,,rank,24


In [18]:
from openclean.operator.transform.mapping import mapping

# Derive a table of (current name -> looked-up name) pairs for df1's
# 'country_name' column, using the same code-based lookup as the update above
# so the pairs can be inspected before they are applied.
official_name_lookup = Lookup('country_id', code_to_name)
country_to_country = mapping(df1, 'country_name', official_name_lookup)

In [19]:
# Display the derived source/target name pairs for review.
country_to_country

Unnamed: 0,source,target
0,Korea (Rep.),Korea (Republic of)
1,United Kingdom,United Kingdom of Great Britain and Northern I...
2,"Hong Kong, China",Hong Kong
3,United States,United States of America
4,"Macao, China",Macao
5,TFYR Macedonia,Macedonia (the former Yugoslav Republic of)
6,Antigua & Barbuda,Antigua and Barbuda
7,St. Kitts and Nevis,Saint Kitts and Nevis
8,Moldova,Moldova (Republic of)
9,St. Vincent and the Grenadines,Saint Vincent and the Grenadines


In [20]:
# Remove the derived pairs with index labels 1 and 11 so they are not applied.
# Label 1 is the 'United Kingdom' pair in the displayed table; label 11 is not
# visible in the recorded display -- TODO confirm which pair it is.
# This cell reassigns country_to_country from itself, so errors='ignore' is
# used to keep it idempotent: re-running after the rows are already gone is a
# no-op instead of a KeyError.
# NOTE(review): assumes country_to_country is a pandas DataFrame, as its
# displayed form suggests.
country_to_country = country_to_country.drop([1, 11], errors='ignore')

In [21]:
# Display the name pairs that remain after dropping the rejected rows.
country_to_country

Unnamed: 0,source,target
0,Korea (Rep.),Korea (Republic of)
2,"Hong Kong, China",Hong Kong
3,United States,United States of America
4,"Macao, China",Macao
5,TFYR Macedonia,Macedonia (the former Yugoslav Republic of)
6,Antigua & Barbuda,Antigua and Barbuda
7,St. Kitts and Nevis,Saint Kitts and Nevis
8,Moldova,Moldova (Republic of)
9,St. Vincent and the Grenadines,Saint Vincent and the Grenadines
10,Trinidad & Tobago,Trinidad and Tobago


In [22]:
# Apply the reviewed name pairs to df1's 'country_name' column.
# NOTE(review): this rebinds df2, replacing the frame produced by the earlier
# code-based update. Passing the source/target table directly to Lookup
# presumably works as a mapping -- confirm against the openclean API.
df2 = update(df1, 'country_name', Lookup('country_name', country_to_country))

In [23]:
# Preview the rows after applying the reviewed name mapping.
df2.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
0,2015,KOR,Korea (Republic of),,rank,1
3,2015,GBR,United Kingdom,,rank,4
8,2015,HKG,Hong Kong,,rank,9
14,2015,USA,United States of America,,rank,15
23,2015,MAC,Macao,,rank,24
