In [1]:
from openclean.data import load

# Read the dataset file '3bxy-wfk9.tsv.gz' (path is relative to the notebook's
# working directory) into `df`, which every later cell uses as its input.
df = load('3bxy-wfk9.tsv.gz')

In [2]:
# Preview the first rows to see which columns the loaded data has.
df.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
0,2015,KOR,Korea (Rep.),,rank,1
1,2015,DNK,Denmark,,rank,2
2,2015,ISL,Iceland,,rank,3
3,2015,GBR,United Kingdom,,rank,4
4,2015,SWE,Sweden,,rank,5


In [4]:
from openclean.profiling.classifier.base import classify
from openclean.profiling.classifier.column import DEFAULT_CLASSIFIER

# Classify the values of the 'value_type' column with openclean's default
# classifier and keep the per-type counts in `types`.
# NOTE(review): distinct=False presumably counts every row rather than each
# unique value once -- confirm against the openclean documentation.
types = classify(df, 'value_type', DEFAULT_CLASSIFIER, distinct=False)

In [5]:
# Display the type counts for 'value_type' (recorded run: every value is 'text').
types

Feature({'text': 2664})

In [6]:
# Run the same classification on the 'value' column, overwriting `types`.
types = classify(df, 'value', DEFAULT_CLASSIFIER, distinct=False)

In [7]:
# Display the type counts for 'value' (recorded run: a mix of 'int' and 'float').
types

Feature({'int': 1341, 'float': 1323})

In [8]:
# Most frequent type label together with its count, as a one-element list of
# (label, count) tuples.
types.most_common(1)

[('int', 1341)]

In [9]:
# Classify 'value' again, this time with distinct=True, and show the most
# frequent label. In the recorded run the winner changes from 'int' to 'float'.
# NOTE(review): presumably because each distinct value is now counted only
# once -- confirm against the openclean documentation.
types = classify(df, 'value', DEFAULT_CLASSIFIER, distinct=True)
types.most_common(1)

[('float', 754)]

In [10]:
from openclean.function.predicate.datatype import IsInt
from openclean.function.predicate.logic import Not
# NOTE: this import rebinds the name `filter`, so Python's built-in filter()
# is shadowed in the notebook namespace once this cell has run.
from openclean.operator.transform.filter import filter

# Keep only the rows of `df` whose 'value' entry does not satisfy IsInt.
# `df` itself is left bound to the unfiltered data; the result goes to `df1`.
df1 = filter(df, Not(IsInt('value')))

In [11]:
# (rows, columns) of the filtered data. In the recorded run the row count
# equals the number of 'float' values that classify() reported for 'value'.
df1.shape

(1323, 6)

In [12]:
# Preview the filtered rows; the recorded output shows decimal 'value' entries.
df1.head()

Unnamed: 0,year,country_id,country_name,sub_index,value_type,value
167,2015,KOR,Korea (Rep.),,value,8.93
168,2015,DNK,Denmark,,value,8.88
169,2015,ISL,Iceland,,value,8.86
170,2015,GBR,United Kingdom,,value,8.75
171,2015,SWE,Sweden,,value,8.67


In [13]:
from openclean.profiling.classifier.base import classprofile

# Build a class profile for the 'value' column. Later cells read two entries
# from the result: cp['types'] (type statistics) and cp['values'].
cp = classprofile(df, 'value', DEFAULT_CLASSIFIER)
# NOTE(review): the recorded output reports int total=1175 and float total=1
# (distinct=1 each), which does not match the 1341/1323 split that classify()
# reported for the same column in the earlier cells -- confirm whether this is
# expected before relying on these statistics.
print(cp['types'])

FeatureDict(<class 'collections.Counter'>, {'int': {'distinct': 1, 'total': 1175}, 'float': {'distinct': 1, 'total': 1}})


In [14]:
from openclean.profiling.classifier.typepicker import MajorityTypePicker

# Pick the column type(s) from the profiled type statistics using the picker's
# default settings. The result is a list of type labels.
coltype = MajorityTypePicker().select(cp['types'])

In [15]:
# Show the type list chosen by the default picker.
coltype

['int']

In [16]:
# Repeat the selection with statistics='total', overwriting `coltype`.
# NOTE(review): presumably this ranks types by the 'total' counts found in
# cp['types'] -- confirm against the MajorityTypePicker documentation.
coltype = MajorityTypePicker(statistics='total').select(cp['types'])

In [17]:
# Show the type list chosen with statistics='total'.
coltype

['int']

In [18]:
# Selection with an explicit threshold of 0.4 and pick_one=False.
# NOTE(review): presumably pick_one=False allows every type that reaches the
# threshold to be returned instead of a single winner -- confirm.
coltype = MajorityTypePicker(statistics='total', threshold=0.4, pick_one=False).select(cp['types'])

In [19]:
# Show the type list chosen with threshold=0.4.
coltype

['int']

In [20]:
# Same selection with a stricter threshold of 0.75 (pick_one still False).
coltype = MajorityTypePicker(statistics='total', threshold=0.75, pick_one=False).select(cp['types'])

In [21]:
# Show the type list chosen with threshold=0.75.
coltype

['int']

In [22]:
from openclean.function.value.replace import lookup
from openclean.profiling.anomalies.datatype import datatype_outliers

# Reassign `coltype` with the statistics='total' picker; the preceding cells
# had overwritten it with the threshold variants.
coltype = MajorityTypePicker(statistics='total').select(cp['types'])
# Collect the data-type outliers of the 'value' column into `floats`.
# NOTE(review): presumably these are the values whose label, looked up in
# cp['values'], is not contained in `coltype`; the recorded output lists only
# decimal strings -- confirm against the datatype_outliers documentation.
floats = datatype_outliers(df, 'value', lookup(cp['values']), coltype)

In [23]:
# Display the collected outlier values.
# NOTE(review): this prints the entire list, which makes for a very long
# output; consider showing only a slice plus the length.
floats

['1.23',
 '1.21',
 '5.31',
 '6.86',
 '5.28',
 '6.404333',
 '9.09',
 '1.84',
 '1.68',
 '7.83',
 '2.4',
 '5.35',
 '3.45',
 '3.21',
 '6.05',
 '4.63',
 '7.96',
 '0.79',
 '2.29',
 '2.02',
 '6.59',
 '1.6',
 '3.73',
 '1.42',
 '4.98',
 '7.82',
 '4.02',
 '7.98',
 '0.41',
 '0.84',
 '2.81',
 '9.51',
 '9.91',
 '2.15',
 '9.28',
 '5.54',
 '2.86',
 '9.75',
 '3.44',
 '6.24',
 '8.5',
 '4.15',
 '6.964667',
 '1.38',
 '5.32',
 '3.18',
 '9.13',
 '3.76',
 '9.32',
 '3.86',
 '7.42',
 '7.49',
 '3.39',
 '6.82',
 '7.19',
 '8.34',
 '1.76',
 '3.41',
 '8.52',
 '8.68',
 '1.75',
 '4.97',
 '4.85',
 '0.31',
 '4.11',
 '6.38',
 '6.65',
 '0.24',
 '3.75',
 '4.73',
 '3.92',
 '7.052',
 '6.32',
 '2.84',
 '2.07',
 '2.37',
 '2.23',
 '2.45',
 '2.31',
 '4.82',
 '6.879928',
 '8.4',
 '1.26',
 '0.12',
 '4.51',
 '8.07',
 '8.18',
 '0.13',
 '3.94',
 '1.02',
 '7.13',
 '3.66',
 '3.93',
 '9.24',
 '7.36',
 '7.45',
 '8.97',
 '2.73',
 '1.7',
 '8.17',
 '8.83',
 '9.35',
 '0.55',
 '8.66',
 '3.79',
 '0.76',
 '6.03',
 '3.88',
 '3.19',
 '0.75',
 '

In [29]:
from openclean.profiling.classifier.column import datatypes

# Profile the data types of the columns of `df`, passing a default
# MajorityTypePicker. In the recorded output each column entry carries
# 'name', 'index', 'types' and 'domain'.
# Relies on MajorityTypePicker having been imported by an earlier cell.
coltypes = datatypes(df, picker=MajorityTypePicker())

In [30]:
# Display the per-column type profile computed with the picker.
coltypes

[{'name': 'year',
  'index': 0,
  'types': FeatureDict(collections.Counter,
              {'int': {'distinct': 1, 'total': 2330}}),
  'domain': ['int']},
 {'name': 'country_id',
  'index': 1,
  'types': FeatureDict(collections.Counter,
              {'text': {'distinct': 1, 'total': 2498}}),
  'domain': ['text']},
 {'name': 'country_name',
  'index': 2,
  'types': FeatureDict(collections.Counter,
              {'text': {'distinct': 1, 'total': 2498}}),
  'domain': ['text']},
 {'name': 'sub_index',
  'index': 3,
  'types': FeatureDict(collections.Counter,
              {'text': {'distinct': 1, 'total': 666}}),
  'domain': ['text']},
 {'name': 'value_type',
  'index': 4,
  'types': FeatureDict(collections.Counter,
              {'text': {'distinct': 1, 'total': 2497}}),
  'domain': ['text']},
 {'name': 'value',
  'index': 5,
  'types': FeatureDict(collections.Counter,
              {'int': {'distinct': 1, 'total': 1175},
               'float': {'distinct': 1, 'total': 1}}),
  'domain': 

In [27]:
# Profile only the columns at positions 2 and 1 and include the per-value
# labels; the recorded output starts with column index 2 ('country_name') and
# contains a 'values' mapping from each value to its type label.
# NOTE(review): `coltypes` is also assigned by the datatypes(df, picker=...)
# cell, and the recorded execution counts of the two cells are out of file
# order, so the value `coltypes` ends up with depends on the run order --
# verify with Restart Kernel -> Run All.
coltypes = datatypes(df, columns=[2, 1], include_values=True)

In [28]:
# Display the profile for the two selected columns, including the value labels.
coltypes

[{'name': 'country_name',
  'index': 2,
  'types': FeatureDict(collections.Counter,
              {'text': {'distinct': 1, 'total': 2498}}),
  'values': {'Korea (Rep.)': 'text',
   'Denmark': 'text',
   'Iceland': 'text',
   'United Kingdom': 'text',
   'Sweden': 'text',
   'Luxembourg': 'text',
   'Switzerland': 'text',
   'Netherlands': 'text',
   'Hong Kong, China': 'text',
   'Norway': 'text',
   'Japan': 'text',
   'Finland': 'text',
   'Australia': 'text',
   'Germany': 'text',
   'United States': 'text',
   'New Zealand': 'text',
   'France': 'text',
   'Monaco': 'text',
   'Singapore': 'text',
   'Estonia': 'text',
   'Belgium': 'text',
   'Ireland': 'text',
   'Canada': 'text',
   'Macao, China': 'text',
   'Austria': 'text',
   'Spain': 'text',
   'Bahrain': 'text',
   'Andorra': 'text',
   'Barbados': 'text',
   'Malta': 'text',
   'Qatar': 'text',
   'United Arab Emirates': 'text',
   'Slovenia': 'text',
   'Czech Republic': 'text',
   'Israel': 'text',
   'Belarus': 'text'