In [1]:
import datamart_profiler
import io
import pandas

In [2]:
# Small in-memory CSV used as the example dataset: a header row followed by
# five records. Adjacent string literals are concatenated into one CSV text,
# one literal per row, each terminated by a newline.
csvfile = io.StringIO(
    'pickup latitude,pickup longitude,brand,description,tip\n'
    '40.734746,-74.000077,mazda,black suv with tinted windows,0.0\n'
    '40.726640,-73.993186,hyundai,grey ish convertible pickup,13.333\n'
    '40.735108,-73.996996,mazda,dark grey compact suv,8.5\n'
    '40.729115,-74.001726,hyundai,black camry with lyft sign,5.667\n'
    '40.728896,-73.998542,kia,bright green box car,10.0\n'
)

In [3]:
# Profile the example dataset; the result is a dict of metadata (inspected in the next cells).
# Input can be a file object, a path, or a pandas.DataFrame
metadata = datamart_profiler.process_dataset(csvfile)

In [4]:
# Top-level keys of the metadata dict (iterating a dict yields its keys)
list(metadata)

['size',
 'nb_rows',
 'nb_profiled_rows',
 'columns',
 'spatial_coverage',
 'attribute_keywords',
 'types']

In [5]:
# Overall dataset types: a list of labels describing the dataset as a whole
# (for this example: 'numerical' and 'spatial')
metadata['types']

['numerical', 'spatial']

In [6]:
# Column information: a list with one dict per column.
# 'structural_type' is the programmatic type to use to store the data:
#   integer, float, geo point, geo polygon, string
# 'semantic_types' provides information on the meaning of the data:
#   boolean, categorical, natural language, administrative area, identifier, date time, latitude, longitude
# Both are reported as URIs (e.g. 'http://schema.org/Float').
# Other information:
#   - ratio of missing/unclean values
#   - statistical information (mean/stddev) for numerical columns
#   - number of distinct values for categorical columns
#   - value ranges ('coverage') for numerical columns such as 'tip'
metadata['columns']

[{'name': 'pickup latitude',
  'structural_type': 'http://schema.org/Float',
  'semantic_types': ['http://schema.org/latitude'],
  'unclean_values_ratio': 0.0,
  'mean': 40.730901,
  'stddev': 0.003401435461682556},
 {'name': 'pickup longitude',
  'structural_type': 'http://schema.org/Float',
  'semantic_types': ['http://schema.org/longitude'],
  'unclean_values_ratio': 0.0,
  'mean': -73.9981054,
  'stddev': 0.002919514863812805},
 {'name': 'brand',
  'structural_type': 'http://schema.org/Text',
  'semantic_types': [],
  'num_distinct_values': 3},
 {'name': 'description',
  'structural_type': 'http://schema.org/Text',
  'semantic_types': ['http://schema.org/Text']},
 {'name': 'tip',
  'structural_type': 'http://schema.org/Float',
  'semantic_types': [],
  'unclean_values_ratio': 0.0,
  'mean': 7.5,
  'stddev': 4.49185436095161,
  'coverage': [{'range': {'gte': 0.0, 'lte': 0.0}},
   {'range': {'gte': 5.667, 'lte': 8.5}},
   {'range': {'gte': 10.0, 'lte': 13.333}}]}]

In [7]:
# Spatial coverage information, reported as 'envelope' ranges for the detected columns.
# This can be extracted from a pair of latitude/longitude columns, point formats (WKT, ...),
# named administrative areas, or full addresses (the latter requires a Nominatim server)
metadata['spatial_coverage']

[{'type': 'latlong',
  'column_names': ['pickup latitude', 'pickup longitude'],
  'column_indexes': [0, 1],
  'ranges': [{'range': {'type': 'envelope',
     'coordinates': [[-74.001726, 40.729115], [-73.998542, 40.728896]]}},
   {'range': {'type': 'envelope',
     'coordinates': [[-74.000077, 40.735108], [-73.996996, 40.734746]]}},
   {'range': {'type': 'envelope',
     'coordinates': [[-73.993286, 40.72674000000001],
      [-73.99308599999999, 40.72654]]}}]}]

In [8]:
# Example using datamart-geo to identify named areas
# NOTE(review): download() presumably fetches the geo database, so this cell may
# need network access -- confirm against the datamart-geo documentation.
import datamart_geo
geo_data = datamart_geo.GeoData.download()

In [9]:
# Also showcases profiling a DataFrame object: one (place, favorite) tuple per row.
# NOTE(review): the third row pairs 'italy' with 'Hamburg'; each column gets its
# own coverage entry in the profiler output, but confirm the pairing is intentional.
df = pandas.DataFrame(
    [
        ('france', 'Brittany'),
        ('france', 'Normandie'),
        ('italy', 'Hamburg'),
        ('germany', 'Bavaria'),
    ],
    columns=['place', 'favorite'],
)
df

Unnamed: 0,place,favorite
0,france,Brittany
1,france,Normandie
2,italy,Hamburg
3,germany,Bavaria


In [10]:
# Profile the DataFrame, passing geo_data so that named areas can be identified
datamart_profiler.process_dataset(df, geo_data=geo_data)

{'nb_rows': 4,
 'nb_profiled_rows': 4,
 'columns': [{'name': 'place',
   'structural_type': 'http://schema.org/Text',
   'semantic_types': ['http://schema.org/AdministrativeArea',
    'http://schema.org/Enumeration'],
   'num_distinct_values': 3,
   'admin_area_level': 0},
  {'name': 'favorite',
   'structural_type': 'http://schema.org/Text',
   'semantic_types': ['http://schema.org/AdministrativeArea',
    'http://schema.org/Enumeration'],
   'num_distinct_values': 4,
   'admin_area_level': 1}],
 'spatial_coverage': [{'type': 'admin',
   'column_names': ['place'],
   'column_indexes': [0],
   'ranges': [{'range': {'type': 'envelope',
      'coordinates': [[-61.797841, 55.065334], [55.854503, -21.370782]]}}]},
  {'type': 'admin',
   'column_names': ['favorite'],
   'column_indexes': [1],
   'ranges': [{'range': {'type': 'envelope',
      'coordinates': [[8.97659, 50.56286], [13.81686, 47.27112]]}}]}],
 'attribute_keywords': ['place', 'favorite'],
 'types': ['spatial']}

In [11]:
# Note that in the output above an envelope is computed for each column, and the
# AdministrativeArea semantic type is applied (together with an 'admin_area_level')
# See also https://gitlab.com/ViDA-NYU/auctus/datamart-geo/-/blob/master/test.ipynb