In [1]:
import logging
logging.root.handlers.clear()
logging.basicConfig(level=logging.INFO)

In [2]:
import d3m.index
import d3m.primitives

In [3]:
from frozendict import frozendict

def _conv(d):
    if isinstance(d, frozendict):
        return {k: _conv(v) for k, v in d.items()}
    elif isinstance(d, (list, tuple)):
        return [_conv(e) for e in d]
    else:
        return d

def print_dict(d):
    """Print function that hides 'FrozenDict' classes from the output.
    
    It just shows the FrozenDict like normal dicts.
    """
    print(_conv(d))

In [5]:
# List mounted D3M datasets
!ls /d3m/data/seed_datasets_current

1491_one_hundred_plants_margin
1491_one_hundred_plants_margin_clust
1567_poker_hand
185_baseball
196_autoMpg
22_handgeometry
26_radon_seed
27_wordLevels
299_libras_move
30_personae
313_spectrometer
31_urbansound
32_wikiqa
38_sick
4550_MiceProtein
49_facebook
534_cps_85_wages
56_sunspots
57_hypothyroid
59_umls
60_jester
66_chlorineConcentration
6_70_com_amazon
6_86_com_DBLP
DS01876
LL0_1100_popularkids
LL0_186_braziltourism
LL0_207_autoPrice
LL0_acled
LL0_acled_reduced
LL1_336_MS_Geolife_transport_mode_prediction
LL1_3476_HMDB_actio_recognition
LL1_726_TIDY_GPS_carpool_bus_service_rating_prediction
LL1_736_stock_market
LL1_EDGELIST_net_nomination_seed
LL1_net_nomination_seed
LL1_penn_fudan_pedestrian
uu1_datasmash
uu2_gp_hyperparameter_estimation
uu2_gp_hyperparameter_estimation_v2
uu3_world_development_indicators
uu4_SPECT


In [6]:
from d3m.container import Dataset
from d3m.metadata.base import ALL_ELEMENTS

In [31]:
# Load a dataset
dataset = Dataset.load('file:///d3m/data/seed_datasets_current/185_baseball/185_baseball_dataset/datasetDoc.json')

In [32]:
# Run DatasetToDataFrame primitive
from d3m.primitives.datasets import DatasetToDataFrame
hyperparams_class = DatasetToDataFrame.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
hyperparams = hyperparams_class.defaults()
conv = DatasetToDataFrame(hyperparams=hyperparams)
res = conv.produce(inputs=dataset)
assert res.has_finished
dataframe = res.value
dataframe.head()

Unnamed: 0,d3mIndex,Player,Number_seasons,Games_played,At_bats,Runs,Hits,Doubles,Triples,Home_runs,RBIs,Walks,Strikeouts,Batting_average,On_base_pct,Slugging_pct,Fielding_ave,Position,Hall_of_Fame
0,0,HANK_AARON,23,3298,12364,2174,3771,624,98,755,2297,1402,1383.0,0.305,0.377,0.555,0.98,Outfield,1
1,1,JERRY_ADAIR,13,1165,4019,378,1022,163,19,57,366,208,499.0,0.254,0.294,0.347,0.985,Second_base,0
2,2,SPARKY_ADAMS,13,1424,5557,844,1588,249,48,9,394,453,223.0,0.286,0.343,0.353,0.974,Second_base,0
3,3,BOBBY_ADAMS,14,1281,4019,591,1082,188,49,37,303,414,447.0,0.269,0.34,0.368,0.955,Third_base,0
4,4,JOE_ADCOCK,17,1959,6606,823,1832,295,35,336,1122,594,1059.0,0.277,0.339,0.485,0.994,First_base,0


In [33]:
# Show metadata for all the columns (before running Profiler)
features = dataframe.metadata.query([ALL_ELEMENTS])
num_columns = features['dimension']['length']
for i in range(num_columns):
    print_dict(dict(dataframe.metadata.query([ALL_ELEMENTS, i])))
    print()

{'name': 'd3mIndex', 'structural_type': <class 'str'>, 'semantic_types': ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey')}

{'name': 'Player', 'structural_type': <class 'str'>, 'semantic_types': ('https://metadata.datadrivendiscovery.org/types/CategoricalData', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

{'name': 'Number_seasons', 'structural_type': <class 'str'>, 'semantic_types': ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

{'name': 'Games_played', 'structural_type': <class 'str'>, 'semantic_types': ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

{'name': 'At_bats', 'structural_type': <class 'str'>, 'semantic_types': ('http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/Attribute')}

{'name': 'Runs', 'structural_type': <class 'str'>, 'semantic_types': ('http://schema.org/Integer', 'https://metadata.datadrivendis

In [34]:
# Run ISI's profiler
# Warning: it mutates the metadata in-place, don't run above cell out-of-order
from d3m.primitives.dsbox import Profiler
hyperparams_class = Profiler.metadata.query()['primitive_code']['class_type_arguments']['Hyperparams']
hyperparams = hyperparams_class.defaults()
profiler = Profiler(hyperparams=hyperparams)
res = profiler.produce(inputs=dataframe)
assert res.has_finished
profiled = res.value

In [35]:
# Print metadata from ISI's profiler
features = profiled.metadata.query([ALL_ELEMENTS])
num_columns = features['dimension']['length']
for i in range(num_columns):
    print_dict(profiled.metadata.query([ALL_ELEMENTS, i]))
    print()

{'name': 'd3mIndex', 'structural_type': <class 'int'>, 'semantic_types': ['http://schema.org/Integer', 'https://metadata.datadrivendiscovery.org/types/PrimaryKey'], 'ratio_of_numeric_values': 1.0, 'number_of_outlier_numeric_values': 0, 'most_common_tokens': [{'name': '0', 'count': 1}, {'name': '1', 'count': 1}, {'name': '10', 'count': 1}, {'name': '100', 'count': 1}, {'name': '1000', 'count': 1}, {'name': '1001', 'count': 1}, {'name': '1002', 'count': 1}, {'name': '1003', 'count': 1}, {'name': '1004', 'count': 1}, {'name': '1005', 'count': 1}], 'number_of_tokens_containing_numeric_char': 1340, 'ratio_of_tokens_containing_numeric_char': 1.0, 'number_of_values_containing_numeric_char': 1340, 'ratio_of_values_containing_numeric_char': 1.0}

{'name': 'Player', 'structural_type': <class 'str'>, 'semantic_types': ['https://metadata.datadrivendiscovery.org/types/Attribute', 'http://schema.org/Text'], 'most_common_tokens': [{'name': 'ELMER_SMITH', 'count': 2}, {'name': 'AARON_WARD', 'count': 1

In [37]:
keys = set()
for i in range(num_columns):
    for k in profiled.metadata.query([ALL_ELEMENTS, i]):
        keys.add(k)
keys

{'most_common_tokens',
 'name',
 'number_of_outlier_numeric_values',
 'number_of_tokens_containing_numeric_char',
 'number_of_values_containing_numeric_char',
 'ratio_of_numeric_values',
 'ratio_of_tokens_containing_numeric_char',
 'ratio_of_values_containing_numeric_char',
 'semantic_types',
 'structural_type'}