# Basic Script

The following cell contains utility functions that use the metalearn package from BYU to extract metafeatures from datasets. Please install the requirements in the root folder of this repository, or at least the metalearn package, via:

```
   pip install git+https://github.com/byu-dml/metalearn.git#egg=metalearn

```


In [1]:
import os
import pprint
from json import loads, dumps
import pandas as pd
import numpy as np
from metalearn.metafeatures.simple_metafeatures import SimpleMetafeatures
from metalearn.metafeatures.statistical_metafeatures import StatisticalMetafeatures
from metalearn.metafeatures.information_theoretic_metafeatures import InformationTheoreticMetafeatures

def load_dataframe(training_set, data_format="dict"):
    """Load a CSV training set and split it into features, labels and attributes.

    The first CSV column is used as the row index; of the remaining columns,
    the last one is treated as the target and all others as features.
    Missing values are replaced by the empty string before splitting.

    Args:
        training_set: Path (or file-like object) of the CSV file to read.
        data_format: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(X, Y, attributes)``:
          * ``X`` -- 2-D array holding every column except the last one.
          * ``Y`` -- 1-D array holding the last column cast to ``str``.
          * ``attributes`` -- list of ``(column_name, type_string)`` pairs, one
            per feature column (the type is taken from the first row), followed
            by ``('class', labels)`` where ``labels`` is the sorted list of
            distinct values in ``Y``.

    Raises:
        IndexError: If the file contains no data rows (there is no first row
            to take the feature types from).
    """
    # DataFrame.from_csv was removed from pandas; read_csv(index_col=0) is its
    # documented replacement. from_csv also tried to parse the index as dates,
    # but the index is discarded below, so that step is not reproduced.
    df = pd.read_csv(training_set, index_col=0)
    df = df.fillna('')

    # to_numpy() replaces the removed DataFrame.as_matrix()/Series.as_matrix().
    X = df.to_numpy()[:, :-1]
    Y = df[df.columns[-1]].astype('str').to_numpy()

    first_row = X[0]
    attributes = [
        (df.columns[i], str(type(first_row[i])))
        for i in range(X.shape[1])
    ]
    # Sorted so the label order is reproducible between runs (a bare set is not).
    attributes.append(('class', sorted(set(Y))))
    return X, Y, attributes

def extract_metafeatures(X,Y,attributes):
    """Run the three metalearn extractors and merge their results.

    Each extractor's ``timed_compute(X, Y, attributes)`` is called in turn
    (simple, statistical, information theoretic). The reported compute time of
    each is printed, and the returned features are merged into one dict; when
    two extractors return the same key, the later one wins.

    Args:
        X: Feature matrix handed unchanged to every extractor.
        Y: Target values handed unchanged to every extractor.
        attributes: Attribute description handed unchanged to every extractor.

    Returns:
        dict mapping metafeature name to value.
    """
    stages = (
        ("simple metafeatures compute time: {}", SimpleMetafeatures),
        ("statistical metafeatures compute time: {}", StatisticalMetafeatures),
        ("information theoretic metafeatures compute time: {}", InformationTheoreticMetafeatures),
    )

    combined = {}
    for report_template, extractor_cls in stages:
        computed, elapsed = extractor_cls().timed_compute(X,Y,attributes)
        print(report_template.format(elapsed))
        combined.update(computed.items())

    return combined

def compute_metafeatures(dataset_path):
    """Load the CSV at ``dataset_path`` and return its metafeatures.

    Chains ``load_dataframe`` (called with ``data_format="dict"``) into
    ``extract_metafeatures`` and returns the latter's result unchanged.
    """
    features, labels, attribute_info = load_dataframe(dataset_path, "dict")
    return extract_metafeatures(features, labels, attribute_info)


# Downloading Seed Datasets

In [7]:
import subprocess
# Please change this path: wget is run from this directory, so the datasets
# are downloaded below it.
path = '/Users/raonilourenco/D3M/'
# Credentials must not live in the notebook. Export your own D3M access token
# as the D3M_TOKEN environment variable before starting Jupyter.
token = os.environ.get('D3M_TOKEN')
if not token:
    raise RuntimeError(
        'Set the D3M_TOKEN environment variable to a valid D3M access token.')
datasets_url = 'https://datadrivendiscovery.org/data/training_datasets/LL0/'
# An argument list with cwd= replaces the former "cd <path>; wget ..." shell
# string: no shell quoting is involved, and a missing directory raises an
# error instead of silently downloading into the current directory.
wget_command = [
    'wget', '-r', '-np', '-R', 'index.html*', '-nH',
    '--header', 'Authorization:%s' % token,
    datasets_url,
]

# Last expression of the cell: wget's exit status (0 means success).
subprocess.call(wget_command, cwd=path)

0

# Collecting training sets files

In [19]:
# File name that identifies a training table inside each dataset folder.
name = 'learningData.csv'
# Please change this path.
path = '/Users/raonilourenco/D3M/data/training_datasets/LL0'
# Walk the whole tree below `path` and keep the full path of every folder
# entry named `name`.
training_sets = [
    os.path.join(folder, name)
    for folder, _subfolders, filenames in os.walk(path)
    if name in filenames
]
#example
training_sets[0]

'/Users/raonilourenco/D3M/data/training_datasets/LL0/LL0_1530_volcanoes_a4/LL0_1530_volcanoes_a4_dataset/tables/learningData.csv'

In [None]:
# Compute the metafeatures of every collected training set, keyed by the path
# of its CSV file. Kept as an explicit loop so each result is stored as soon as
# its call returns; the recorded output shows single datasets taking minutes.
training_sets_metafeatures = {}
for training_set in training_sets:
    training_sets_metafeatures[training_set] = compute_metafeatures(training_set)

  # This is added back by InteractiveShellApp.init_path()


simple metafeatures compute time: 0.021451950073242188
statistical metafeatures compute time: 0.06834101676940918
information theoretic metafeatures compute time: 0.11081504821777344
simple metafeatures compute time: 0.05450105667114258


  c = cov(x, y, rowvar)


statistical metafeatures compute time: 4.788779973983765
information theoretic metafeatures compute time: 0.05870485305786133
simple metafeatures compute time: 157.3386549949646
statistical metafeatures compute time: 613.0797030925751
information theoretic metafeatures compute time: 1.2013719081878662
simple metafeatures compute time: 0.04349803924560547
statistical metafeatures compute time: 1.2210321426391602
information theoretic metafeatures compute time: 2.831623077392578
simple metafeatures compute time: 0.0029609203338623047
statistical metafeatures compute time: 0.0657663345336914
information theoretic metafeatures compute time: 0.08357405662536621
simple metafeatures compute time: 0.023691892623901367
statistical metafeatures compute time: 9.599669933319092
information theoretic metafeatures compute time: 0.9215030670166016
simple metafeatures compute time: 0.7691850662231445
statistical metafeatures compute time: 332.9446620941162
information theoretic metafeatures compute ti

In [18]:
# Number of learningData.csv paths collected in training_sets.
len(training_sets)


294