# Basic Script

The following cell contains utility functions that use the metalearn package from BYU to extract meatafeatures from datasets. Please install the requirements in the root folder of this repository or at least the metalearn pakcage via 

```
   pip install git+https://github.com/byu-dml/metalearn.git#egg=metalearn

```


In [22]:
import os
import pprint
from json import loads, dumps
import pandas as pd
import numpy as np
from metalearn.metafeatures.simple_metafeatures import SimpleMetafeatures
from metalearn.metafeatures.statistical_metafeatures import StatisticalMetafeatures
from metalearn.metafeatures.information_theoretic_metafeatures import InformationTheoreticMetafeatures

def load_dataframe(training_set, data_format="dict"):
    df = pd.read_csv(training_set)
    #df.fillna('', inplace=True)
    df.fillna(method='ffill', inplace=True)
    X =  df.values[:,0:-1]
    Y = df[df.keys()[-1]].astype('str').as_matrix()
    print('Y ', Y)
    Y = df.filter([df.keys()[-1]]).astype('str').values.flatten()
    print('Y ', Y)
    attributes = []
    for i in range(0,len(X[0])):
        attributes.append((df.keys()[i],str(type(X[0][i]))))
    attributes.append(('class', list(set(Y))))
    return X, Y, attributes

def extract_metafeatures(X,Y,attributes):
    metafeatures = {}
    features, time = SimpleMetafeatures().timed_compute(X,Y,attributes)
    print("simple metafeatures compute time: {}".format(time))
    total_time = time
    for key, value in features.items():
        metafeatures[key] = value

    features, time = StatisticalMetafeatures().timed_compute(X,Y,attributes)
    print("statistical metafeatures compute time: {}".format(time))
    total_time = total_time + time
    for key, value in features.items():
        metafeatures[key] = value

    features, time = InformationTheoreticMetafeatures().timed_compute(X,Y,attributes)
    print("information theoretic metafeatures compute time: {}".format(time))
    total_time = total_time + time
    for key, value in features.items():
        metafeatures[key] = value

    return metafeatures

def compute_metafeatures(dataset_path):
    X, Y, attributes = load_dataframe(dataset_path, "dict")
    metadata = extract_metafeatures(X, Y, attributes)
    return metadata


# Downloading Seed Datasets

In [3]:
import subprocess
# Please change this path.
path = '/Users/raonilourenco/D3M/'
cd_command = 'cd '+path'; '
#Please provide a valid token, this follwoing token is for rlourenco
token = 'IcrgDrcQpmOdHxO0BSaW8tzHP9HZvhyMiA8TAqssrdxrVWze2NlPvlccgu3XVQ2t'
datasets_url = 'https://datadrivendiscovery.org/data/training_datasets/LL0/'
wget_command = "wget -r -np -R \"index.html*\" -nH --header \'Authorization:%s\' %s"%(token,datasets_url)

subprocess.call(cd_command+wget_command, shell=True)

SyntaxError: invalid syntax (<ipython-input-3-d481260dfb70>, line 4)

# Collecting training sets files

In [6]:
name = 'learningData.csv'
# Please change this path.
path = '/Users/yamuna/D3M/data/LL0/LL0_22_mfeat_zernike'
training_sets = []
for root, dirs, files in os.walk(path):    
    if name in files:
        training_sets.append(os.path.join(root, name))
#example
training_sets[0]

'/Users/yamuna/D3M/data/LL0/LL0_22_mfeat_zernike/LL0_22_mfeat_zernike_dataset/tables/learningData.csv'

In [23]:
training_sets_metafeatures = {}
for training_set in training_sets:
    training_sets_metafeatures[training_set] = compute_metafeatures(training_set)
print(training_sets_metafeatures)

Y  ['1' '1' '1' ..., '10' '10' '10']
Y  ['1' '1' '1' ..., '10' '10' '10']
simple metafeatures compute time: 0.057614803314208984
statistical metafeatures compute time: 14.388926982879639
information theoretic metafeatures compute time: 2.0278217792510986
{'/Users/yamuna/D3M/data/LL0/LL0_22_mfeat_zernike/LL0_22_mfeat_zernike_dataset/tables/learningData.csv': {'number_of_classes': 10, 'number_of_instances': 2000, 'number_of_features': 48, 'dimensionality': 0.024, 'number_of_numeric_features': 48, 'percentage_of_numeric_features': 1.0, 'number_of_nominal_features': 0, 'percentage_of_nominal_features': 0.0, 'symbols_min': 10, 'symbols_max': 10, 'symbols_mean': 10.0, 'symbols_q1': 10.0, 'symbols_q2': 10.0, 'symbols_q3': 10.0, 'symbols_sd': 0.0, 'symbols_sum': 10, 'class_prob_min': 0.10000000000000001, 'class_prob_max': 0.10000000000000001, 'class_prob_mean': 0.10000000000000001, 'class_prob_q1': 0.10000000000000001, 'class_prob_q2': 0.10000000000000001, 'class_prob_q3': 0.10000000000000001,

In [None]:
len(training_sets)
