# OpenML Cheat Sheet (Python)

In [1]:
# General imports
import os
import pprint

import arff
import numpy
import pandas
import sklearn
import sklearn.linear_model
import sklearn.neighbors
import sklearn.pipeline
import sklearn.preprocessing

import openml.exceptions
from openml import datasets, tasks, runs, flows, config

## config

Find your API key (required for uploads):
* `www.openml.org` > Your profile > API Authentication

Main OpenML servers:
* Public: `https://www.openml.org/api/v1` (default)
* Test: `https://test.openml.org/api/v1` 

Set server, API key and cache directory (default: `~/.openml/cache`)

Or, create a config file called `~/.openml/config`
and add these lines:

In [None]:
# NOTE: this is not Python. These lines belong in the plain-text file
# ~/.openml/config; they are commented out so that "Run All" does not
# stop on this cell with a SyntaxError.
# Replace YOUR_API_KEY with the key from your OpenML profile page.
#
# server=https://www.openml.org/api/v1
# apikey=YOUR_API_KEY
# cachedir=/homedir/.openml/cache

## datasets  
**`list_datasets(offset=None, size=None, tag=None)`**
* returns ID -> dataset dict mapping 
* `offset` and `size` for paging results
* `tag` to filter datasets (e.g. 'uci')

In [2]:
# Request one page (size=100) of dataset descriptions and preview a few columns.
dlist = datasets.list_datasets(size=100)
dataset_table = pandas.DataFrame.from_dict(dlist, orient='index')
dataset_table[['name','NumberOfInstances', 'NumberOfFeatures']].head(3)

Unnamed: 0,name,NumberOfInstances,NumberOfFeatures
2,anneal,898,39
3,kr-vs-kp,3196,37
4,labor,57,17


**`get_dataset(dataset_id)`**
* returns **OpenMLDataset** object
* automatically downloads and caches the data itself

In [3]:
# Fetch dataset 1471 and print its name, default target and a slice of its description.
odata = datasets.get_dataset(1471)
summary_lines = [
    odata.name,
    "Target: " + odata.default_target_attribute,
    odata.description[260:308],
]
print(*summary_lines, sep='\n')

eeg-eye-state
Target: Class
All data is from one continuous EEG measurement 


**`OpenMLDataset`**  

**`.features`**: list of features and their properties  
**`.qualities`**: list of all dataset properties  

**`.get_data`**(target,return_attribute_names=False,return_categorical_indicator=False):  
  returns numpy arrays (or sparse matrices) with features and targets, optionally with attribute names and which are categorical  
**`.retrieve_class_labels(target_name='class')`**: return all class labels for the given target attribute



In [4]:
# Split the dataset into features (X) and target (y), also asking for the attribute names.
target_name = odata.default_target_attribute
X, y, attribute_names = odata.get_data(target=target_name,
                                       return_attribute_names=True)
pandas.DataFrame(X, columns=attribute_names).head(2)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14
0,4329.22998,4009.22998,4289.22998,4148.209961,4350.259766,4586.149902,4096.919922,4641.029785,4222.049805,4238.459961,4211.279785,4280.509766,4635.899902,4393.850098
1,4324.620117,4004.620117,4293.850098,4148.720215,4342.049805,4586.669922,4097.439941,4638.970215,4210.77002,4226.669922,4207.689941,4279.490234,4632.819824,4384.100098


In [6]:
# Fit a scikit-learn model on this data.
# NOTE: LinearRegression is a regressor, not a classifier.
# (sklearn.linear_model is imported in the imports cell at the top.)
sklearn.linear_model.LinearRegression().fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

**Upload new datasets**
* Create a new OpenML dataset with all relevant information
* Call **`.publish()`** to upload
* Note: use test server for testing

## tasks  
**`list_tasks(task_type_id=None, offset=None, size=None, tag=None)`**
* returns ID -> task dict mapping (task IDs do not match dataset IDs)
* `offset` and `size` for paging results, `tag` to filter tasks by tag
* `task_type_id`: 1=Classification, 2=Regression,...

In [7]:
# Request one page (size=100) of task descriptions and preview a few columns.
tlist = tasks.list_tasks(size=100)
task_table = pandas.DataFrame.from_dict(tlist, orient='index')
task_table[['name','task_type','estimation_procedure']].head(3)

Unnamed: 0,name,task_type,estimation_procedure
2,anneal,Supervised Classification,10-fold Crossvalidation
3,kr-vs-kp,Supervised Classification,10-fold Crossvalidation
4,labor,Supervised Classification,10-fold Crossvalidation


**`OpenMLTask`**  
**`.get_dataset()`**: downloads associated dataset   
**`.download_split()`**: downloads train/test splits  
    
**Create new tasks**  
Under development

**`get_task(task_id)`**
* returns **OpenMLTask** object
    *  includes estimation procedure, target name, cost matrix,...
* automatically caches the task description

In [8]:
# Fetch task 14951 and pretty-print its estimation procedure.
task = tasks.get_task(14951)
estimation_procedure = task.estimation_procedure
pprint.pprint(estimation_procedure)

{'data_splits_url': 'https://www.openml.org/api_splits/get/14951/Task_14951_splits.arff',
 'parameters': {'number_folds': '10',
                'number_repeats': '1',
                'percentage': '',
                'stratified_sampling': 'true'},
 'type': 'crossvalidation'}


## flows  
**`list_flows(offset=None, size=None, tag=None)`**
* returns ID -> flow dict mapping
* `offset` and `size` for paging results, `tag` to filter flows by tag

In [9]:
# Request one page (size=200) of flow descriptions and show two rows from the middle.
flist = flows.list_flows(size=200)
flow_table = pandas.DataFrame.from_dict(flist, orient='index')
flow_table[['name','version','external_version']].iloc[100:102]

Unnamed: 0,name,version,external_version
101,moa.WEKAClassifier_REPTree,1,Moa_2014.03_1.0
102,weka.REPTree,2,Weka_3.7.5_9378


**`sklearn_to_flow(sklearn_estimator)`**

* converts a scikit-learn estimator or pipeline to an OpenML Flow

**`publish()`**

* Uploads the flow to the server. Returns ID

In [10]:
# Convert a single fitted estimator into an OpenML flow.
lr = sklearn.linear_model.LinearRegression().fit(X, y)
flow = flows.sklearn_to_flow(lr)

# A whole pipeline can be converted as well.
# Step names match what each step actually is (a Normalizer and a regressor).
pipe = sklearn.pipeline.Pipeline(steps=[
    ('Normalizer', sklearn.preprocessing.Normalizer()),
    ('Regressor', sklearn.linear_model.LinearRegression())])
flow2 = flows.sklearn_to_flow(pipe)

# Uncomment to upload the flow (use the test server for testing):
# flow.publish()

## runs  
**`list_runs(offset=None, size=None, tag=None, id=None, task=None, flow=None, uploader=None, display_errors=False)`**
* returns ID -> run dict mapping
* `offset` and `size` for paging results, `tag` to filter runs by tag
* `id`: list of run IDs to filter on, e.g. [1,2,3]
* `task`: list of task IDs to filter on, e.g. [1,2,3]
* `flow`: list of flow IDs to filter on, e.g. [1,2,3]
* `uploader`: list of uploader IDs to filter on, e.g. [1,2,3]
* `display_errors`: whether to return failed runs

In [11]:
# List up to 100 runs on task 14951 and show two of them.
rlist = runs.list_runs(task=[14951], size=100)
runs_on_task = pandas.DataFrame.from_dict(rlist, orient='index')
runs_on_task.iloc[1:3]

Unnamed: 0,run_id,task_id,setup_id,flow_id,uploader
544514,544514,14951,5540,3404,2
595116,595116,14951,6436,4074,2


In [12]:
# List the runs with IDs 1, 2 and 3 and show two of them.
rlist = runs.list_runs(id=[1,2,3])
runs_by_id = pandas.DataFrame.from_dict(rlist, orient='index')
runs_by_id.iloc[1:3]

Unnamed: 0,run_id,task_id,setup_id,flow_id,uploader
2,2,72,16,75,1
3,3,95,8,63,1


**`get_run(run_id)`**
* returns **OpenMLRun** object
    *  includes the exact task, exact flow, and all evaluations
* automatically caches the run description

**OpenMLRun**  
**.uploader_name**: full name of the run author  
**.flow_name**: full name of the flow  
**.parameter_settings**: hyperparameters of the flow  
**.evaluations**: key-value pairs of metric and score  
**.fold_evaluations**: dict of per-fold evaluations  

In [13]:
# For up to 100 runs on task 14951, download the full run and record the
# flow name together with its area under the ROC curve.
rlist = runs.list_runs(task=[14951], size=100)
scores = []
# Iterate over the run IDs (the dict keys). The loop variable is deliberately
# not called `id`, which would shadow the builtin for the rest of the session.
for run_id in rlist:
    run = runs.get_run(run_id)
    scores.append({"flow": run.flow_name,
                   "score": run.evaluations['area_under_roc_curve']})
pandas.DataFrame(scores)[5:8]

Unnamed: 0,flow,score
5,sklearn.tree.tree.ExtraTreeClassifier(11),0.791964
6,sklearn.tree.tree.ExtraTreeClassifier(11),0.789517
7,sklearn.tree.tree.ExtraTreeClassifier(11),0.789965


**`run_flow_on_task(task, flow)`**
* Runs the flow on the task
* Trains and tests the flow on all train/test splits defined by the task
* Returns an **OpenMLRun** object with all information

**`publish()`**
* Publishes the run on OpenML


In [14]:
# Run a logistic-regression flow on task 14951.
task = tasks.get_task(14951)
flow = flows.sklearn_to_flow(sklearn.linear_model.LogisticRegression())
run = runs.run_flow_on_task(task, flow)

# Accuracy per fold; key 0 is presumably the first repeat (TODO confirm).
fold_accuracy = run.fold_evaluations['predictive_accuracy']
fold_accuracy[0]
# run.publish()

{0: 0.63551401869158874,
 1: 0.65220293724966627,
 2: 0.63484646194926564,
 3: 0.63684913217623496,
 4: 0.6435246995994659,
 5: 0.64552736982643522,
 6: 0.64018691588785048,
 7: 0.63551401869158874,
 8: 0.63417890520694264,
 9: 0.64686248331108143}

---

## Example use case
Algorithm benchmarking:
* Download a number of tasks
* Choose any model or pipeline (flow)
* Run the flow on each task, evaluate and upload

In [206]:
# Benchmark a kNN classifier on several tasks and upload each run.
# Publishing requires an API key (see the config section).
for task_id in [14951,10103,9945]:
    task = tasks.get_task(task_id)
    # The dataset is fetched only to print its name below.
    data = datasets.get_dataset(task.dataset_id)
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
    flow = flows.sklearn_to_flow(clf)

    try:
        run = runs.run_flow_on_task(task, flow)
        myrun = run.publish()
        print("kNN on %s: http://www.openml.org/r/%d" % (data.name, myrun.run_id))
    # Report OpenML errors and continue with the next task.
    # The original referred to an undefined alias `oml` here.
    except openml.exceptions.PyOpenMLError as err:
        print("OpenML: {0}".format(err))

kNN on eeg-eye-state: http://www.openml.org/r/7946204
kNN on volcanoes-a1: http://www.openml.org/r/7946207
kNN on walking-activity: http://www.openml.org/r/7946209
