**Python Version Requirement:** Python 3.6

In [63]:
from d3m import container
import datamart
import datamart_nyu
import pandas as pd
from pathlib import Path
import subprocess
import os
import shutil
import json

In [39]:
# Root of the local Datamart checkout. get_materialize_info() looks for
# dataset files under <DATAMART_PATH>/volumes/datasets/<dataset id>/main.csv
# when a search result carries no 'direct_url'.
# NOTE(review): machine-specific absolute path -- adjust for your environment.
DATAMART_PATH = '/Users/fchirigati/projects/d3m/datamart'

In [40]:
def print_results(results):
    """Print a short, human-readable summary of each search result.

    For every result this prints its score and its metadata name. When the
    result carries an augment hint, the left and right join columns are
    printed as nested lists of ``(resource_id, column_index)`` tuples;
    otherwise the result id is printed. A separator line follows each result.

    Nothing is printed when ``results`` is empty or ``None``.
    """
    def _as_pairs(column_groups):
        # One inner list per group of columns, one tuple per column.
        return [
            [(col.resource_id, col.column_index) for col in group]
            for group in column_groups
        ]

    for entry in results or ():
        print(entry.score())
        print(entry.get_json_metadata()['metadata']['name'])
        hint = entry.get_augment_hint()
        if not hint:
            print(entry.id())
        else:
            print("Left Columns: %s" % str(_as_pairs(hint.left_columns)))
            print("Right Columns: %s" % str(_as_pairs(hint.right_columns)))
        print("-------------------")

In [41]:
def get_materialize_info(results):
    """Map each distinct dataset id in ``results`` to how it can be obtained.

    Returns a dict keyed by the ``'id'`` of each result's JSON metadata. Each
    value is a dict with the keys:

    * ``has_info`` -- True when either ``url`` or ``path`` was found;
    * ``url``      -- the ``'direct_url'`` from the materialize metadata, or None;
    * ``path``     -- path of ``main.csv`` under
      ``DATAMART_PATH/volumes/datasets/<id>/`` when no direct URL is given
      and that file exists, otherwise None.

    Only the first result seen for a given id is inspected. Returns ``None``
    when ``results`` is empty or ``None``.
    """
    if not results:
        return None

    info_by_id = {}
    for result in results:
        metadata = result.get_json_metadata()
        dataset_id = metadata['id']
        if dataset_id in info_by_id:
            continue

        entry = {'has_info': False, 'url': None, 'path': None}
        info_by_id[dataset_id] = entry

        materialize = metadata['metadata']['materialize']
        if 'direct_url' in materialize:
            entry['url'] = materialize['direct_url']
            entry['has_info'] = True
            continue

        # try to find them on volumes
        local_copy = os.path.join(DATAMART_PATH, 'volumes/datasets', dataset_id, 'main.csv')
        if os.path.exists(local_copy):
            entry['path'] = local_copy
            entry['has_info'] = True

    return info_by_id

In [78]:
def _strip_textual_columns(csv_path):
    """Rewrite the CSV at ``csv_path`` in place with fewer columns.

    The file is loaded with pandas, object-dtype (categorical / textual)
    columns and columns that are entirely NaN are dropped, and the result is
    written back to the same path without the index.
    """
    frame = pd.read_csv(csv_path)
    # excluding categorical / textual attributes
    frame = frame.select_dtypes(exclude=['object'])
    # excluding columns with all NaN values
    frame = frame.dropna(axis=1, how='all')
    frame.to_csv(csv_path, index=False)


def download_datasets_and_generate_training_records(results, supplied_data, supplied_data_path, target,
                                                    id_to_materialize, dir_):
    """Fetch candidate datasets, join them with the query data, and build records.

    The function temporarily changes the working directory to ``dir_`` (and
    always restores it). Inside ``dir_`` it:

    1. stores every dataset of ``id_to_materialize`` that has a ``url``
       (downloaded with ``wget``) or a ``path`` (copied) in a file named
       after the dataset id, then strips textual and all-NaN columns;
    2. for every result whose dataset ``has_info``, calls ``result.augment``
       against ``http://localhost:8002/``, drops ``d3mIndex`` from the
       ``'learningData'`` resource and writes it to
       ``joined-datasets/<result position>.csv``, stripped the same way.

    Parameters:
        results: search results; each must provide ``get_json_metadata()``,
            ``augment(...)`` and ``get_augment_hint()``.
        supplied_data: query dataset; ``supplied_data['learningData']`` must
            expose ``columns``.
        supplied_data_path: value stored as ``query_dataset`` in each record.
        target: value stored as ``target`` in each record.
        id_to_materialize: mapping as produced by ``get_materialize_info``.
        dir_: existing directory in which all files are written.

    Returns:
        A list of dicts with the keys ``query_dataset``, ``query_key``,
        ``target``, ``candidate_dataset``, ``candidate_key``,
        ``joined_dataset`` and ``imputation_strategy`` (always ``'mean'``),
        or ``None`` when ``results`` is empty.

    Raises:
        subprocess.CalledProcessError: when ``wget`` exits with a non-zero
            status.
    """
    if not results:
        return

    training_records = list()

    current_working_dir = os.getcwd()
    os.chdir(dir_)
    try:
        # downloading candidate datasets
        for id_, info in id_to_materialize.items():
            if info['url']:
                # Argument list instead of a shell string: URLs containing
                # '&', '?', ';' or spaces are passed to wget verbatim and can
                # neither be split nor interpreted by a shell.
                subprocess.check_call(['wget', '-O', id_, info['url']])
            elif info['path']:
                shutil.copyfile(info['path'], id_)
            else:
                print('%s has no materialization information for download.' % id_)
                continue
            _strip_textual_columns(id_)

        # exist_ok so that the function can be re-run on the same directory
        os.makedirs('joined-datasets', exist_ok=True)
        for i, result in enumerate(results):
            metadata = result.get_json_metadata()
            id_ = metadata['id']
            if not id_to_materialize[id_]['has_info']:
                continue
            join_ = result.augment(
                supplied_data=supplied_data,
                connection_url='http://localhost:8002/'
            )
            joined_path = 'joined-datasets/%d.csv' % i
            # excluding d3mIndex
            join_['learningData'].drop(['d3mIndex'], axis=1, inplace=True)
            join_['learningData'].to_csv(joined_path, index=False)

            # need to load and save again to exclude categorical / textual attributes
            _strip_textual_columns(joined_path)

            # Only the first column of the first column group on each side is
            # used as the join key.
            hint = result.get_augment_hint()
            left_column_index = hint.left_columns[0][0].column_index
            right_column_index = hint.right_columns[0][0].column_index

            query_key = list(supplied_data['learningData'].columns)[left_column_index]
            candidate_key = metadata['metadata']['columns'][right_column_index]['name']

            training_records.append(dict(
                query_dataset=supplied_data_path,
                query_key=query_key,
                target=target,
                candidate_dataset=os.path.abspath('%s' % id_),
                candidate_key=candidate_key,
                joined_dataset=os.path.abspath(joined_path),
                imputation_strategy='mean'
            ))
    finally:
        os.chdir(current_working_dir)

    return training_records

In [43]:
# One working directory per use case under companion-datasets/.
# exist_ok=True makes each call a no-op when the directory is already there,
# so the cell can be re-run without a separate existence check.
os.makedirs('companion-datasets', exist_ok=True)
for p in ['taxi-vehicle-collision', 'ny-taxi-demand', 'college-debt', 'poverty-estimation']:
    os.makedirs('companion-datasets/%s' % p, exist_ok=True)

In [44]:
# Datamart client pointed at the service URL http://localhost:8002/ (the same
# URL that download_datasets_and_generate_training_records() passes to
# augment() as connection_url).
client = datamart_nyu.NYUDatamart('http://localhost:8002/')

## NY Taxi and Vehicle Collision Problem

In [9]:
# Location of the taxi / vehicle-collision CSV below the user's home directory.
taxi_vehicle_collision_path = '%s/projects/dataset-ranking/use-cases/data/taxi-vehicle-collision/%s' % (
    Path.home(),
    'taxi-vehicle-collision-v2.csv',
)
# Load the CSV as a D3M dataset through a file:// URI.
taxi_vehicle_collision = container.Dataset.load('file://%s' % taxi_vehicle_collision_path)

In [10]:
# Search using only the supplied dataset (no keyword query); the returned
# cursor is paged with get_next_page() in the next cell.
cursor = client.search_with_data(query=None, supplied_data=taxi_vehicle_collision)

In [11]:
# Drain the cursor: keep fetching pages until an empty (falsy) page comes back.
taxi_vehicle_collision_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    taxi_vehicle_collision_results.extend(results)

In [12]:
# Number of search results collected across all pages.
len(taxi_vehicle_collision_results)

1067

In [13]:
# print_results(taxi_vehicle_collision_results)

In [14]:
# Per dataset id: direct URL or local main.csv path, if either is available.
taxi_vehicle_collision_info = get_materialize_info(taxi_vehicle_collision_results)

In [13]:
# Materialize the candidates, join them with the query data and build the
# training records for the taxi / vehicle-collision problem.
taxi_vehicle_collision_training_records = download_datasets_and_generate_training_records(
    results=taxi_vehicle_collision_results,
    supplied_data=taxi_vehicle_collision,
    supplied_data_path=taxi_vehicle_collision_path,
    target='n. trips',
    id_to_materialize=taxi_vehicle_collision_info,
    dir_='companion-datasets/taxi-vehicle-collision/',
)

## NY Taxi Demand Problem

In [15]:
# Location of the NY taxi demand CSV below the user's home directory.
ny_taxi_demand_path = '%s/projects/dataset-ranking/use-cases/data/ny-taxi-demand/%s' % (
    Path.home(),
    'yellow-taxi-2017-v2.csv',
)
# Load the CSV as a D3M dataset through a file:// URI.
ny_taxi_demand = container.Dataset.load('file://%s' % ny_taxi_demand_path)

## College Debt Problem

In [45]:
# Location of the college debt CSV below the user's home directory.
college_debt_path = '%s/projects/dataset-ranking/use-cases/data/college-debt/%s' % (
    Path.home(),
    'college-debt-v2.csv',
)
# Load the CSV as a D3M dataset through a file:// URI.
college_debt = container.Dataset.load('file://%s' % college_debt_path)

In [46]:
# Search using only the supplied dataset (no keyword query); the returned
# cursor is paged with get_next_page() in the next cell.
cursor = client.search_with_data(query=None, supplied_data=college_debt)

In [47]:
# Drain the cursor: keep fetching pages until an empty (falsy) page comes back.
college_debt_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    college_debt_results.extend(results)

In [48]:
# Number of search results collected across all pages.
len(college_debt_results)

3

In [49]:
# Show score, name and join columns (or id) of every result.
print_results(college_debt_results)

0.9575737
Most- Recent- Cohorts- Scorecard- Elements
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9575737
College Scorecard Data - Most Recent
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.8439322
Most- Recent- Cohorts- Scorecard- Elements
Left Columns: [[('0', 12)]]
Right Columns: [[('0', 83)]]
-------------------


In [50]:
# Per dataset id: direct URL or local main.csv path, if either is available.
college_debt_info = get_materialize_info(college_debt_results)

In [79]:
# Materialize the candidates, join them with the query data and build the
# training records for the college debt problem.
college_debt_training_records = download_datasets_and_generate_training_records(
    results=college_debt_results,
    supplied_data=college_debt,
    supplied_data_path=college_debt_path,
    target='DEBT_EARNINGS_RATIO',
    id_to_materialize=college_debt_info,
    dir_='companion-datasets/college-debt/',
)

## Poverty Estimation Problem

In [52]:
# Location of the poverty estimation CSV below the user's home directory.
poverty_estimation_path = '%s/projects/dataset-ranking/use-cases/data/poverty-estimation/%s' % (
    Path.home(),
    'poverty-estimation-v2.csv',
)
# Load the CSV as a D3M dataset through a file:// URI.
poverty_estimation = container.Dataset.load('file://%s' % poverty_estimation_path)

In [53]:
# Search using only the supplied dataset (no keyword query); the returned
# cursor is paged with get_next_page() in the next cell.
cursor = client.search_with_data(query=None, supplied_data=poverty_estimation)

In [54]:
# Drain the cursor: keep fetching pages until an empty (falsy) page comes back.
poverty_estimation_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    poverty_estimation_results.extend(results)

In [55]:
# Number of search results collected across all pages.
len(poverty_estimation_results)

13

In [56]:
# Show score, name and join columns (or id) of every result.
print_results(poverty_estimation_results)

1.0
SF Development Pipeline 2017 Q3
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 25)]]
-------------------
1.0
SF Development Pipeline 2017 Q2
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 31)]]
-------------------
1.0
SF Development Pipeline 2019 Q2
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 25)]]
-------------------
0.93730605
Zillow Median Listing Prices 2017
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 3)]]
-------------------
0.9362234
FIPS Population
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9362234
Unemployment in the US
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.012638724
SF Development Pipeline 2016 Q3
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 23)]]
-------------------
0.0092448
SF Development Pipeline 2016 Q4
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 25)]]
-------------------
0.007953859
SF Development Pipeline 2016 Q2
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 22)]]
-------------------
[remaining output truncated]

In [57]:
# Per dataset id: direct URL or local main.csv path, if either is available.
poverty_estimation_info = get_materialize_info(poverty_estimation_results)

In [80]:
# Materialize the candidates, join them with the query data and build the
# training records for the poverty estimation problem.
poverty_estimation_training_records = download_datasets_and_generate_training_records(
    results=poverty_estimation_results,
    supplied_data=poverty_estimation,
    supplied_data_path=poverty_estimation_path,
    target='POVALL_2016',
    id_to_materialize=poverty_estimation_info,
    dir_='companion-datasets/poverty-estimation/',
)

datamart.upload.a8241c91db1e4d75a4e4dd37cce12cd1 has no materialization information for download.
datamart.upload.2f6a998b4f5c4c589aaf990c867446b9 has no materialization information for download.


## Generating file with training records

In [81]:
# Start from an empty output directory: remove any previous datamart-records/
# tree, then create it again.
records_dir = 'datamart-records/'
if os.path.exists(records_dir):
    shutil.rmtree(records_dir)
os.mkdir(records_dir)

In [82]:
# Gather the training records of every use case that was processed.
# download_datasets_and_generate_training_records() returns None when a search
# produced no results, hence the `or []` guards.
all_records = list()
# all_records += taxi_vehicle_collision_training_records
all_records += college_debt_training_records or []
all_records += poverty_estimation_training_records or []

# One JSON object per line. The `with` block closes the file even when
# serializing a record fails.
with open('datamart-records/datamart-records', 'w') as training_records:
    for record in all_records:
        training_records.write(json.dumps(record) + "\n")