**Python Version Requirement:** Python 3.6

In [79]:
from d3m import container
import datamart
import datamart_nyu
from pathlib import Path
import subprocess
import os
import shutil

In [78]:
# Local datamart directory. download_datasets() falls back to
# '<DATAMART_PATH>/volumes/datasets/<id>/main.csv' when a search result
# carries no direct download URL.
# NOTE: this is a machine-specific absolute path; adjust it for your setup.
DATAMART_PATH = '/Users/fchirigati/projects/d3m/datamart'

In [55]:
def print_results(results):
    """Print a short, human-readable summary of every search result.

    For each result this prints its score and its metadata name. When the
    result carries an augment hint, the left and right join columns are
    printed as nested lists of ``(resource_id, column_index)`` tuples;
    otherwise the result id is printed instead. A dashed separator line
    closes each entry.

    :param results: iterable of search results; a falsy value (``None`` or
        an empty list) prints nothing.
    :return: ``None``.
    """
    def as_pairs(column_groups):
        # One inner list per group of columns, one tuple per column.
        return [
            [(col.resource_id, col.column_index) for col in group]
            for group in column_groups
        ]

    for entry in results or ():
        print(entry.score())
        print(entry.get_json_metadata()['metadata']['name'])
        hint = entry.get_augment_hint()
        if not hint:
            # No join information available: fall back to the result id.
            print(entry.id())
        else:
            print("Left Columns: %s" % str(as_pairs(hint.left_columns)))
            print("Right Columns: %s" % str(as_pairs(hint.right_columns)))
        print("-------------------")

In [67]:
def get_join_info(results):
    """Collect the download URL and join columns for each dataset id.

    Results that share the same ``id`` are merged into a single entry:
    their join column indices are appended in the order the results are
    visited, and the first non-empty ``direct_url`` found is kept.

    :param results: iterable of search results exposing
        ``get_json_metadata()`` and ``get_augment_hint()``; may be ``None``
        or empty.
    :return: dict mapping dataset id to
        ``{'url': str or None, 'left': [int, ...], 'right': [int, ...]}``.
        An empty dict is returned for falsy input, so the result can always
        be iterated (e.g. by ``download_datasets``).
    :raises KeyError: if a result's metadata lacks ``'id'``, or lacks
        ``['metadata']['materialize']`` while no URL is known yet for it.
    """
    id_to_join_info = dict()
    for result in results or ():
        # Fetch the metadata once per result instead of once per lookup.
        json_metadata = result.get_json_metadata()
        id_ = json_metadata['id']
        info = id_to_join_info.setdefault(
            id_, dict(url=None, left=[], right=[])
        )

        if not info['url']:
            materialize = json_metadata['metadata']['materialize']
            if 'direct_url' in materialize:
                info['url'] = materialize['direct_url']

        # print_results() treats the augment hint as optional, so do the
        # same here: a result without a hint contributes no join columns
        # but is still listed (it may still be downloadable).
        hint = result.get_augment_hint()
        if not hint:
            continue
        for column_group in hint.left_columns:
            for column in column_group:
                info['left'].append(column.column_index - 1)  # ignore d3mIndex
        for column_group in hint.right_columns:
            for column in column_group:
                info['right'].append(column.column_index)
    return id_to_join_info

In [82]:
def download_datasets(id_to_join_info, dir_):
    """Download (or copy) every dataset in ``id_to_join_info`` into ``dir_``.

    For each dataset id:

    * if a URL is known, ``wget`` saves it to a file named after the id and
      a companion file ``.join-<id>`` is written with two lines: the left
      join column indices and the right join column indices;
    * otherwise the file
      ``<DATAMART_PATH>/volumes/datasets/<id>/main.csv`` is copied if it
      exists, else a message is printed.

    The process working directory is switched to ``dir_`` for the duration
    of the call and always restored afterwards.

    :param id_to_join_info: mapping of dataset id to a dict with the keys
        ``'url'``, ``'left'`` and ``'right'``; ``None`` is treated as an
        empty mapping.
    :param dir_: existing directory that receives the files.
    :return: ``None``.
    """
    current_working_dir = os.getcwd()
    os.chdir(dir_)
    try:
        for id_, join_info in (id_to_join_info or {}).items():
            url = join_info['url']
            if url:
                # Pass an argument list instead of a shell string: URLs
                # routinely contain '&', '?' or ';', which a shell would
                # interpret rather than hand to wget.
                # NOTE(review): wget's exit status is ignored, as before.
                subprocess.call(['wget', '-O', id_, url])
                with open('.join-%s' % id_, 'w') as join_file:
                    join_file.write(str(join_info['left']) + '\n')
                    join_file.write(str(join_info['right']))
            else:
                # try to find them on volumes
                # NOTE(review): no '.join-<id>' file is written on this
                # path -- confirm that this is intended.
                datamart_file_path = os.path.join(
                    DATAMART_PATH, 'volumes/datasets', id_, 'main.csv'
                )
                if os.path.exists(datamart_file_path):
                    shutil.copyfile(datamart_file_path, id_)
                else:
                    print('%s has no direct url for download.' % id_)
    finally:
        # Restore the caller's working directory even if a download fails.
        os.chdir(current_working_dir)

In [70]:
# Create one output directory per use case under 'companion-datasets'.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs('companion-datasets', exist_ok=True)
for p in ['taxi-vehicle-collision', 'ny-taxi-demand', 'college-debt', 'poverty-estimation']:
    os.makedirs('companion-datasets/%s' % p, exist_ok=True)

In [6]:
# Client for the NYU Datamart service; expects it to be reachable at
# localhost:8002.
client = datamart_nyu.NYUDatamart('http://localhost:8002/')

## NY Taxi and Vehicle Collision Problem

In [34]:
# Absolute path of the taxi / vehicle collision CSV under the user's home
# directory, then load it as a d3m Dataset through a file:// URI.
taxi_vehicle_collision_path = ''.join((
    str(Path.home()),
    '/projects/dataset-ranking/use-cases/data/taxi-vehicle-collision/',
    'taxi-vehicle-collision-v2.csv',
))
taxi_vehicle_collision = container.Dataset.load('file://' + taxi_vehicle_collision_path)

In [35]:
# Data-driven search: no keyword query, only the supplied dataset is given.
cursor = client.search_with_data(query=None, supplied_data=taxi_vehicle_collision)

In [36]:
# Drain the cursor page by page; a falsy page marks the end of the results.
taxi_vehicle_collision_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    taxi_vehicle_collision_results.extend(results)

In [37]:
# Total number of results gathered across all pages.
len(taxi_vehicle_collision_results)

1020

In [38]:
# print_results(taxi_vehicle_collision_results)

In [69]:
# Per-dataset download URL and join column indices for this use case.
taxi_vehicle_collision_links = get_join_info(taxi_vehicle_collision_results)

In [13]:
# download_datasets(taxi_vehicle_collision_links, 'companion-datasets/taxi-vehicle-collision/')

## NY Taxi Demand Problem

In [14]:
# Absolute path of the NY taxi demand CSV under the user's home directory,
# then load it as a d3m Dataset through a file:// URI.
ny_taxi_demand_path = ''.join((
    str(Path.home()),
    '/projects/dataset-ranking/use-cases/data/ny-taxi-demand/',
    'yellow-taxi-2017-v2.csv',
))
ny_taxi_demand = container.Dataset.load('file://' + ny_taxi_demand_path)

## College Debt Problem

In [71]:
# Absolute path of the college debt CSV under the user's home directory,
# then load it as a d3m Dataset through a file:// URI.
college_debt_path = ''.join((
    str(Path.home()),
    '/projects/dataset-ranking/use-cases/data/college-debt/',
    'college-debt-v2.csv',
))
college_debt = container.Dataset.load('file://' + college_debt_path)

In [72]:
# Data-driven search: no keyword query, only the supplied dataset is given.
cursor = client.search_with_data(query=None, supplied_data=college_debt)

In [73]:
# Drain the cursor page by page; a falsy page marks the end of the results.
college_debt_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    college_debt_results.extend(results)

In [74]:
# Total number of results gathered across all pages.
len(college_debt_results)

5

In [75]:
# Show score, name and join columns (or id) for every result.
print_results(college_debt_results)

0.9575737
Most- Recent- Cohorts- Scorecard- Elements
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9575737
None
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9575737
None
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9575737
College Scorecard Data - Most Recent
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.8439322
Most- Recent- Cohorts- Scorecard- Elements
Left Columns: [[('0', 12)]]
Right Columns: [[('0', 83)]]
-------------------


In [76]:
# Per-dataset download URL and join column indices for this use case.
college_debt_links = get_join_info(college_debt_results)

In [83]:
# Fetch the companion datasets into the college-debt output directory.
download_datasets(college_debt_links, 'companion-datasets/college-debt/')

## Poverty Estimation Problem

In [44]:
# Absolute path of the poverty estimation CSV under the user's home
# directory, then load it as a d3m Dataset through a file:// URI.
poverty_estimation_path = ''.join((
    str(Path.home()),
    '/projects/dataset-ranking/use-cases/data/poverty-estimation/',
    'poverty-estimation-v2.csv',
))
poverty_estimation = container.Dataset.load('file://' + poverty_estimation_path)

In [45]:
# Data-driven search: no keyword query, only the supplied dataset is given.
cursor = client.search_with_data(query=None, supplied_data=poverty_estimation)

In [46]:
# Drain the cursor page by page; a falsy page marks the end of the results.
poverty_estimation_results = []
while True:
    results = cursor.get_next_page()
    if not results:
        break
    poverty_estimation_results.extend(results)

In [47]:
# Total number of results gathered across all pages.
len(poverty_estimation_results)

15

In [48]:
# Show score, name and join columns (or id) for every result.
print_results(poverty_estimation_results)

1.0
SF Development Pipeline 2017 Q3
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 25)]]
-------------------
1.0
SF Development Pipeline 2017 Q2
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 31)]]
-------------------
1.0
SF Development Pipeline 2019 Q2
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 25)]]
-------------------
0.93730605
Zillow Median Listing Prices 2017
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 3)]]
-------------------
0.9362234
FIPS Population
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9362234
None
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9362234
None
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.9362234
Unemployment in the US
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 0)]]
-------------------
0.012638724
SF Development Pipeline 2016 Q3
Left Columns: [[('0', 1)]]
Right Columns: [[('0', 23)]]
-------------------
0.0092448
SF Development Pipeline 2

In [84]:
# Per-dataset download URL and join column indices for this use case.
poverty_estimation_links = get_join_info(poverty_estimation_results)

In [85]:
# Fetch the companion datasets into the poverty-estimation output directory.
download_datasets(poverty_estimation_links, 'companion-datasets/poverty-estimation/')

datamart.upload.a8241c91db1e4d75a4e4dd37cce12cd1 has no direct url for download.
datamart.upload.2f6a998b4f5c4c589aaf990c867446b9 has no direct url for download.
