# Explanation

This file helps export CSVs (to then be cropped using GSVutils.utils.bulk_export_crops) for the old dataset, which contains crops directly around labeled features (as opposed to the new dataset, which has crops made from sliding-window points).

The reason we need a new file is that here we are re-making the old dataset (including remaking the null crops from Esther's CSV) so that we can recrop them and include sidecar files that have metadata. GSVutils.utils.bulk_export_crops automatically writes these metadata sidecars; we just need to give it a list of the crops to make.

We're going to select labels for the training and val sets using the existing partitions (at the pano level) that were made for the new sliding-window dataset.

In [1]:
import csv
from collections import defaultdict
import numpy
import GSVutils.point as point

In [12]:
# Source CSVs: the labeled features and the null crops.
path_to_labels = '/mnt/c/Users/gweld/sidewalk/minus_onboard.csv'
path_to_nulls = '/mnt/c/Users/gweld/sidewalk/random_null_crops.csv'

# Existing train/val CSVs; the pano IDs in their first column define the
# pano-level partition used to split the rows above.
path_to_train_set = '/mnt/c/Users/gweld/sidewalk/sidewalk_ml/dataset_csvs/Train.csv'
path_to_val_set = '/mnt/c/Users/gweld/sidewalk/sidewalk_ml/dataset_csvs/Val.csv'

In [18]:
# Read only the first row of the existing train CSV. It is kept in
# `header_row` and written as the header of the output CSVs later on.
with open(path_to_train_set) as train_file:
    header_row = next(csv.reader(train_file))

# A single-argument print(...) produces the same output under Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print(" ".join(header_row))

Pano ID SV_x SV_y Label Photographer Heading Heading Label ID


# get train and val set panos

In [4]:
# Distinct values of the first column (pano ID) of the train CSV.
train_panos = set()

with open(path_to_train_set) as train_file:
    train_panos.update(record[0] for record in csv.reader(train_file))

# The header row was consumed like any other row, so its first-column value
# ended up in the set; take it back out.
train_panos.remove("Pano ID")

In [5]:
# Distinct values of the first column (pano ID) of the val CSV.
val_panos = set()

with open(path_to_val_set) as val_file:
    val_panos.update(record[0] for record in csv.reader(val_file))

# The header row was consumed like any other row, so its first-column value
# ended up in the set; take it back out.
val_panos.remove("Pano ID")

In [6]:
# Number of distinct panos in each partition, as a (train, val) tuple.
tuple(map(len, (train_panos, val_panos)))

(46463, 5774)

# copy rows into lists

In [22]:
# Split the labeled rows between train and val according to which partition
# their pano (column 0) belongs to. Rows whose pano is in neither set are
# dropped.
train_rows = []
val_rows = []

# Label types (column 3) to keep; every other row is ignored.
wanted_labels = ('1', '2', '3', '4')

with open(path_to_labels) as labels_file:
    for row in csv.reader(labels_file):
        pano_id = row[0]
        label = row[3]
        if label not in wanted_labels:
            # skip label types we don't care about
            continue
        if pano_id in train_panos:
            train_rows.append(row)
        elif pano_id in val_panos:
            val_rows.append(row)

# A formatted, single-argument print(...) emits the same "N M" line under
# Python 2 and Python 3; `print a, b` is a SyntaxError on Python 3.
print("{} {}".format(len(train_rows), len(val_rows)))

160013 19748


# add in nulls

In [23]:
# Append the null-crop rows to the train/val lists, routed by the partition
# of their pano (column 0). Rows whose pano is in neither set are dropped.
# NOTE: this extends the existing lists in place, so running this cell a
# second time without rebuilding them would add the nulls twice.
with open(path_to_nulls) as nulls_file:
    for row in csv.reader(nulls_file):
        pano_id = row[0]
        if pano_id in train_panos:
            train_rows.append(row)
        elif pano_id in val_panos:
            val_rows.append(row)

# A formatted, single-argument print(...) emits the same "N M" line under
# Python 2 and Python 3; `print a, b` is a SyntaxError on Python 3.
print("{} {}".format(len(train_rows), len(val_rows)))

166030 20543


# write these baddies to files

In [24]:
# Destinations for the regenerated train and val CSVs.
path_to_train_set_out = '/mnt/c/Users/gweld/sidewalk/sidewalk_ml/new_old_dataset_csvs/Train.csv'
path_to_val_set_out = '/mnt/c/Users/gweld/sidewalk/sidewalk_ml/new_old_dataset_csvs/Val.csv'

In [25]:
# Write each partition to its output CSV (header row first), then print how
# many rows were written in total and per label (column 3).
for dataset, path in [(train_rows, path_to_train_set_out), (val_rows, path_to_val_set_out)]:
    # Per-label tally for this partition. It has its own name so the summary
    # loop below does not rebind the tally while iterating over it.
    label_counts = defaultdict(int)
    with open(path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)
        for row in dataset:
            label = row[3]
            writer.writerow(row)
            label_counts[label] += 1

    # Single-argument print(...) calls behave the same under Python 2 and 3.
    print("Wrote {} items to {}".format(sum(label_counts.values()), path))
    # Sorted so the summary comes out in the same order on every run.
    for label, n_rows in sorted(label_counts.items()):
        print("{:>5} {}".format(label, n_rows))
    print("")

Wrote 166030 items to /mnt/c/Users/gweld/sidewalk/sidewalk_ml/new_old_dataset_csvs/Train.csv
    1 119799
    8 6017
    3 17519
    2 15692
    4 7003

Wrote 20543 items to /mnt/c/Users/gweld/sidewalk/sidewalk_ml/new_old_dataset_csvs/Val.csv
    1 14697
    8 795
    3 2192
    2 1979
    4 880

