# Initialize Datasets
## Places

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

import json
from tqdm import tqdm

### Dataset Parameters

In [5]:
# Input dump (parsed one JSON record per line) and directory for exports.
ds_path = "../../data/places_dump_US.geojson"
output_dir = "../../data/exported/"

# Loading stops once num_objects + num_queries records have been collected.
num_objects = 2 * 100_000
num_queries = 5_000_000

# Spatial options: coordinates are rescaled onto [0, grid_range] when
# should_scale is set, unless spatial_uni requests uniformly sampled
# coordinates instead.
grid_range = 512
should_scale = True
spatial_uni = False

# Keyword options: places with fewer than min_keywords tags are skipped;
# with fixed_max_keywords set, longer tag lists are cut to max_keywords.
min_keywords, max_keywords = 1, 7
fixed_max_keywords = True

# Presumably the seed for the randomized steps of the notebook.
seed = 7

### Load Data

In [6]:
# Read the dump one JSON record per line and keep the places that carry
# enough tags, until num_objects + num_queries records are collected.
content = []
max_records = num_objects + num_queries  # loop-invariant cap, computed once

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(ds_path, "r", encoding="utf-8") as f:
    place_id = 0
    for line in tqdm(f):
        if len(content) >= max_records:
            break

        # A blank line (e.g. a trailing newline) holds no record to parse.
        if not line.strip():
            continue

        place = json.loads(line)
        if 'id' not in place or 'properties' not in place:
            continue
        if 'tags' not in place['properties']:
            continue

        tags = place['properties']['tags']
        if len(tags) < min_keywords:
            continue

        # Records get a fresh sequential id; the id stored in the dump is
        # only checked for presence above.
        content.append({
            "id": place_id,
            "x": place['geometry']['coordinates'][0],
            "y": place['geometry']['coordinates'][1],
            "keywords": tags,
            "scaled": False,
        })
        place_id += 1


df = pd.DataFrame(content)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../data/places_dump_US.geojson'

: 

### Pre-process

In [None]:
def scale(c, minc, maxc, target_range=None):
    """Linearly map ``c`` from ``[minc, maxc]`` onto ``[0, target_range]``.

    Args:
        c: Coordinate value to rescale.
        minc: Lower end of the source extent; it maps to 0.
        maxc: Upper end of the source extent; it maps to ``target_range``.
        target_range: Upper end of the output interval. When omitted, the
            notebook-level ``grid_range`` is used.

    Returns:
        The rescaled coordinate. If ``minc == maxc`` the source extent is
        empty, so 0.0 is returned instead of dividing by zero.
    """
    if target_range is None:
        target_range = grid_range

    extent = maxc - minc
    if extent == 0:
        # Degenerate extent: every point collapses onto the lower bound.
        return 0.0
    return (c - minc) * target_range / extent


if should_scale and not spatial_uni:
    # Rescale each axis independently, using its own observed min/max as the
    # source extent. scale() is pure arithmetic, so it is applied to the
    # whole column at once rather than row by row through a Python lambda.
    for axis in ('x', 'y'):
        lo, hi = df[axis].min(), df[axis].max()
        df[axis] = scale(df[axis], lo, hi)
    df['scaled'] = True

df.head()

### Uniform Sample

In [None]:
if spatial_uni:
    # Replace the coordinates with points drawn uniformly from
    # [0, grid_range) on both axes. The generator is seeded from the
    # notebook parameters so repeated runs produce the same dataset, and one
    # point is drawn per row actually present in df.
    rng = np.random.default_rng(seed)
    coords = rng.uniform(0, grid_range, size=(len(df), 2))

    df['x'] = coords[:, 0]
    df['y'] = coords[:, 1]

df.head()

In [None]:
if fixed_max_keywords:
    import random

    # Dedicated generator seeded from the notebook parameters so the same
    # keywords are picked on every run.
    keyword_rng = random.Random(seed)

    def clamp_keywords(keywords):
        """Return ``keywords`` limited to at most ``max_keywords`` entries.

        Lists that already fit are returned unchanged (same object, same
        order). Longer lists are reduced to a random selection of
        ``max_keywords`` entries, returned in sorted order.
        """
        if len(keywords) <= max_keywords:
            return keywords
        return sorted(keyword_rng.sample(keywords, max_keywords))

    df['keywords'] = df['keywords'].apply(clamp_keywords)

df.head()

### Visualize

In [None]:
# Scatter plot of the place coordinates; only the x/y columns are passed on.
xy_only = df[["x", "y"]]
sns.scatterplot(data=xy_only, x="x", y="y")

### Export

In [None]:
# NOTE(review): `fail` is not defined in any cell visible here, so evaluating
# it raises NameError. Presumably a deliberate stop so that "Run All" halts
# before the export cell below — confirm before removing.
fail

In [None]:
import os

# Build the export name from the parameters that shaped the dataset, so that
# differently configured exports do not overwrite each other.
output_name = f'{output_dir}places_o{num_objects}_q{num_queries}'

if should_scale and not spatial_uni:
    output_name += '_scaled'

if spatial_uni:
    output_name += '_spatialuni'

if min_keywords != 0:
    output_name += f'_minkeys{min_keywords}'

output_path = f'{output_name}.json'

# to_json does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

# One JSON object per line, one line per record.
df.to_json(output_path, orient='records', lines=True)
print(output_path)

In [None]:
# Show the (rows, columns) dimensions of the final DataFrame.
df.shape