# Grid search 2 - Find the optimal parameters

In [20]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# load libs
import pandas as pd
import os
import json
import numpy as np
import shutil

def kullback_leibler(distances, titles=None):
    """
    Compute the Kullback-Leibler divergence between two distance distributions.

    :param distances:
    A data frame, output of compute(); its ``<title>_sum`` columns are used to
    calculate the Kullback-Leibler divergence measure.
    :param titles:
    Two column-name prefixes: titles[0] ground truth, titles[1] model.
    :return:
    sum(p * log10(p / q)) over the rows whose model value q is non-zero,
    where p is the ground-truth column and q the model column.
    :raises Exception:
    If titles is None or does not hold exactly two entries.
    """
    # `or` short-circuits, so len() is never evaluated on a missing `titles`
    # (the bitwise `|` evaluated both sides and failed with a TypeError).
    if titles is None or len(titles) != 2:
        raise Exception("Two distance distributions need to be specified.")
    truth_col = titles[0] + '_sum'
    model_col = titles[1] + '_sum'
    # Rows with a zero model value would cause a division by zero; drop them.
    distances = distances.loc[distances[model_col] != 0, :]
    d_ground_truth = distances[truth_col].values
    d_model = distances[model_col].values
    d_kl = sum(d_ground_truth * np.log10(d_ground_truth / d_model))
    return d_kl


def nan_proc(row, scale):
    """
    Recompute the Kullback-Leibler divergence of one run from its saved
    distance-metrics file.

    :param row:
    A record providing 'country' and 'runid', which name the results folder.
    :param scale:
    Suffix of the distance-metrics CSV file to read.
    :return:
    Divergence between the 'sampers' (ground truth) and 'model' columns.
    """
    metrics_file = f"../../results/{row['country']}-{row['runid']}/distance-metrics-{scale}.csv"
    metrics = pd.read_csv(metrics_file)
    # Sampers is the ground truth the proposed model is compared against
    return kullback_leibler(distances=metrics, titles=['sampers', 'model'])


## 1 Process the results of the second layer of grid searching
### 1.1 Get the results paths

In [22]:
# Run configuration: the study area and the folders holding its outputs.
country = 'saopaulo'
rs_path = '../../results/'
vs_path = f'../../dbs/{country}/visits/'

# Keep the entries of the results folder whose name does not contain '.csv' ...
paths = [entry for entry in os.listdir(rs_path) if '.csv' not in entry]
# ... and, among those, the ones whose name mentions the chosen country.
rs_paths = [f'{rs_path}{entry}/' for entry in paths if country in entry]
# For folders named '<country>-<runid>', the run id is the segment after the
# first '-' of the path with the trailing '/' stripped.
runids = [folder.split('-')[1][:-1] for folder in rs_paths]

### 1.2 Read results.json and parameters.json

In [23]:
def _read_json(path):
    """Load one JSON document, closing the file handle once it has been read."""
    with open(path) as handle:
        return json.load(handle)


# Build one record per run: its parameters plus its divergence result(s).
log_list = []
for run, rs in zip(runids, rs_paths):
    para = _read_json(rs + 'parameters.json')
    if country == 'sweden':
        # For Sweden the model parameters are nested under the 'visits' key
        para = para['visits']
    kl = _read_json(rs + 'results.json')
    # log the results
    log = {
        'country': country,
        'runid': run,
        'p': para['model']['p'],
        'gamma': para['model']['gamma'],
        'beta': para['model']['region_sampling']['beta'],
    }
    if country == 'sweden':
        # Every key of results.json becomes its own column
        log.update(kl)
    else:
        # Otherwise the whole content of results.json is stored as 'national'
        log['national'] = kl
    log_list.append(log)
df = pd.DataFrame(log_list)

### 1.3 Recompute missing (NaN) divergence values

In [5]:
# Only the Swedish runs are re-processed, and only at the national scale.
if country == 'sweden':
    scale = 'national'
    # Rows whose stored value is NaN get it recomputed from the run's
    # distance-metrics file via nan_proc().
    nan_rows = np.isnan(df[scale])
    # Skip the assignment when nothing is missing: apply(axis=1) on an empty
    # selection does not produce a Series of recomputed values to assign.
    if nan_rows.any():
        df.loc[nan_rows, scale] = df.loc[nan_rows, :].apply(lambda row: nan_proc(row, scale), axis=1)

### 1.4 Save the results

In [24]:
# Persist the per-run summary table as a CSV file without the index column.
summary_csv = f'../../results/{country}-grid-search-2.csv'
df.to_csv(summary_csv, index=False)

## 2 Find the optimal parameters

In [25]:
# Select the run(s) with the smallest divergence. Series.min() skips NaN,
# whereas the builtin min() can return NaN depending on element order, which
# would match no row at all and leave nothing to keep.
if country == 'sweden':
    is_optimal = pd.Series(False, index=df.index)
    for scale in ['national', 'east', 'west']:
        is_best = df[scale] == df[scale].min()
        print(df.loc[is_best, ['runid', 'p', 'gamma', 'beta', scale]].transpose())
        # A run is kept when it is the best one for at least one scale
        is_optimal = is_optimal | is_best
    df2 = df.loc[is_optimal]
else:
    df2 = df.loc[df['national'] == df['national'].min()]
runids2keep = list(df2.runid)

# The cells that follow delete every run that is not in runids2keep, so an
# empty selection must stop the notebook here instead of wiping all results.
if not runids2keep:
    raise ValueError("No optimal run could be identified; refusing to continue to the clean-up step.")

### 2.1 Remove the results of all other runs

In [26]:
# log results
# Delete, with all of its contents, every run folder whose id is not kept.
for path in rs_paths:
    run_id = path.split('-')[1][:-1]
    if run_id in runids2keep:
        continue
    shutil.rmtree(path)

In [27]:
# visits results
# Delete the visits CSV file of every run that is not kept.
for runid in runids:
    if runid in runids2keep:
        continue
    os.remove(f'{vs_path}{runid}.csv')