# Thresholding

This notebook shows how the predicted tours change as the threshold value is adjusted.

Each edge has a predicted probability of being in the final tour. By adjusting the threshold applied to this probability, we can adjust how many true edges are in the tour.

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

# Workaround for an Anaconda/basemap issue: point PROJ_LIB at the conda environment's
# (Windows-style) share directory before basemap is imported.
# Only applied when CONDA_PREFIX is defined; outside a conda environment the lookup
# would otherwise raise KeyError and stop the notebook at the import cell.
_conda_prefix = os.environ.get('CONDA_PREFIX')
if _conda_prefix is not None:
    os.environ['PROJ_LIB'] = _conda_prefix + '\\Library\\share'
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import joblib

import model_utils as M

%matplotlib inline

In [None]:
# Index into the test split (test_set / coords_test_list) selecting which tour to preview.
set_idx = 1
# Value passed as `threshold` to M.threshold_tour for every model's tour below.
THRESHOLD = 0.2

Load in the models.

In [None]:
# Models saved with joblib; the NB / LR / RF suffixes match the Naïve Bayes,
# Logistic Regression and Random Forest sections further down.
# NOTE(security): joblib.load unpickles its input, which can execute arbitrary code.
# Only load model files that come from a trusted source.
TSP_NB = joblib.load('Models/TSP_NB_model.pkl')
TSP_LR = joblib.load('Models/TSP_LR_model.pkl')
TSP_RF = joblib.load('Models/TSP_RF_model.pkl')

Import the data and split into Training and Testing sets.

In [None]:
DATA_DIR = '../6_feature_engineering/Feature_Dataset/'

# Sorted filename order fixes the order of the frames in df_TSP_list.
data_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith('.csv'))

# One frame per CSV. The flag columns are cast to int64 (booleans -> 0/1) and the
# index is reset to a fresh RangeIndex.
df_TSP_list = [
    pd.read_csv(DATA_DIR + csv_name)
    .astype({'IS_IN_1ST_QUARTILE': 'int64',
             'IS_IN_2ND_QUARTILE': 'int64',
             'IS_IN_3RD_QUARTILE': 'int64',
             'EDGE_IN_SOL': 'int64'})
    .reset_index(drop=True)
    for csv_name in data_files
]

df_TSP_list[0].head()

In [None]:
# The first five frames (in sorted filename order) form train_set, the rest test_set.
train_set, test_set = df_TSP_list[:5], df_TSP_list[5:]

for caption, graphs in (("Train Data Graphs:\t{}", train_set),
                        ("Test Data Graphs:\t{}", test_set)):
    print(caption.format(len(graphs)))

Load in the coordinate data.

In [None]:
COORD_DATA_DIR = '../5_ground_truth/Final_Dataset/'
coord_datafiles = sorted(name for name in os.listdir(COORD_DATA_DIR) if name.endswith('.csv'))

df_coord_list = [pd.read_csv(COORD_DATA_DIR + csv_name) for csv_name in coord_datafiles]

# The coordinate columns are stored as text in the CSVs and are turned back into
# Python objects here.
# NOTE(security): eval executes whatever expression the cell contains, so only run
# this on CSV files from a trusted source. If the cells are plain literals,
# ast.literal_eval would be a safer drop-in — TODO confirm the stored format.
for df_coords in df_coord_list:
    for coord_col in ('NODE1_COORDS', 'NODE2_COORDS'):
        df_coords[coord_col] = df_coords[coord_col].apply(eval)

# Same 5 / rest split as the feature frames, so that coords_test_list[i] is meant to
# describe the same graph as test_set[i] — assumes both directories list the same
# graphs in the same sorted order (TODO confirm).
coords_train_list, coords_test_list = df_coord_list[:5], df_coord_list[5:]

coords_test_list[set_idx].head()

Find the bounds of the data.

In [None]:
PUBS_DATA = '../1_pubs_crawler/pubs.csv'

df_pubs = pd.read_csv(PUBS_DATA)

# Bounding box over all rows of pubs.csv, ordered as
# (lon_min, lon_max, lat_min, lat_max). plot_tour reads it by index to set the
# map corners, so the order must not change.
BBox = (df_pubs.longitude.min(), df_pubs.longitude.max(),
        df_pubs.latitude.min(), df_pubs.latitude.max())
BBox

Define the function to plot the tours.

In [None]:
def plot_tour(node_1_coords, node_2_coords, ground_truth=None, title=""):
    """Draw a set of edges on a coastline map bounded by the global ``BBox``.

    Every edge is drawn as its own two-point line segment with a small marker
    at each endpoint.

    Parameters
    ----------
    node_1_coords, node_2_coords : iterable (e.g. pandas.Series)
        Edge endpoints, paired by position. Each item is indexed as
        ``(latitude, longitude)``: element 0 is plotted as the latitude and
        element 1 as the longitude.
    ground_truth : pandas.Series or None, optional
        Flags paired with the edges by position (read with ``.iloc``). An edge
        whose flag is truthy is drawn blue, any other edge red. When ``None``
        all edges are drawn red.
    title : str, optional
        Title of the figure. Defaults to an empty title.
    """
    plt.figure(figsize=(12, 9))

    # BBox is (lon_min, lon_max, lat_min, lat_max).
    m = Basemap(projection='mill',
                llcrnrlat=BBox[2],
                urcrnrlat=BBox[3],
                llcrnrlon=BBox[0],
                urcrnrlon=BBox[1],
                resolution='i')

    m.drawcoastlines()

    for i, (coord1, coord2) in enumerate(zip(node_1_coords, node_2_coords)):
        longs = [coord1[1], coord2[1]]
        lats = [coord1[0], coord2[0]]

        # Blue marks an edge flagged in ground_truth; everything else is red.
        in_ground_truth = ground_truth is not None and bool(ground_truth.iloc[i])
        edge_color = 'blue' if in_ground_truth else 'red'

        m.plot(longs, lats, latlon=True, color=edge_color, linewidth=1,
               marker='o', markersize=2, markerfacecolor='blue')

    plt.title(title)
    plt.show()

## Ground Truth Tour

In [None]:
# Keep only the rows of the selected test graph whose EDGE_IN_SOL flag is set
# (assumes EDGE_IN_SOL is a boolean column in the coordinate CSVs — TODO confirm).
df_test_coords = coords_test_list[set_idx]
df_ground_truth = df_test_coords[df_test_coords['EDGE_IN_SOL']]
df_ground_truth.head()

In [None]:
# Every row kept in df_ground_truth has EDGE_IN_SOL set, so all edges are drawn blue.
plot_tour(
    node_1_coords=df_ground_truth['NODE1_COORDS'],
    node_2_coords=df_ground_truth['NODE2_COORDS'],
    ground_truth=df_ground_truth['EDGE_IN_SOL'],
)

## Naïve Bayes Tour

In [None]:
# Feature matrix for the selected test graph: every column except the last one
# (presumably the EDGE_IN_SOL label — confirm against the feature CSV layout).
cols = test_set[set_idx].columns[:-1]
X = test_set[set_idx][cols].values

# Tour built by M.threshold_tour (defined in model_utils) from the Naïve Bayes model.
# plot_tour draws edges with a truthy EDGE_IN_SOL in blue and the rest in red.
NB_thresh_tour = M.threshold_tour(X, coords_test_list[set_idx], TSP_NB, threshold=THRESHOLD)
plot_tour(NB_thresh_tour['NODE1_COORDS'], NB_thresh_tour['NODE2_COORDS'], NB_thresh_tour['EDGE_IN_SOL'])

## Logistic Regression Tour

In [None]:
# Feature matrix for the selected test graph: every column except the last one
# (presumably the EDGE_IN_SOL label — confirm against the feature CSV layout).
cols = test_set[set_idx].columns[:-1]
X = test_set[set_idx][cols].values

# Tour built by M.threshold_tour (defined in model_utils) from the Logistic Regression model.
# plot_tour draws edges with a truthy EDGE_IN_SOL in blue and the rest in red.
LR_thresh_tour = M.threshold_tour(X, coords_test_list[set_idx], TSP_LR, threshold=THRESHOLD)
plot_tour(LR_thresh_tour['NODE1_COORDS'], LR_thresh_tour['NODE2_COORDS'], LR_thresh_tour['EDGE_IN_SOL'])

## Random Forest Tour

In [None]:
# Feature matrix for the selected test graph: all columns but the last
# (presumably the EDGE_IN_SOL label — confirm against the feature CSV layout).
df_test_features = test_set[set_idx]
cols = df_test_features.columns[:-1]
X = df_test_features[cols].values

# Tour built by M.threshold_tour (defined in model_utils) from the Random Forest model.
RF_thresh_tour = M.threshold_tour(X, coords_test_list[set_idx], TSP_RF, threshold=THRESHOLD)
plot_tour(
    node_1_coords=RF_thresh_tour['NODE1_COORDS'],
    node_2_coords=RF_thresh_tour['NODE2_COORDS'],
    ground_truth=RF_thresh_tour['EDGE_IN_SOL'],
)