In [1]:
import numpy as np
import tempfile
from shutil import copy
import pandas as pd
from multiprocessing import Pool
import multiprocessing.pool as mpp
import subprocess
import scipy.spatial.distance as ssd
import matplotlib.pyplot as plt
from function import *
from platform import python_version
import tqdm

# Approach
1. Generate list of maps to compare
1. Generate list of all required comparisons
1. Pre-load all csv maps into Python
1. Select map pair to compare
1. Create temp folder
1. Provide temp folder with maskland and mask file
1. Generate asc version of csv maps that are to be compared
1. Generate CSL and log file for the comparison and copy to temp folder
1. Copy legends folder to temp folder
1. Run comparison in MCK
1. Extract required stats from outputfile
1. Delete temp folder with all files
1. Go back and repeat from step 4 until all comparisons have been run
1. Export all data to disk
<img src="PythonPipeline.png">

# Problems
1. For some reason map19 in the sample converts some rows to 'NULL' even though the corresponding rows in ascmaps & ascmaps2 are correct

# Help Functions
<b><font size="4">Any function not defined here can be found in function.py</font></b><br>
This split is necessary because `Pool` cannot run functions that are defined inside the notebook itself.

In [2]:
def single_df(maps, stats, metric):
    """Build, save and return the distance matrix for a single-map metric.

    Parameters
    ----------
    maps : sequence of map labels; used as the row labels of the saved values
        and as both the row and column labels of the distance matrix.
    stats : one metric value per entry of ``maps`` (same order).
    metric : str, name of the metric; used as the value column name and as the
        prefix of the two CSV file names.

    Returns
    -------
    pandas.DataFrame
        Square frame (``maps`` x ``maps``) holding the pairwise distance
        between the metric values of every two maps.

    Side effects
    ------------
    Writes ``<csv_dir><metric>_values.csv`` (the raw per-map values) and
    ``<csv_dir><metric>_df.csv`` (the distance matrix, without the index
    column). ``csv_dir`` is the module-level output directory.
    """
    # one row per map, holding that map's metric value
    df = pd.DataFrame(index=maps)
    df[metric] = stats
    # pdist's default metric is euclidean; squareform expands the condensed
    # result into a full symmetric matrix
    matrix = ssd.squareform(ssd.pdist(df))
    df_clean = pd.DataFrame(matrix, index=maps, columns=maps)

    # save values to disk
    # The frame built above is already labelled with `maps`, so it is written
    # directly instead of rebuilding it from the notebook-global `map_set`;
    # the values file therefore always carries the same labels as the matrix.
    csv_val = csv_dir + metric + '_values.csv'
    df.to_csv(csv_val)
    # save df to disk
    csv_name = csv_dir + metric + '_df.csv'
    df_clean.to_csv(csv_name, index=False)
    return df_clean

def multi_df(map1, map2, stats, metric):
    """Build, save and return the square matrix for a pairwise (multi-map) metric.

    ``map1``/``map2`` hold the two map labels of every comparison and ``stats``
    the metric value of that comparison. Each pair is mirrored so the result is
    symmetric, cells without a value are filled with 1, values are rounded to
    three decimals, and rows/columns are ordered by the integer that follows
    the last 'p' in each label. The matrix is written to
    ``<csv_dir><metric>_df.csv`` (without the index column) and returned.
    """
    # long-format table: one row per compared pair
    pairs = pd.DataFrame({'map1': list(map1), 'map2': list(map2)})
    pairs[metric] = stats

    # same rows with the two map columns swapped, so both (a, b) and (b, a) exist
    mirrored = pairs[['map2', 'map1', metric]].copy()
    mirrored.columns = ['map1', 'map2', metric]

    both_directions = pd.concat([pairs, mirrored])
    wide = both_directions.pivot(index='map2', columns='map1', values=metric)

    # clean up the presentation
    # use one common label set for rows and columns
    labels = wide.index.union(wide.columns)
    square = wide.reindex(index=labels, columns=labels)

    # reorder labels numerically (the text after the last 'p' is read as an int)
    suffix_numbers = square.index.to_series().str.rsplit('p').str[-1].astype(int)
    numeric_order = suffix_numbers.sort_values().index
    df_clean = (
        square
        .reindex(index=numeric_order, columns=numeric_order)
        .fillna(1)
        .round(decimals=3)
    )

    # save df to disk
    df_clean.to_csv(csv_dir + metric + '_df.csv', index=False)
    return df_clean

def istarmap(self, func, iterable, chunksize=1):
    """starmap-version of imap.

    Submits ``func(*args)`` for every ``args`` tuple in ``iterable`` to the
    pool and returns a generator over the individual results, so the caller
    can consume them one by one (e.g. to drive a progress bar).

    Parameters
    ----------
    self : the pool instance (this function is attached to ``mpp.Pool`` below).
    func : callable executed by the workers with each tuple unpacked.
    iterable : iterable of argument tuples.
    chunksize : int >= 1, number of tasks sent to a worker per batch.

    Raises
    ------
    ValueError
        If the pool is not running or ``chunksize`` is smaller than 1.

    Notes
    -----
    This relies on private ``multiprocessing.pool`` internals. From Python 3.8
    on, ``IMapIterator`` is constructed from the pool itself rather than from
    the pool's cache, so the construction is selected by interpreter version.
    """
    import sys

    py38_plus = sys.version_info >= (3, 8)

    if py38_plus:
        # raises ValueError("Pool not running") when the pool is closed/terminated
        self._check_running()
    elif self._state != mpp.RUN:
        raise ValueError("Pool not running")

    if chunksize < 1:
        raise ValueError(
            "Chunksize must be 1+, not {0:n}".format(
                chunksize))

    task_batches = mpp.Pool._get_tasks(func, iterable, chunksize)
    # Python >= 3.8 expects the pool, older versions expect the pool's cache
    result = mpp.IMapIterator(self) if py38_plus else mpp.IMapIterator(self._cache)
    self._taskqueue.put(
        (
            self._guarded_task_generation(result._job,
                                          mpp.starmapstar,
                                          task_batches),
            result._set_length
        ))
    # every item of `result` is one chunk (a list of results); flatten lazily
    return (item for chunk in result for item in chunk)


# make the helper available as a method on every Pool instance
mpp.Pool.istarmap = istarmap

# Setup Variables Main

In [3]:
# Output folder: single_df / multi_df write their CSV files here
csv_dir = 'C:/LUMOS/MCK/Output_DFs/'

# Number of jobs to run: one per entry of map_list for single-map metrics,
# one per entry of map_pairs for multi-map metrics
single_its, multi_its = len(map_list), len(map_pairs)

# Map labels 'map0', 'map1', ... (nr_maps labels in total)
map_set = ['map{}'.format(i) for i in range(nr_maps)]

# Metric_ID

The table below lists the metrics available for analysis together with the ID of each metric. The ID is the value that must be added to the metric_id list to generate the desired output, and it is also the value passed to the extract stats function:<br>

| Metric                        |   ID   |              note             |
|-------------------------------|--------|--------------------------------|
|   Kappa                       | kappa  |         done                       |
|   Fuzzy Kappa                 | kfuzzy |                      |
|   Overall Accuracy                 | oa |                      |
|   Proportion Correct               | prop |                      |
|   Allocation Disagreement     | alloc  |                      | 
|   Quantity Disagreement       | quant  |                      | 
|   Total Disagreement       | td  |                      |
|   Total Class Area          | tca|                      |
|  Percentage of Landscape  | pland|                      |
|   Simpson's Diversity Index   | simp|                 done     |
|   Shannon's Diversity Index   | shan|             done         |

# Run Kappa

In [None]:
# Result containers: the two map labels of every comparison and its kappa value
kappa, kappa_maps1, kappa_maps2 = [], [], []

with Pool(16) as pool:
    # one job per multi-map comparison; every job is (job index, metric id)
    iterable = [(i, 'kappa') for i in range(multi_its)]
    # istarmap hands results back one at a time, so tqdm can show progress
    finished_jobs = tqdm.tqdm(pool.istarmap(run_comparisons, iterable),
                              total=len(iterable))
    for n1, n2, sts in finished_jobs:
        kappa_maps1.append(n1)
        kappa_maps2.append(n2)
        kappa.append(sts)

### Kappa Dataframe

In [None]:
# Turn the collected kappa results into a square matrix (also saved to csv_dir)
df_kappa = multi_df(map1=kappa_maps1, map2=kappa_maps2, stats=kappa, metric='kappa')
df_kappa.head(n=5)

# Run Fuzzy Kappa

In [4]:
# Result containers: the two map labels of every comparison and its fuzzy kappa value
kfuzzy, kfuzzy_maps1, kfuzzy_maps2 = [], [], []

with Pool(16) as pool:
    # one job per multi-map comparison; every job is (job index, metric id)
    iterable = [(i, 'kfuzzy') for i in range(multi_its)]
    # istarmap hands results back one at a time, so tqdm can show progress
    finished_jobs = tqdm.tqdm(pool.istarmap(run_comparisons, iterable),
                              total=len(iterable))
    for n1, n2, sts in finished_jobs:
        kfuzzy_maps1.append(n1)
        kfuzzy_maps2.append(n2)
        kfuzzy.append(sts)

100%|████████████████████████████████████████████████████████████████████████████████| 190/190 [18:16<00:00,  5.77s/it]


### Fuzzy Kappa DF

In [5]:
# Turn the collected fuzzy kappa results into a square matrix (also saved to csv_dir)
df_kfuzzy = multi_df(map1=kfuzzy_maps1, map2=kfuzzy_maps2, stats=kfuzzy, metric='kfuzzy')
df_kfuzzy.head(n=5)

Unnamed: 0,map0,map1,map2,map3,map4,map5,map6,map7,map8,map9,map10,map11,map12,map13,map14,map15,map16,map17,map18,map19
map0,1.0,0.946,0.93,0.951,0.96,0.951,0.923,0.923,0.921,0.949,0.94,0.919,0.954,0.943,0.942,0.985,0.949,0.939,0.94,0.927
map1,0.946,1.0,0.972,0.987,0.948,0.987,0.966,0.964,0.961,0.946,0.953,0.966,0.934,0.939,0.934,0.946,0.98,0.963,0.96,0.971
map2,0.93,0.972,1.0,0.972,0.94,0.972,0.966,0.967,0.964,0.939,0.95,0.961,0.936,0.94,0.937,0.934,0.972,0.958,0.951,0.963
map3,0.951,0.987,0.972,1.0,0.944,0.999,0.966,0.968,0.961,0.937,0.952,0.96,0.937,0.936,0.938,0.954,0.991,0.958,0.958,0.97
map4,0.96,0.948,0.94,0.944,1.0,0.944,0.94,0.931,0.933,0.956,0.953,0.935,0.959,0.958,0.947,0.963,0.936,0.958,0.972,0.954


# Run Simpson's

In [6]:
# Result containers: the two map labels handled by every job and the
# per-map Simpson values (one value per map)
simp = []
simp_maps1 = []
simp_maps2 = []

with Pool(16) as pool:
    # one job per entry of map_list; every job is (job index, metric id)
    iterable = [(i, 'simp') for i in range(single_its)]
    for n1, n2, sts, sts2 in tqdm.tqdm(pool.istarmap(run_comparisons, iterable),
                                       total=len(iterable)):
        simp_maps1.append(n1)
        simp_maps2.append(n2)
        simp.append(sts)
        # Compare the maps of THIS job only. The previous check compared the
        # two accumulated label lists, which stays true for every later job
        # once a single pair differed, so a map paired with itself would have
        # been counted twice.
        # NOTE(review): presumably n1 == n2 marks a map paired with itself -
        # confirm against run_comparisons in function.py.
        if n1 != n2:
            simp.append(sts2)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:19<00:00,  1.99s/it]


### Simpson's Dataframe

In [7]:
# Distance matrix between the per-map Simpson values (also saved to csv_dir)
df_simp = single_df(maps=map_set, stats=simp, metric='simp')
df_simp.head(n=5)

Unnamed: 0,map0,map1,map2,map3,map4,map5,map6,map7,map8,map9,map10,map11,map12,map13,map14,map15,map16,map17,map18,map19
map0,0.0,0.00518,0.004092,0.004491,0.001614,0.004490696,0.006787,0.006762,0.006689,0.001163,0.003904,0.006722,0.000464,0.000319,0.00032,0.001211,0.004202,0.003717,0.004681,0.007769
map1,0.00518,0.0,0.001087,0.000689,0.003566,0.0006889872,0.001607,0.001583,0.001509,0.004017,0.001275,0.001542,0.005644,0.004861,0.00486,0.006391,0.000978,0.001463,0.000499,0.00259
map2,0.004092,0.001087,0.0,0.000398,0.002478,0.0003984811,0.002695,0.00267,0.002597,0.002929,0.000188,0.00263,0.004557,0.003774,0.003772,0.005303,0.000109,0.000375,0.000588,0.003677
map3,0.004491,0.000689,0.000398,0.0,0.002877,1.724726e-07,0.002296,0.002272,0.002198,0.003327,0.000586,0.002231,0.004955,0.004172,0.00417,0.005702,0.000289,0.000774,0.00019,0.003279
map4,0.001614,0.003566,0.002478,0.002877,0.0,0.002876878,0.005173,0.005149,0.005075,0.000451,0.002291,0.005108,0.002078,0.001295,0.001294,0.002825,0.002588,0.002103,0.003067,0.006156


# Run Shannon's

In [None]:
# Result containers: the two map labels handled by every job and the
# per-map Shannon values (one value per map)
shan = []
shan_maps1 = []
shan_maps2 = []

with Pool(16) as pool:
    # one job per entry of map_list; every job is (job index, metric id)
    iterable = [(i, 'shan') for i in range(single_its)]
    for n1, n2, sts, sts2 in tqdm.tqdm(pool.istarmap(run_comparisons, iterable),
                                       total=len(iterable)):
        shan_maps1.append(n1)
        shan_maps2.append(n2)
        shan.append(sts)
        # Compare the maps of THIS job only. The previous check compared the
        # two accumulated label lists, which stays true for every later job
        # once a single pair differed, so a map paired with itself would have
        # been counted twice.
        # NOTE(review): presumably n1 == n2 marks a map paired with itself -
        # confirm against run_comparisons in function.py.
        if n1 != n2:
            shan.append(sts2)

### Shannon's DF

In [None]:
# Distance matrix between the per-map Shannon values (also saved to csv_dir)
df_shan = single_df(maps=map_set, stats=shan, metric='shannon')
df_shan.head(n=5)