### Notebook to visualize HAC clustering results from larger holdout data

Plot the HAC clustering as a dendrogram, following the scikit-learn example: [Plot Hierarchical Clustering Dendrogram](https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html#sphx-glr-auto-examples-cluster-plot-agglomerative-dendrogram-py)

In [2]:
%matplotlib inline
import os
import gc
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Visualization for dendrogram
from scipy.cluster.hierarchy import dendrogram

In [4]:
def pkl_loader(filename):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only pass files you trust.
    """
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

In [10]:
# Root directory of the HAC result files; file_getter() joins
# "<expname>/nclusters-<n>/HAC" onto it when looking for pickles.
basedatadir = './large_hac1'

In [17]:
def _load_hac_pickle(datadir, prefix, cexpname):
    """Load ``{prefix}-hac_{cexpname}.pkl`` from ``datadir`` via ``pkl_loader``.

    Raises FileNotFoundError (naming the pattern that was searched) when no
    file matches, instead of an opaque IndexError on an empty glob result.
    """
    pattern = os.path.join(datadir, f"{prefix}-hac_{cexpname}.pkl")
    # sorted() makes the choice deterministic should the pattern match
    # several files (possible only if cexpname contains glob wildcards).
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No HAC result file matches {pattern!r}")
    return pkl_loader(matches[0])


def file_getter(basedatadir=None, expname=None, nclusters=None, cexpname=None,score_only=True):
    """Load pickled HAC results for one experiment / cluster-count / run.

    Files are looked up in ``<basedatadir>/<expname>/nclusters-<nclusters>/HAC``.

    TODO in future specify clustering expname

    Parameters
    ----------
    basedatadir : str
        Root directory of the result tree.
    expname : str
        Experiment sub-directory name.
    nclusters : int
        Cluster count; used to build the ``nclusters-<n>`` folder name.
    cexpname : str
        Suffix of the result files (``*-hac_<cexpname>.pkl``).
    score_only : bool, default True
        If True, only the ``score-hac_*`` file is loaded.

    Returns
    -------
    scores
        Content of ``score-hac_<cexpname>.pkl`` when ``score_only`` is True.
    (scores, omodel, rmodel)
        When ``score_only`` is False: the scores plus the contents of
        ``original-hac_<cexpname>.pkl`` and ``reprot-hac_<cexpname>.pkl``.

    Raises
    ------
    FileNotFoundError
        If a required result file does not exist.
    """
    datadir = os.path.join(basedatadir, f"{expname}/nclusters-{nclusters}/HAC")
    print("Begin loading result files")

    scores = _load_hac_pickle(datadir, "score", cexpname)
    if score_only:
        return scores

    omodel = _load_hac_pickle(datadir, "original", cexpname)
    rmodel = _load_hac_pickle(datadir, "reprot", cexpname)
    return scores, omodel, rmodel

---------------
#### RI

In [18]:
# RI, 2000 clusters: load only the score dict saved under cexpname 4583049.
scoreRI2000 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4583049",
    nclusters=2000,
    score_only=True,
)

Begin loading result files


In [25]:
# Display the score dict currently bound to scoreRI2000.
scoreRI2000

{'nclusters-2000': {'ami': -0.0030743676592053857,
  'cls': 0.7890443390587428,
  'hs': 0.682182417322676}}

In [27]:
# RI, 2000 clusters, cexpname 4585449. This re-binds scoreRI2000, replacing
# the scores loaded earlier for cexpname 4583049.
scoreRI2000 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4585449",
    nclusters=2000,
    score_only=True,
)

Begin loading result files


In [28]:
# Display the score dict currently bound to scoreRI2000.
scoreRI2000

{'nclusters-2000': {'ami': -0.003906495780173582,
  'cls': 0.7258206222790605,
  'hs': 0.6240351687680873}}

In [20]:
# RI, 100 clusters: load only the score dict saved under cexpname 4583334.
scoreRI100 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4583334",
    nclusters=100,
    score_only=True,
)

Begin loading result files


In [21]:
# Display the score dict bound to scoreRI100.
scoreRI100

{'nclusters-100': {'ami': 0.49775915133408816,
  'cls': 0.5250588276607263,
  'hs': 0.5837425612029344}}

In [29]:
# RI, 20 clusters: load the score dict for cexpname 4585692 and display it.
scoreRI20 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4585692",
    nclusters=20,
    score_only=True,
)
scoreRI20

Begin loading result files


{'nclusters-20': {'ami': 0.3727912117267518,
  'cls': 0.3694863967388016,
  'hs': 0.3875373646654667}}

In [30]:
# RI, 10 clusters: load the score dict for cexpname 4585694 and display it.
scoreRI10 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4585694",
    nclusters=10,
    score_only=True,
)
scoreRI10

Begin loading result files


{'nclusters-10': {'ami': 0.2749294249104568,
  'cls': 0.270921330463771,
  'hs': 0.2827657385164835}}

In [32]:
# RI sweep over cluster counts 2..256. Each pair is (cexpname, nclusters);
# the matching score dict is appended to scoresRI in that order.
scoresRI = []
for cexpname, nclusters in [
    (4586180, 2),
    (4586181, 4),
    (4586182, 8),
    (4586183, 16),
    (4586184, 32),
    (4586185, 64),
    (4586186, 128),
    (4586179, 256),
]:
    tmp_score = file_getter(
        basedatadir=basedatadir,
        expname='67011582',
        cexpname=str(cexpname),  # file names embed the id as text
        nclusters=nclusters,
        score_only=True,
    )
    scoresRI.append(tmp_score)

Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files


In [33]:
# Display the list of per-cluster-count score dicts bound to scoresRI.
scoresRI

[{'nclusters-2': {'ami': -0.00012688068273619858,
   'cls': 8.3908108888613345e-16,
   'hs': 1.1139569852771803e-15}},
 {'nclusters-4': {'ami': 0.1623027487483164,
   'cls': 0.1551559466782175,
   'hs': 0.1711232043322862}},
 {'nclusters-8': {'ami': 0.19606965608493074,
   'cls': 0.19580718330938102,
   'hs': 0.1990257661609034}},
 {'nclusters-16': {'ami': 0.36363988634507727,
   'cls': 0.3496807006622599,
   'hs': 0.38705005566309214}},
 {'nclusters-32': {'ami': 0.4564321286457516,
   'cls': 0.4437581757689382,
   'hs': 0.4941132735877916}},
 {'nclusters-64': {'ami': 0.47742376135150927,
   'cls': 0.4839587039323898,
   'hs': 0.5392947849802405}},
 {'nclusters-128': {'ami': 0.5072805550995583,
   'cls': 0.5456484135659448,
   'hs': 0.6083612546860472}},
 {'nclusters-256': {'ami': 0.5068012896885425,
   'cls': 0.5838944078925472,
   'hs': 0.6627520560607516}}]

--------------
#### RI after leaving out non-structured patches

In [54]:
# RI, 674 clusters, cexpname 4590898. With score_only=False file_getter also
# returns the objects stored in the "original-hac_*" and "reprot-hac_*" files,
# bound here to orimodel and rimodel.
scoreRI674, orimodel, rimodel = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4590898",
    nclusters=674,
    score_only=False,
)
scoreRI674

Begin loading result files




{'nclusters-674': {'ami': -0.004311490039035771,
  'cls': 0.7694366410700049,
  'hs': 0.7102996621677017}}

In [55]:
# Pull the labels_ attribute off both loaded models.
# NOTE(review): presumably per-sample cluster ids from the fitted HAC models — confirm.
orimodel_labels = orimodel.labels_
rimodel_labels = rimodel.labels_

In [56]:
# Peek at the first 100 entries of orimodel_labels.
orimodel_labels[:100]

array([618, 667, 626, 665, 673, 634, 597, 636, 661, 455, 500, 586, 642,
       655, 497, 575, 438, 644, 447, 580, 361, 593, 606, 559, 408, 382,
       525, 448, 540, 552, 424, 550, 553, 464, 545, 466, 378, 491, 565,
       533, 468, 592, 549, 418, 445, 627, 641, 602, 508, 643, 616, 435,
       663, 620, 567, 617, 566, 651, 442, 649, 428, 465, 622, 600, 390,
       437, 614, 625, 546, 434, 652, 539, 611, 409, 232, 325, 395, 555,
       571, 547, 345, 456, 463, 647, 274, 223, 591, 503, 427, 521, 355,
       623, 231, 467, 495, 543, 576, 487, 419, 431])

In [57]:
# Peek at the first 100 entries of rimodel_labels.
rimodel_labels[:100]

array([  6, 578, 393, 441, 176, 115,  52, 175, 303, 330, 393, 309, 176,
       330,  64, 243, 176, 115,  52, 175, 303, 114, 617, 243, 328, 578,
       393, 441, 328, 517,  70, 552, 176, 114, 617, 636, 327,  43, 127,
       494, 303, 114, 617, 243, 176, 115, 617, 175, 176, 115, 617, 636,
        46, 114,  64, 504,  20, 578, 333, 336,  14, 119,  70, 552,  14,
       114, 456, 326,  20, 115,  52, 175, 209, 142, 134,  10, 638, 458,
        61, 206, 477, 162, 195,  33, 120, 578, 123, 166, 328, 112, 123,
       441, 124, 288,  54, 464, 124, 458,  61, 206])

In [63]:
# Largest label value present in rimodel_labels.
np.max(rimodel_labels)

673

In [71]:
# Positions where rimodel_labels equals 578, integer-divided by 4.
# NOTE(review): the divisor is presumably the copy size (a later cell notes it
# was raised from 4 to 12), mapping each copy back to its source index — confirm.
np.where(rimodel_labels == 578)[0]//4

array([  0,   6,  14,  21,  33,  37,  44,  59,  65, 155, 178, 262, 263,
       281, 302, 319, 403, 483, 484, 505, 512, 528, 529, 536, 556, 597,
       607, 611, 612])

After debugging the selected index

In [73]:
# RI sweep over cluster counts 2..256 with a new set of cexpnames.
# Each pair is (cexpname, nclusters). Note that this re-binds scoresRI.
scoresRI = []
for cexpname, nclusters in [
    (4591360, 2),
    (4591361, 4),
    (4591362, 8),
    (4591363, 16),
    (4591364, 32),
    (4591365, 64),
    (4591366, 128),
    (4591359, 256),
]:
    tmp_score = file_getter(
        basedatadir=basedatadir,
        expname='67011582',
        cexpname=str(cexpname),  # file names embed the id as text
        nclusters=nclusters,
        score_only=True,
    )
    scoresRI.append(tmp_score)

Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files


In [74]:
# Display the list of per-cluster-count score dicts bound to scoresRI.
scoresRI

[{'nclusters-2': {'ami': -0.0003187614035202872, 'cls': 0.0, 'hs': 0.0}},
 {'nclusters-4': {'ami': -0.0012338901741523265,
   'cls': 1.6017132519074586e-16,
   'hs': 1.6726832409356116e-16}},
 {'nclusters-8': {'ami': 0.2371615569549225,
   'cls': 0.2285255201999385,
   'hs': 0.2546356478011065}},
 {'nclusters-16': {'ami': 0.2868797017755245,
   'cls': 0.28999011436304134,
   'hs': 0.30946702197881837}},
 {'nclusters-32': {'ami': 0.36934627648230267,
   'cls': 0.3961926336582112,
   'hs': 0.4173295135427449}},
 {'nclusters-64': {'ami': 0.392455738298038,
   'cls': 0.4791146875565044,
   'hs': 0.49484173270304715}},
 {'nclusters-128': {'ami': 0.37965359793293607,
   'cls': 0.5620717915066537,
   'hs': 0.5785309653574207}},
 {'nclusters-256': {'ami': 0.3249241553894448,
   'cls': 0.6398029584166206,
   'hs': 0.6525864333600652}}]

In [77]:
# Copy size increased from 4 to 12.
# RI, 300 clusters: load the score dict for cexpname 4595995 and display it.
scoreRI300 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4595995",
    nclusters=300,
    score_only=True,
)
scoreRI300

Begin loading result files


{'nclusters-300': {'ami': -0.016010367780507573,
  'nmi': 0.5224396879957052,
  'cls': 0.538280117350661,
  'hs': 0.507504907072814,
  'ars': -0.003850189810453388}}

In [79]:
# Copy size increased from 12 to 24.
# RI, 300 clusters, cexpname 4596305; re-binds scoreRI300 and displays it.
scoreRI300 = file_getter(
    basedatadir=basedatadir,
    expname='67011582',
    cexpname="4596305",
    nclusters=300,
    score_only=True,
)
scoreRI300

Begin loading result files


{'nclusters-300': {'ami': 0.08048845589254608,
  'nmi': 0.4691174533448532,
  'cls': 0.47970336474892916,
  'hs': 0.4589886660334527,
  'ars': 0.01260017657611253}}

In [83]:
# RI sweep over cluster counts 2..128 with another set of cexpnames.
# Each pair is (cexpname, nclusters). Note that this re-binds scoresRI.
scoresRI = []
for cexpname, nclusters in [
    (4596433, 2),
    (4596434, 4),
    (4596479, 8),
    (4596481, 16),
    (4596507, 32),
    (4596513, 64),
    (4596432, 128),
]:
    tmp_score = file_getter(
        basedatadir=basedatadir,
        expname='67011582',
        cexpname=str(cexpname),  # file names embed the id as text
        nclusters=nclusters,
        score_only=True,
    )
    scoresRI.append(tmp_score)

Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files


In [84]:
# Display the list of per-cluster-count score dicts bound to scoresRI.
scoresRI

[{'nclusters-2': {'ami': -0.00010393628141146943,
   'nmi': 3.438735347612704e-08,
   'cls': 3.383067363938149e-08,
   'hs': 3.4962660018309707e-08,
   'ars': -0.00010748196611937699}},
 {'nclusters-4': {'ami': -8.061710320814813e-05,
   'nmi': 0.00040254349569967195,
   'cls': 0.0003812375531387048,
   'hs': 0.0004263718240421079,
   'ars': -0.0002964264207650843}},
 {'nclusters-8': {'ami': 0.09040616516847451,
   'nmi': 0.09205709610453243,
   'cls': 0.08515839122957922,
   'hs': 0.10017206266333763,
   'ars': 0.05496634724730505}},
 {'nclusters-16': {'ami': 0.17643446182834527,
   'nmi': 0.18150740921904535,
   'cls': 0.17373826822137634,
   'hs': 0.19000390900748876,
   'ars': 0.07659856549735698}},
 {'nclusters-32': {'ami': 0.2051621427401649,
   'nmi': 0.2227250937085756,
   'cls': 0.21251787666856842,
   'hs': 0.233962283958033,
   'ars': 0.06302294169651051}},
 {'nclusters-64': {'ami': 0.22595606407249735,
   'nmi': 0.2839857444953736,
   'cls': 0.26925407510923055,
   'hs': 0.

-------------
#### NRI

In [22]:
# NRI, 2000 clusters: load only the score dict saved under cexpname 4583443.
scoreNRI2000 = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4583443",
    nclusters=2000,
    score_only=True,
)

Begin loading result files


In [24]:
# Display the score dict bound to scoreNRI2000.
scoreNRI2000

{'nclusters-2000': {'ami': 0.3396396788708435,
  'cls': 0.8894683429034593,
  'hs': 0.750603589555923}}

In [23]:
# NRI, 100 clusters: load only the score dict saved under cexpname 4583475.
scoreNRI100 = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4583475",
    nclusters=100,
    score_only=True,
)

Begin loading result files


In [26]:
# Display the score dict bound to scoreNRI100.
scoreNRI100

{'nclusters-100': {'ami': 0.6400035633954669,
  'cls': 0.6671413142366469,
  'hs': 0.6935826582810746}}

In [34]:
# copy_size 4
# NRI sweep over cluster counts 2..256. Each pair is (cexpname, nclusters);
# the matching score dict is appended to scoresNRI in that order.
scoresNRI = []
for cexpname, nclusters in [
    (4586634, 2),
    (4586635, 4),
    (4586636, 8),
    (4586637, 16),
    (4586638, 32),
    (4586639, 64),
    (4586640, 128),
    (4586630, 256),
]:
    tmp_score = file_getter(
        basedatadir=basedatadir,
        expname='m2_02_global_2000_2018_band28_29_31',
        cexpname=str(cexpname),  # file names embed the id as text
        nclusters=nclusters,
        score_only=True,
    )
    scoresNRI.append(tmp_score)

Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files


In [35]:
# Display the list of per-cluster-count score dicts bound to scoresNRI.
scoresNRI

[{'nclusters-2': {'ami': 0.4128594447505487,
   'cls': 0.47886543767205425,
   'hs': 0.3633319633431876}},
 {'nclusters-4': {'ami': 0.7667494909799751,
   'cls': 0.7476850590746659,
   'hs': 0.7880485465401278}},
 {'nclusters-8': {'ami': 0.6477535971893129,
   'cls': 0.6367982398847724,
   'hs': 0.6650142485913162}},
 {'nclusters-16': {'ami': 0.6606824208484383,
   'cls': 0.6583685809797838,
   'hs': 0.683374325395819}},
 {'nclusters-32': {'ami': 0.6727681693574846,
   'cls': 0.6863959105393157,
   'hs': 0.7180655271574099}},
 {'nclusters-64': {'ami': 0.7329132118741402,
   'cls': 0.7680096277501182,
   'hs': 0.7993717226610066}},
 {'nclusters-128': {'ami': 0.7253333110071576,
   'cls': 0.8247853408445397,
   'hs': 0.8070757610746954}},
 {'nclusters-256': {'ami': 0.5981808135158303,
   'cls': 0.881317781635395,
   'hs': 0.7321460932830813}}]

In [36]:
# NRI, 10 clusters: load only the score dict saved under cexpname 4585656.
scoreNRI10 = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4585656",
    nclusters=10,
    score_only=True,
)

Begin loading result files


In [37]:
# Display the score dict bound to scoreNRI10.
scoreNRI10

{'nclusters-10': {'ami': 0.7572164790708689,
  'cls': 0.7280917000833951,
  'hs': 0.795763362793223}}

--------------
#### NRI after leaving out non-structured patches

In [72]:
# NRI, 674 clusters, cexpname 4591099. With score_only=False file_getter also
# returns the objects stored in the "original-hac_*" and "reprot-hac_*" files;
# this re-binds orimodel and rimodel.
scoreNRI674, orimodel, rimodel = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4591099",
    nclusters=674,
    score_only=False,
)
scoreNRI674

Begin loading result files




{'nclusters-674': {'ami': 0.13615622130444227,
  'cls': 0.80655517579,
  'hs': 0.7387182640492443}}

After debugging 

In [75]:
# NRI sweep over cluster counts 2..256 with a new set of cexpnames.
# Each pair is (cexpname, nclusters). Note that this re-binds scoresNRI.
scoresNRI = []
for cexpname, nclusters in [
    (4591373, 2),
    (4591429, 4),
    (4591449, 8),
    (4591450, 16),
    (4591451, 32),
    (4591452, 64),
    (4591587, 128),
    (4591372, 256),
]:
    tmp_score = file_getter(
        basedatadir=basedatadir,
        expname='m2_02_global_2000_2018_band28_29_31',
        cexpname=str(cexpname),  # file names embed the id as text
        nclusters=nclusters,
        score_only=True,
    )
    scoresNRI.append(tmp_score)

Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files
Begin loading result files


In [76]:
# Display the list of per-cluster-count score dicts bound to scoresNRI.
scoresNRI

[{'nclusters-2': {'ami': 0.6345534910991922,
   'cls': 0.6243102293391076,
   'hs': 0.6453455036011481}},
 {'nclusters-4': {'ami': 0.49519415777385545,
   'cls': 0.50291832107349,
   'hs': 0.48897604689229973}},
 {'nclusters-8': {'ami': 0.4234319868539766,
   'cls': 0.4334429667685152,
   'hs': 0.41908500382967107}},
 {'nclusters-16': {'ami': 0.4174203241392342,
   'cls': 0.4303661031763206,
   'hs': 0.4237860791018749}},
 {'nclusters-32': {'ami': 0.4136720532550572,
   'cls': 0.44784490276575956,
   'hs': 0.4495099159397826}},
 {'nclusters-64': {'ami': 0.4235108582888609,
   'cls': 0.514021114901282,
   'hs': 0.5251646761032006}},
 {'nclusters-128': {'ami': 0.41570301231502427,
   'cls': 0.5873827361268931,
   'hs': 0.6115586157904154}},
 {'nclusters-256': {'ami': 0.3806636412411501,
   'cls': 0.6776555603769518,
   'hs': 0.6829078622547784}}]

In [78]:
# copy_size = 12
# NRI, 300 clusters, cexpname 4596123. score_only=False also returns the
# "original-hac_*" and "reprot-hac_*" objects, re-binding orimodel and rimodel.
scoreNRI300, orimodel, rimodel = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4596123",
    nclusters=300,
    score_only=False,
)
scoreNRI300

Begin loading result files


{'nclusters-300': {'ami': 0.19720699847391798,
  'nmi': 0.6107663706735132,
  'cls': 0.6371547883788087,
  'hs': 0.5864768323858783,
  'ars': 0.05756680248680777}}

In [80]:
# copy_size = 12 according to the original note.
# NOTE(review): the same note sits on the previous NRI run (cexpname 4596123),
# and the matching RI rerun (cexpname 4596305) is labelled "from 12 to 24",
# so this run may really be copy_size = 24 — confirm.
# NRI, 300 clusters, cexpname 4596308; re-binds scoreNRI300, orimodel, rimodel.
scoreNRI300, orimodel, rimodel = file_getter(
    basedatadir=basedatadir,
    expname='m2_02_global_2000_2018_band28_29_31',
    cexpname="4596308",
    nclusters=300,
    score_only=False,
)
scoreNRI300

Begin loading result files


{'nclusters-300': {'ami': 0.24760808481968782,
  'nmi': 0.5609934733133397,
  'cls': 0.5766054839125432,
  'hs': 0.5462045889378914,
  'ars': 0.06051352551642502}}