# Data pre-processing

## Prepare data for context change prediction

In [None]:
# Import Module
import os
import pandas as pd
  
# Change the directory
path = "/content/drive/MyDrive/IR Disentanglement Project/Data/test"
os.chdir(path)

# iterate through all file
annotations_files_list,ascii_files_list = [],[]
for filename in os.listdir():
    if filename.endswith("annotation.txt"):
        annotations_files_list.append(filename)
    if filename.endswith("ascii.txt"):
        ascii_files_list.append(filename)

annotations_files_list.sort()
ascii_files_list.sort()

dataset_as_list = []
print("no of files: ", len(annotations_files_list), len(ascii_files_list))
for i in range(len(annotations_files_list)):
  annotation_file = open(f"{path}/{annotations_files_list[i]}","r")
  ascii_file = open(f"{path}/{ascii_files_list[i]}","r")
  
  ascii_messages = ascii_file.readlines()
  annotations = set([])
  for text in annotation_file.readlines():
    l = text.strip().split(" ")
    annotations.add(l[0].strip() + "-" + l[1].strip())
    annotations.add(l[1].strip() + "-" + l[0].strip())

  if(ascii_messages[999][0] == "["):
    ascii_messages[999] = ascii_messages[999][8:].strip()
  elif(ascii_messages[999][0] == "="):
    ascii_messages[999] = ascii_messages[999][4:].strip()
  else:
    ascii_messages[999] = ascii_messages[999].strip()

  for j in range(1000, len(ascii_messages)):

    if(ascii_messages[j][0] == "["):
      ascii_messages[j] = ascii_messages[j][8:].strip()
    elif(ascii_messages[j][0] == "="):
      ascii_messages[j] = ascii_messages[j][4:].strip()
    else:
      ascii_messages[j] = ascii_messages[j].strip()

    dataset_as_list.append([ascii_messages[j-1], ascii_messages[j], 1 if str(j-1)+"-"+str(j) in annotations else 0])
    if str(j)+"-"+str(j) in annotations:
      dataset_as_list.append([ascii_messages[j], ascii_messages[j], 1])
  
  annotation_file.close()
  ascii_file.close()

df = pd.DataFrame(dataset_as_list, columns=['line1','line2','label'])
df.to_csv('/content/drive/MyDrive/IR Disentanglement Project/Data/test_dataset.csv')
print("final dataset: ", df.shape)

## Prepare data for cluster/topic prediction

In [None]:
# Import Module
import os
import pandas as pd
  
# Change the directory
path = "/content/drive/MyDrive/IR Disentanglement Project/Data/dev"
os.chdir(path)

# iterate through all file
annotations_files_list,ascii_files_list = [],[]
for filename in os.listdir():
    if filename.endswith("annotation.txt"):
        annotations_files_list.append(filename)
    if filename.endswith("ascii.txt"):
        ascii_files_list.append(filename)


annotations_files_list.sort()
ascii_files_list.sort()
print("\nno of files: ", len(annotations_files_list), len(ascii_files_list))

# Iterate through all the files
for i in range(len(annotations_files_list)):

  dataset_as_list = []
  annotation_file = open(f"{path}/{annotations_files_list[i]}","r")
  ascii_file = open(f"{path}/{ascii_files_list[i]}","r")
  ascii_messages = ascii_file.readlines()
  cluster_details = dict({})
  cluster_number = 0

  # Format individual chat text
  for j in range(999, len(ascii_messages)):
    if(ascii_messages[j][0] == "["):
      ascii_messages[j] = ascii_messages[j][8:].strip()
    elif(ascii_messages[j][0] == "="):
      ascii_messages[j] = ascii_messages[j][4:].strip()
    else:
      ascii_messages[j] = ascii_messages[j].strip()

  # Assign cluster numbers as per annotations
  for text in annotation_file.readlines():
    l = text.strip().split(" ")
    ln1, ln2 = int(l[0].strip()), int(l[1].strip())
    if(ln2 not in cluster_details):
      if(ln1 in cluster_details):
        cluster_details[ln2] = cluster_details[ln1]
        dataset_as_list.append([ascii_messages[ln2], cluster_details[ln2]])
      else:
        cluster_number += 1
        cluster_details[ln2] = cluster_number
        dataset_as_list.append([ascii_messages[ln2], cluster_details[ln2]])
  
  # Save the cluster data as CSV
  df = pd.DataFrame(dataset_as_list, columns=['line','cluster number'])
  df.to_csv(f"/content/drive/MyDrive/IR Disentanglement Project/Data/pre-processed/cluster prediction/dev/{annotations_files_list[i][:10]}_cluster_data.csv")
  print("\nFile name: {} and shape: {}".format(annotations_files_list[i], df.shape))
  print("Number of clusters: ", cluster_number)
  annotation_file.close()
  ascii_file.close()

# Imports and Data loading

In [1]:
import torch
import torchtext.vocab as vocab
import random
import math
import time
import argparse
import os
import shutil
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import BertTokenizer, BertModel, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data import TensorDataset
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

os.chdir("/home/cs21mtech12001/IR Disentanglement Project/Data/context change")

train_dataset = pd.read_csv("train_dataset.csv", index_col=[0])
train_dataset = train_dataset.sample(frac=1).reset_index(drop=True)

dev_dataset = pd.read_csv("dev_dataset.csv", index_col=[0])
dev_dataset = dev_dataset.sample(frac=1).reset_index(drop=True)

test_dataset = pd.read_csv("test_dataset.csv", index_col=[0])
test_dataset = test_dataset.sample(frac=1).reset_index(drop=True)

print(train_dataset.shape, dev_dataset.shape, test_dataset.shape)
train_dataset = train_dataset.iloc[:16000, :]

print(train_dataset.columns)
print(dev_dataset.columns)
print(test_dataset.columns)

print(train_dataset.head)
print(dev_dataset.head)
print(test_dataset.head)

(84217, 3) (2962, 3) (5922, 3)
Index(['line1', 'line2', 'label'], dtype='object')
Index(['line1', 'line2', 'label'], dtype='object')
Index(['line1', 'line2', 'label'], dtype='object')
<bound method NDFrame.head of                                                    line1                                              line2  label
0                      <alessandro2> http://xdccing.com/                  <alessandro2> http://xdccing.com/      1
1      <berlylabs> anyone know how to get yahoo games...                                    <hellppmme> hey      0
2      <robbiethe1st> Ziber: You want to take a scree...                         <Ziber> robbiethe1st: yes.      1
3      <ubotu> Sorry, I don't know anything about ubu...                      <todger> hi there everyone :)      0
4                                         <SMJ> 't be it  <Crash_O-D> Xfce I'm having trouble with every...      0
...                                                  ...                                        

# Check and set device

In [2]:
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)      
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Encode every pair of sentances with labels

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
LABEL_COLUMN = "label"

def prepare_data(df, max_len, batch_size):
    input_ids = []
    attention_masks = []

    for i in df.index:
        # Encode system and user utterance pair
        encoded_dict = tokenizer.encode_plus(
                            df['line1'][i].lower(), df['line2'][i].lower(),
                            add_special_tokens = True,
                            max_length = max_len,
                            padding='max_length', 
                            truncation=True,
                            return_attention_mask = True,
                            return_tensors = 'pt',
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df[LABEL_COLUMN])
    
    dataset = TensorDataset(input_ids, attention_masks, labels)
    dataloader = DataLoader(
            dataset,
            sampler = RandomSampler(dataset),
            batch_size = batch_size
        )
    return dataloader

# Define neural network model with BeRT

In [7]:
class Model(nn.Module):
    def __init__(self, num_labels):
        super(Model, self).__init__()
        self.encode = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
        self.drop_out = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, num_labels)

    def forward(self, input_ids, attention_masks):
        outputs = self.encode(input_ids, attention_masks)
        input1 = torch.mean(outputs[2][-2], dim=1)
        input1 = self.drop_out(input1)
        output1 = self.l1(input1)
        return output1

# Method for Evaluation

In [4]:
def evaluate_metrics(dataloader, model):
    total_loss = 0.0
    criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
    y_true = []
    y_pred = []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_attn_mask = batch[1].to(device)
            labels = batch[2].to(device)
             
            outputs = model(b_input_ids, b_attn_mask)
            loss = criterion(outputs, labels)
            total_loss = total_loss + loss.item()
            
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels).squeeze()
            y_true.extend(labels.cpu().numpy().tolist()) 
            y_pred.extend(predicted.cpu().numpy().tolist()) 
            
    avg_loss = total_loss/len(dataloader)
    print("MCC : {}".format(matthews_corrcoef(y_true, y_pred)))
    print("Classification Report")
    print(classification_report(y_true, y_pred))
    print("Confusion Matrix")
    print(confusion_matrix(y_true, y_pred))
    return avg_loss

# Prepare train, dev and test dataloders

In [5]:
num_labels = len(train_dataset[LABEL_COLUMN].unique())
print("Number of labels : {}".format(num_labels))

# Set class weights to handle imbalanced class ratios (if required)
class_weights = torch.ones(num_labels)
print("class weights : {}".format(class_weights))

MAX_LEN = 350
print("Max length : {}".format(MAX_LEN))

batch_size = 16
print("Batch size : {}".format(batch_size))

print("Loading Train data")
train_dataloader = prepare_data(train_dataset, MAX_LEN, batch_size)
print("Loading Test data")
test_dataloader = prepare_data(test_dataset, MAX_LEN, batch_size)
print("Loading Validation data")
valid_dataloader = prepare_data(dev_dataset, MAX_LEN, batch_size)

print("Size of Train loader : {}".format(len(train_dataloader)))
print("Size of Valid loader : {}".format(len(valid_dataloader)))
print("Size of Test loader : {}".format(len(test_dataloader)))

Number of labels : 2
class weights : tensor([1., 1.])
Max length : 350
Batch size : 16
Loading Train data
Loading Test data
Loading Validation data
Size of Train loader : 1000
Size of Valid loader : 186
Size of Test loader : 371


# Define loss, optimizer and scheduler

In [8]:
torch.cuda.empty_cache()

model = Model(num_labels)
model.to(device)

clip = 2.0
num_epoch = 5
best_valid_loss = 9999
best_test_loss = 9999
best_train_loss = 0
best_model = 0
model_copy = type(model)(num_labels)

criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

total_steps = len(train_dataloader) * num_epoch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.se

# Train the model

In [16]:
print("Starting training ...")

for epoch in range(num_epoch):
    model.train()
    print("Epoch {} --------------------------".format(epoch+1))
    running_loss = 0.0
    for i, batch in enumerate(train_dataloader):
        print("-----------batch : ", i)
        b_input_ids = batch[0].to(device)
        b_attn_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(b_input_ids, b_attn_mask)
        loss = criterion(outputs, b_labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        scheduler.step()

    print("Training Accuracy :-")
    train_loss = evaluate_metrics(train_dataloader, model)
    print("Validation Accuracy :-")
    valid_loss = evaluate_metrics(valid_dataloader, model)
    print("Test Accuracy :-")
    test_loss = evaluate_metrics(test_dataloader, model)
    print("Epoch {} : Train loss = {} : Valid loss = {} : Test loss = {}".format(epoch + 1, train_loss, valid_loss, test_loss))
    if(valid_loss < best_valid_loss):
        best_valid_loss = valid_loss
        best_test_loss = test_loss
        best_train_loss = train_loss
        best_model = epoch+1
        model_copy.load_state_dict(model.state_dict())
        print("Model {} copied".format(epoch+1))

print('Finished Training ...')

Starting training ...
Epoch 1 --------------------------
-----------batch :  0
-----------batch :  1
-----------batch :  2
-----------batch :  3
-----------batch :  4
-----------batch :  5
-----------batch :  6
-----------batch :  7
-----------batch :  8
-----------batch :  9
-----------batch :  10
-----------batch :  11
-----------batch :  12
-----------batch :  13
-----------batch :  14
-----------batch :  15
-----------batch :  16
-----------batch :  17
-----------batch :  18
-----------batch :  19
-----------batch :  20
-----------batch :  21
-----------batch :  22
-----------batch :  23
-----------batch :  24
-----------batch :  25
-----------batch :  26
-----------batch :  27
-----------batch :  28
-----------batch :  29
-----------batch :  30
-----------batch :  31
-----------batch :  32
-----------batch :  33
-----------batch :  34
-----------batch :  35
-----------batch :  36
-----------batch :  37
-----------batch :  38
-----------batch :  39
-----------batch :  40
----------

# Save the best model

In [24]:
PATH = os.path.join("Saved models" , 'topic_change_best_model_1.pt')
torch.save(model_copy.state_dict(), 'topic_change_best_model_1.pt')
model.to('cpu')
model_copy.to(device)
print("---Best model---")
print("Epoch {} : Train loss = {} : Validation Loss = {} : Test loss = {}".format(best_model, best_train_loss, best_valid_loss, best_test_loss))
print("Training Accuracy :-")
train_loss = evaluate_metrics(train_dataloader, model_copy)
print("Validation Accuracy :-")
valid_loss = evaluate_metrics(valid_dataloader, model_copy)
print("Test Accuracy :-")
test_loss = evaluate_metrics(test_dataloader, model_copy)
print("Verifying Epoch {} : Train loss = {} : Validation Loss = {} : Test loss = {}".format(best_model, train_loss, valid_loss, test_loss))
print("done")

---Best model---
Epoch 1 : Train loss = 0.12317277078388725 : Validation Loss = 0.2510647229619965 : Test loss = 0.2878855159029244
Training Accuracy :-
MCC : 0.8903642303119027
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.94      0.96     10350
           1       0.90      0.96      0.93      5650

    accuracy                           0.95     16000
   macro avg       0.94      0.95      0.94     16000
weighted avg       0.95      0.95      0.95     16000

Confusion Matrix
[[9751  599]
 [ 223 5427]]
Validation Accuracy :-
MCC : 0.7727525101371343
Classification Report
              precision    recall  f1-score   support

           0       0.95      0.88      0.91      1929
           1       0.81      0.91      0.85      1033

    accuracy                           0.89      2962
   macro avg       0.88      0.90      0.88      2962
weighted avg       0.90      0.89      0.89      2962

Confusion Matrix
[[1706  223]
 [

# Predict clusters 

## Method to predict clusters from set of messages

In [9]:

# Load the best saved model
model = Model(2)
model.to(device)
model.load_state_dict(torch.load("/home/cs21mtech12001/IR Disentanglement Project/Saved models/topic_change_best_model_1.pt", map_location=device))

def predict_clusters(messages):
    
    # Initialize cluster details with first message
    predicted_clusters_as_messages = {}
    predicted_clusters_as_numbers = {}
    cluster_number = 1
    predicted_clusters_as_messages[cluster_number] = [messages["line"][0]]
    predicted_clusters_as_numbers[cluster_number] = [0]
    
    # Predict cluster by checking similarity with last message of other clusters
    for i in range(1, len(messages), 1):
        matched_cluster = -1
        
        # Check with last message of all previous clusters
        for j in predicted_clusters_as_messages.keys():
            
            # Check with jth cluster if no relatively recent match is there
            if(matched_cluster == -1 or 
               predicted_clusters_as_numbers[matched_cluster][-1] < predicted_clusters_as_numbers[j][-1]):
                
                # Encode the pair of messages to determine similarity
                encoded_dict = tokenizer.encode_plus(
                                    messages['line'][i].lower(), predicted_clusters_as_messages[j][-1].lower(),
                                    add_special_tokens = True,
                                    max_length = MAX_LEN,
                                    padding='max_length', 
                                    truncation=True,
                                    return_attention_mask = True,
                                    return_tensors = 'pt',
                                   )
                input_ids = encoded_dict['input_ids']
                attention_masks = encoded_dict['attention_mask']
    
                # Predict the similarity with jth cluster
                model.eval()
                with torch.no_grad():
                    input_ids = input_ids.to(device)
                    attention_masks = attention_masks.to(device)
                    outputs = model(input_ids, attention_masks)
                    _, predicted = torch.max(outputs, 1)
            
                # Consider jth cluster is the most recent match
                if(int(predicted) == 1):
                    matched_cluster = j
        
        # If still cluster not found then either it's a self-link or start of a new conversation
        if(matched_cluster != -1):
            predicted_clusters_as_messages[matched_cluster].append(messages['line'][i])
            predicted_clusters_as_numbers[matched_cluster].append(i)
        else:
            cluster_number += 1
            predicted_clusters_as_messages[cluster_number] = [messages["line"][i]]
            predicted_clusters_as_numbers[cluster_number] = [i]
    
    # Return the cluster details 
    return predicted_clusters_as_messages, predicted_clusters_as_numbers

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Predict cluster/threads from test data

In [11]:
# Change the directory
path = "/home/cs21mtech12001/IR Disentanglement Project/Data/cluster prediction/test"
os.chdir(path)

# Iterate through all files to collect the names
for file_name in os.listdir():
    
    # Read ground truth cluster-data
    cluster_data = pd.read_csv(file_name, index_col=[0])
    print("\nFile name: {}, Shape: {}, Actual number of clusters: {}".format(file_name, cluster_data.shape, cluster_data["cluster number"].max()))
        
    # Predict the clusters
    predicted_clusters_as_messages, predicted_clusters_as_numbers = predict_clusters(cluster_data)
    print("Predicted number of clusters: {}".format(len(predicted_clusters_as_messages)))
    print("20th cluster messages: {}".format(predicted_clusters_as_messages[19]))
    print("20th cluster as numbers: {}".format(predicted_clusters_as_numbers[19]))
        


File name: 2015-03-18_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 85


KeyboardInterrupt: 

## Predict cluster/threads for user given messages

In [20]:
messages = ["<Gajesh> good morning everyone!",\
            "<Gajesh> Can anyone explain what happend in class today? as I couldn't attend that class",\
            "<Tamal> good morning Gajesh, sir covered rest of the evaluation matrices in today's class",\
            "<Gajesh> got it, what are the topics exactly Tamal?",\
            "<Vimal> Does anyone have today's slides? I can't find in classroom",\
            "<Aman> Today's slides are not yet uploaded in classroom",\
            "<Vimal> thanks Aman",\
            "<Tamal> @Gajesh topics were DGC, NDGC, RR etc.",\
            "<Gajesh> cool man, Thanks!"]

# Convert to pandas dataframe
df = pd.DataFrame(messages, columns=['line'])
print("\nNumber of messages: ", len(df))

# Predict the clusters
predicted_clusters_as_messages, predicted_clusters_as_numbers = predict_clusters(df)
print("\nPredicted number of clusters: {}".format(len(predicted_clusters_as_messages)))
print("\nPredicted clusters: {}".format(predicted_clusters_as_messages))


Number of messages:  9

Predicted number of clusters: 3

Predicted clusters: {1: ['<Gajesh> good morning everyone!', "<Gajesh> Can anyone explain what happend in class today? as I couldn't attend that class", "<Tamal> good morning Gajesh, sir covered rest of the evaluation matrices in today's class", '<Gajesh> got it, what are the topics exactly Tamal?', '<Tamal> @Gajesh topics were DGC, NDGC, RR etc.', '<Gajesh> cool man, Thanks!'], 2: ["<Vimal> Does anyone have today's slides? I can't find in classroom"], 3: ["<Aman> Today's slides are not yet uploaded in classroom", '<Vimal> thanks Aman']}


# Evaluation of clusters

## Find Variation of Information(VI)

### Method get the ground truth clusters

In [10]:
def get_clusters(cluster_data):
    cluster_details = {}
    for index, row in cluster_data.iterrows():
        if(row["cluster number"] in cluster_details):
            cluster_details[row["cluster number"]].append(index)
        else:
            cluster_details[row["cluster number"]] = [index]
    return cluster_details

### Get ground truth and predicted clusters for test data

In [11]:
# Change the directory
path = "/home/cs21mtech12001/IR Disentanglement Project/Data/cluster prediction/test"
os.chdir(path)

all_predicted_clusters_as_messages = {}
all_predicted_clusters_as_numbers = {}
all_ground_truth_clusters_as_numbers = {}

# Iterate through all files to collect the names
for file_name in os.listdir():
    
    # Read cluster data
    cluster_data = pd.read_csv(file_name, index_col=[0])
    print("\nFile name: {}, Shape: {}, Actual number of clusters: {}".format(file_name, cluster_data.shape, cluster_data["cluster number"].max()))
    
    # Get ground truth clusters
    ground_truth_cluster_details = get_clusters(cluster_data)
    all_ground_truth_clusters_as_numbers[file_name[:10]] = ground_truth_cluster_details
    
    # Get predicted clusters
    print("Predicting clusters ...")
    predicted_clusters_as_messages, predicted_clusters_as_numbers = predict_clusters(cluster_data)
    all_predicted_clusters_as_messages[file_name[:10]] = predicted_clusters_as_messages
    all_predicted_clusters_as_numbers[file_name[:10]] = predicted_clusters_as_numbers

# Save all results for future use
with open('/home/cs21mtech12001/IR Disentanglement Project/Results/all_predicted_clusters_as_messages_all_1.txt','w') as data: 
      data.write(str(all_predicted_clusters_as_messages))
with open('/home/cs21mtech12001/IR Disentanglement Project/Results/all_predicted_clusters_as_numbers_all_1.txt', 'w') as data: 
      data.write(str(all_predicted_clusters_as_numbers))
with open('/home/cs21mtech12001/IR Disentanglement Project/Results/all_ground_truth_clusters_as_numbers_all_1.txt','w') as data: 
      data.write(str(all_ground_truth_clusters_as_numbers))


File name: 2015-03-18_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 85
Predicting clusters ...

File name: 2013-09-01_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 65
Predicting clusters ...

File name: 2010-08-17_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 93
Predicting clusters ...

File name: 2005-07-06_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 155
Predicting clusters ...

File name: 2014-06-18_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 77
Predicting clusters ...

File name: 2016-06-08_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 105
Predicting clusters ...

File name: 2008-07-14_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 83
Predicting clusters ...

File name: 2007-01-11_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 183
Predicting clusters ...

File name: 2016-02-22_cluster_data.csv, Shape: (500, 2), Actual number of clusters: 54
Predicting cl

### Method to build contingency matrix

In [12]:
def clusters_to_contingency(gt_clusters, predicted_clusters):
    
    contingency_table = {}
    counts_predicted_clusters = {}
    counts_gt_clusters = {}

    # Update contingency table
    for file_name in gt_clusters:
        for p_cluster in predicted_clusters[file_name]:
            current = {}
            contingency_table[file_name + "_pc_" + str(p_cluster)] = current
            for gt_cluster in gt_clusters[file_name]:
                count = len(set(predicted_clusters[file_name][p_cluster]).intersection(set(gt_clusters[file_name][gt_cluster])))
                if count > 0:
                    current[file_name + "_gtc_" + str(gt_cluster)] = count
    
    # Update predicted clusters count(rows)
    for file_name in predicted_clusters:
        for p_cluster in predicted_clusters[file_name]:
            counts_predicted_clusters[file_name + "_pc_" + str(p_cluster)] = len(predicted_clusters[file_name][p_cluster])
    
    # Update ground truth clusters count(columns)
    for file_name in gt_clusters:
        for gt_cluster in gt_clusters[file_name]:
            counts_gt_clusters[file_name + "_gtc_" + str(gt_cluster)] = len(gt_clusters[file_name][gt_cluster])
        
    return contingency_table, counts_predicted_clusters, counts_gt_clusters

### Method to calculate VI

In [13]:
def calculate_variation_of_information(contingency, row_sums, col_sums):
    total = 0.0
    for row in row_sums:
        total += row_sums[row]

    H_UV = 0.0
    I_UV = 0.0
    for row in contingency:
        for col in contingency[row]:
            num = contingency[row][col]
            H_UV -= (num / total) * math.log(num / total, 2)
            I_UV += (num / total) * math.log(num * total / (row_sums[row] * col_sums[col]), 2)

    H_U = 0.0
    for row in row_sums:
        num = row_sums[row]
        H_U -= (num / total) * math.log(num / total, 2)
    H_V = 0.0
    for col in col_sums:
        num = col_sums[col]
        H_V -= (num / total) * math.log(num / total, 2)

    max_score = math.log(total, 2)
    VI = H_UV - I_UV

    scaled_VI = VI / max_score
    print("{:5.2f}   1 - Scaled VI".format(100 - 100 * scaled_VI))

### Build contingency table

In [14]:
contingency_table, counts_predicted_clusters, counts_gt_clusters = clusters_to_contingency(all_ground_truth_clusters_as_numbers, all_predicted_clusters_as_numbers)

### Calculate VI

In [15]:
#contingency_table = {'2015-03-18_pc_1': {'2015-03-18_gtc_1': 12}, '2015-03-18_pc_2': {'2015-03-18_gtc_1': 5}, '2015-03-18_pc_3': {'2015-03-18_gtc_2': 3, '2015-03-18_gtc_38': 2, '2015-03-18_gtc_39': 1, '2015-03-18_gtc_42': 15, '2015-03-18_gtc_43': 1, '2015-03-18_gtc_48': 7, '2015-03-18_gtc_69': 7}, '2015-03-18_pc_4': {'2015-03-18_gtc_1': 46}, '2015-03-18_pc_5': {'2015-03-18_gtc_1': 58}, '2015-03-18_pc_6': {'2015-03-18_gtc_3': 1}, '2015-03-18_pc_7': {'2015-03-18_gtc_4': 1, '2015-03-18_gtc_49': 1, '2015-03-18_gtc_50': 6}, '2015-03-18_pc_8': {'2015-03-18_gtc_5': 1, '2015-03-18_gtc_70': 1, '2015-03-18_gtc_72': 1, '2015-03-18_gtc_76': 1, '2015-03-18_gtc_77': 1}, '2015-03-18_pc_9': {'2015-03-18_gtc_1': 16, '2015-03-18_gtc_7': 3}, '2015-03-18_pc_10': {'2015-03-18_gtc_6': 1, '2015-03-18_gtc_12': 1, '2015-03-18_gtc_30': 1}, '2015-03-18_pc_11': {'2015-03-18_gtc_7': 4}, '2015-03-18_pc_12': {'2015-03-18_gtc_1': 4, '2015-03-18_gtc_7': 2, '2015-03-18_gtc_69': 8, '2015-03-18_gtc_83': 4}, '2015-03-18_pc_13': {'2015-03-18_gtc_8': 1}, '2015-03-18_pc_14': {'2015-03-18_gtc_9': 2}, '2015-03-18_pc_15': {'2015-03-18_gtc_10': 3}, '2015-03-18_pc_16': {'2015-03-18_gtc_10': 5, '2015-03-18_gtc_13': 9}, '2015-03-18_pc_17': {'2015-03-18_gtc_10': 1}, '2015-03-18_pc_18': {'2015-03-18_gtc_11': 6}, '2015-03-18_pc_19': {'2015-03-18_gtc_13': 6, '2015-03-18_gtc_33': 2, '2015-03-18_gtc_34': 2}, '2015-03-18_pc_20': {'2015-03-18_gtc_13': 14}, '2015-03-18_pc_21': {'2015-03-18_gtc_13': 3, '2015-03-18_gtc_58': 2, '2015-03-18_gtc_64': 1}, '2015-03-18_pc_22': {'2015-03-18_gtc_13': 2}, '2015-03-18_pc_23': {'2015-03-18_gtc_14': 1}, '2015-03-18_pc_24': {'2015-03-18_gtc_15': 1, '2015-03-18_gtc_36': 1, '2015-03-18_gtc_37': 1}, '2015-03-18_pc_25': {'2015-03-18_gtc_16': 1, '2015-03-18_gtc_17': 1, '2015-03-18_gtc_18': 4, '2015-03-18_gtc_19': 1, '2015-03-18_gtc_20': 1, '2015-03-18_gtc_21': 1, '2015-03-18_gtc_22': 1, '2015-03-18_gtc_23': 1, '2015-03-18_gtc_24': 1, '2015-03-18_gtc_25': 1, '2015-03-18_gtc_26': 1, '2015-03-18_gtc_27': 1, '2015-03-18_gtc_28': 1, '2015-03-18_gtc_29': 1}, '2015-03-18_pc_26': {'2015-03-18_gtc_18': 1}, '2015-03-18_pc_27': {'2015-03-18_gtc_31': 1, '2015-03-18_gtc_32': 1, '2015-03-18_gtc_42': 5, '2015-03-18_gtc_48': 3}, '2015-03-18_pc_28': {'2015-03-18_gtc_33': 1}, '2015-03-18_pc_29': {'2015-03-18_gtc_33': 3}, '2015-03-18_pc_30': {'2015-03-18_gtc_33': 4}, '2015-03-18_pc_31': {'2015-03-18_gtc_34': 1}, '2015-03-18_pc_32': {'2015-03-18_gtc_34': 1}, '2015-03-18_pc_33': {'2015-03-18_gtc_35': 3}, '2015-03-18_pc_34': {'2015-03-18_gtc_38': 17, '2015-03-18_gtc_39': 3}, '2015-03-18_pc_35': {'2015-03-18_gtc_40': 1}, '2015-03-18_pc_36': {'2015-03-18_gtc_38': 9, '2015-03-18_gtc_41': 5}, '2015-03-18_pc_37': {'2015-03-18_gtc_41': 9}, '2015-03-18_pc_38': {'2015-03-18_gtc_42': 1, '2015-03-18_gtc_43': 2}, '2015-03-18_pc_39': {'2015-03-18_gtc_44': 1, '2015-03-18_gtc_60': 1, '2015-03-18_gtc_65': 1, '2015-03-18_gtc_66': 1}, '2015-03-18_pc_40': {'2015-03-18_gtc_45': 18}, '2015-03-18_pc_41': {'2015-03-18_gtc_46': 2}, '2015-03-18_pc_42': {'2015-03-18_gtc_45': 3}, '2015-03-18_pc_43': {'2015-03-18_gtc_42': 1, '2015-03-18_gtc_47': 3, '2015-03-18_gtc_54': 1, '2015-03-18_gtc_55': 1, '2015-03-18_gtc_56': 4}, '2015-03-18_pc_44': {'2015-03-18_gtc_51': 1, '2015-03-18_gtc_52': 4, '2015-03-18_gtc_58': 2}, '2015-03-18_pc_45': {'2015-03-18_gtc_53': 1}, '2015-03-18_pc_46': {'2015-03-18_gtc_57': 1, '2015-03-18_gtc_58': 4, '2015-03-18_gtc_59': 1}, '2015-03-18_pc_47': {'2015-03-18_gtc_59': 3}, '2015-03-18_pc_48': {'2015-03-18_gtc_59': 1}, '2015-03-18_pc_49': {'2015-03-18_gtc_61': 1}, '2015-03-18_pc_50': {'2015-03-18_gtc_58': 4, '2015-03-18_gtc_62': 3, '2015-03-18_gtc_63': 1, '2015-03-18_gtc_85': 1}, '2015-03-18_pc_51': {'2015-03-18_gtc_64': 2}, '2015-03-18_pc_52': {'2015-03-18_gtc_58': 3, '2015-03-18_gtc_64': 5, '2015-03-18_gtc_84': 2}, '2015-03-18_pc_53': {'2015-03-18_gtc_67': 1}, '2015-03-18_pc_54': {'2015-03-18_gtc_68': 1}, '2015-03-18_pc_55': {'2015-03-18_gtc_69': 7, '2015-03-18_gtc_73': 8, '2015-03-18_gtc_78': 3, '2015-03-18_gtc_82': 1}, '2015-03-18_pc_56': {'2015-03-18_gtc_71': 2}, '2015-03-18_pc_57': {'2015-03-18_gtc_73': 8}, '2015-03-18_pc_58': {'2015-03-18_gtc_74': 1, '2015-03-18_gtc_78': 1}, '2015-03-18_pc_59': {'2015-03-18_gtc_73': 1}, '2015-03-18_pc_60': {'2015-03-18_gtc_75': 3}, '2015-03-18_pc_61': {'2015-03-18_gtc_79': 2, '2015-03-18_gtc_80': 6}, '2015-03-18_pc_62': {'2015-03-18_gtc_81': 2}, '2015-03-18_pc_63': {'2015-03-18_gtc_81': 4, '2015-03-18_gtc_83': 1}, '2015-03-18_pc_64': {'2015-03-18_gtc_83': 2}, '2015-03-18_pc_65': {'2015-03-18_gtc_83': 3}, '2013-09-01_pc_1': {'2013-09-01_gtc_1': 1, '2013-09-01_gtc_2': 7, '2013-09-01_gtc_3': 1}, '2013-09-01_pc_2': {'2013-09-01_gtc_2': 16, '2013-09-01_gtc_9': 1}, '2013-09-01_pc_3': {'2013-09-01_gtc_4': 1, '2013-09-01_gtc_35': 7}, '2013-09-01_pc_4': {'2013-09-01_gtc_5': 1, '2013-09-01_gtc_22': 1}, '2013-09-01_pc_5': {'2013-09-01_gtc_6': 1, '2013-09-01_gtc_9': 2, '2013-09-01_gtc_10': 3}, '2013-09-01_pc_6': {'2013-09-01_gtc_2': 7, '2013-09-01_gtc_7': 1, '2013-09-01_gtc_10': 11, '2013-09-01_gtc_14': 2, '2013-09-01_gtc_18': 1, '2013-09-01_gtc_27': 1, '2013-09-01_gtc_29': 1, '2013-09-01_gtc_31': 2, '2013-09-01_gtc_36': 8}, '2013-09-01_pc_7': {'2013-09-01_gtc_8': 2}, '2013-09-01_pc_8': {'2013-09-01_gtc_8': 1}, '2013-09-01_pc_9': {'2013-09-01_gtc_2': 6, '2013-09-01_gtc_10': 2, '2013-09-01_gtc_12': 1, '2013-09-01_gtc_14': 3, '2013-09-01_gtc_36': 14, '2013-09-01_gtc_38': 7}, '2013-09-01_pc_10': {'2013-09-01_gtc_10': 4, '2013-09-01_gtc_11': 4, '2013-09-01_gtc_14': 1}, '2013-09-01_pc_11': {'2013-09-01_gtc_13': 2}, '2013-09-01_pc_12': {'2013-09-01_gtc_10': 2, '2013-09-01_gtc_14': 8}, '2013-09-01_pc_13': {'2013-09-01_gtc_10': 17, '2013-09-01_gtc_36': 2}, '2013-09-01_pc_14': {'2013-09-01_gtc_15': 1}, '2013-09-01_pc_15': {'2013-09-01_gtc_10': 1, '2013-09-01_gtc_16': 3}, '2013-09-01_pc_16': {'2013-09-01_gtc_14': 5}, '2013-09-01_pc_17': {'2013-09-01_gtc_17': 3}, '2013-09-01_pc_18': {'2013-09-01_gtc_10': 2, '2013-09-01_gtc_49': 2, '2013-09-01_gtc_50': 3, '2013-09-01_gtc_51': 1, '2013-09-01_gtc_55': 5, '2013-09-01_gtc_56': 2}, '2013-09-01_pc_19': {'2013-09-01_gtc_18': 5}, '2013-09-01_pc_20': {'2013-09-01_gtc_19': 1}, '2013-09-01_pc_21': {'2013-09-01_gtc_18': 1, '2013-09-01_gtc_20': 6}, '2013-09-01_pc_22': {'2013-09-01_gtc_20': 1, '2013-09-01_gtc_21': 2}, '2013-09-01_pc_23': {'2013-09-01_gtc_18': 2, '2013-09-01_gtc_25': 3}, '2013-09-01_pc_24': {'2013-09-01_gtc_23': 1, '2013-09-01_gtc_50': 1}, '2013-09-01_pc_25': {'2013-09-01_gtc_24': 2}, '2013-09-01_pc_26': {'2013-09-01_gtc_25': 2}, '2013-09-01_pc_27': {'2013-09-01_gtc_26': 1}, '2013-09-01_pc_28': {'2013-09-01_gtc_25': 9, '2013-09-01_gtc_27': 1}, '2013-09-01_pc_29': {'2013-09-01_gtc_25': 2}, '2013-09-01_pc_30': {'2013-09-01_gtc_25': 11}, '2013-09-01_pc_31': {'2013-09-01_gtc_27': 5, '2013-09-01_gtc_29': 2, '2013-09-01_gtc_42': 6, '2013-09-01_gtc_56': 1, '2013-09-01_gtc_57': 3}, '2013-09-01_pc_32': {'2013-09-01_gtc_25': 3}, '2013-09-01_pc_33': {'2013-09-01_gtc_28': 1}, '2013-09-01_pc_34': {'2013-09-01_gtc_25': 1, '2013-09-01_gtc_27': 1, '2013-09-01_gtc_29': 6, '2013-09-01_gtc_31': 3}, '2013-09-01_pc_35': {'2013-09-01_gtc_29': 4, '2013-09-01_gtc_36': 20}, '2013-09-01_pc_36': {'2013-09-01_gtc_30': 1, '2013-09-01_gtc_32': 1, '2013-09-01_gtc_33': 1}, '2013-09-01_pc_37': {'2013-09-01_gtc_31': 1}, '2013-09-01_pc_38': {'2013-09-01_gtc_34': 1}, '2013-09-01_pc_39': {'2013-09-01_gtc_36': 3}, '2013-09-01_pc_40': {'2013-09-01_gtc_37': 3}, '2013-09-01_pc_41': {'2013-09-01_gtc_36': 6, '2013-09-01_gtc_38': 1, '2013-09-01_gtc_47': 1, '2013-09-01_gtc_57': 12}, '2013-09-01_pc_42': {'2013-09-01_gtc_39': 7, '2013-09-01_gtc_50': 1, '2013-09-01_gtc_52': 2}, '2013-09-01_pc_43': {'2013-09-01_gtc_40': 2}, '2013-09-01_pc_44': {'2013-09-01_gtc_41': 1}, '2013-09-01_pc_45': {'2013-09-01_gtc_42': 2, '2013-09-01_gtc_44': 6}, '2013-09-01_pc_46': {'2013-09-01_gtc_43': 3, '2013-09-01_gtc_47': 1, '2013-09-01_gtc_49': 3}, '2013-09-01_pc_47': {'2013-09-01_gtc_44': 11}, '2013-09-01_pc_48': {'2013-09-01_gtc_45': 1}, '2013-09-01_pc_49': {'2013-09-01_gtc_45': 1}, '2013-09-01_pc_50': {'2013-09-01_gtc_46': 1}, '2013-09-01_pc_51': {'2013-09-01_gtc_47': 2}, '2013-09-01_pc_52': {'2013-09-01_gtc_44': 4, '2013-09-01_gtc_50': 1}, '2013-09-01_pc_53': {'2013-09-01_gtc_39': 5}, '2013-09-01_pc_54': {'2013-09-01_gtc_47': 1}, '2013-09-01_pc_55': {'2013-09-01_gtc_48': 1, '2013-09-01_gtc_49': 4, '2013-09-01_gtc_65': 1}, '2013-09-01_pc_56': {'2013-09-01_gtc_51': 4}, '2013-09-01_pc_57': {'2013-09-01_gtc_51': 1, '2013-09-01_gtc_52': 1, '2013-09-01_gtc_53': 1}, '2013-09-01_pc_58': {'2013-09-01_gtc_54': 1, '2013-09-01_gtc_62': 1}, '2013-09-01_pc_59': {'2013-09-01_gtc_55': 2, '2013-09-01_gtc_57': 8}, '2013-09-01_pc_60': {'2013-09-01_gtc_56': 3, '2013-09-01_gtc_57': 6, '2013-09-01_gtc_58': 3}, '2013-09-01_pc_61': {'2013-09-01_gtc_57': 4, '2013-09-01_gtc_58': 3}, '2013-09-01_pc_62': {'2013-09-01_gtc_58': 1}, '2013-09-01_pc_63': {'2013-09-01_gtc_57': 1, '2013-09-01_gtc_58': 1}, '2013-09-01_pc_64': {'2013-09-01_gtc_57': 10, '2013-09-01_gtc_60': 5, '2013-09-01_gtc_61': 14, '2013-09-01_gtc_63': 3, '2013-09-01_gtc_64': 2}, '2013-09-01_pc_65': {'2013-09-01_gtc_59': 1}, '2013-09-01_pc_66': {'2013-09-01_gtc_57': 21}, '2013-09-01_pc_67': {'2013-09-01_gtc_60': 4, '2013-09-01_gtc_61': 10}, '2013-09-01_pc_68': {'2013-09-01_gtc_61': 8}, '2013-09-01_pc_69': {'2013-09-01_gtc_63': 2}, '2010-08-17_pc_1': {'2010-08-17_gtc_1': 19, '2010-08-17_gtc_24': 1}, '2010-08-17_pc_2': {'2010-08-17_gtc_2': 1, '2010-08-17_gtc_15': 10, '2010-08-17_gtc_20': 2, '2010-08-17_gtc_40': 3}, '2010-08-17_pc_3': {'2010-08-17_gtc_3': 1}, '2010-08-17_pc_4': {'2010-08-17_gtc_3': 17}, '2010-08-17_pc_5': {'2010-08-17_gtc_4': 1}, '2010-08-17_pc_6': {'2010-08-17_gtc_1': 1}, '2010-08-17_pc_7': {'2010-08-17_gtc_5': 1, '2010-08-17_gtc_42': 1}, '2010-08-17_pc_8': {'2010-08-17_gtc_6': 1, '2010-08-17_gtc_12': 1, '2010-08-17_gtc_85': 1}, '2010-08-17_pc_9': {'2010-08-17_gtc_1': 3, '2010-08-17_gtc_7': 7, '2010-08-17_gtc_16': 3}, '2010-08-17_pc_10': {'2010-08-17_gtc_8': 1}, '2010-08-17_pc_11': {'2010-08-17_gtc_1': 3, '2010-08-17_gtc_9': 1, '2010-08-17_gtc_11': 2}, '2010-08-17_pc_12': {'2010-08-17_gtc_10': 2, '2010-08-17_gtc_15': 5, '2010-08-17_gtc_24': 1, '2010-08-17_gtc_29': 1, '2010-08-17_gtc_38': 1, '2010-08-17_gtc_40': 6, '2010-08-17_gtc_52': 1, '2010-08-17_gtc_54': 11}, '2010-08-17_pc_13': {'2010-08-17_gtc_13': 4, '2010-08-17_gtc_14': 2}, '2010-08-17_pc_14': {'2010-08-17_gtc_14': 3, '2010-08-17_gtc_15': 16, '2010-08-17_gtc_21': 6, '2010-08-17_gtc_24': 1, '2010-08-17_gtc_40': 4}, '2010-08-17_pc_15': {'2010-08-17_gtc_1': 3, '2010-08-17_gtc_7': 3}, '2010-08-17_pc_16': {'2010-08-17_gtc_17': 6}, '2010-08-17_pc_17': {'2010-08-17_gtc_15': 2, '2010-08-17_gtc_18': 2, '2010-08-17_gtc_21': 4, '2010-08-17_gtc_33': 1}, '2010-08-17_pc_18': {'2010-08-17_gtc_19': 1}, '2010-08-17_pc_19': {'2010-08-17_gtc_22': 1}, '2010-08-17_pc_20': {'2010-08-17_gtc_15': 13, '2010-08-17_gtc_20': 4, '2010-08-17_gtc_71': 1}, '2010-08-17_pc_21': {'2010-08-17_gtc_15': 45, '2010-08-17_gtc_21': 6, '2010-08-17_gtc_23': 3, '2010-08-17_gtc_53': 2}, '2010-08-17_pc_22': {'2010-08-17_gtc_24': 16}, '2010-08-17_pc_23': {'2010-08-17_gtc_25': 1}, '2010-08-17_pc_24': {'2010-08-17_gtc_23': 2}, '2010-08-17_pc_25': {'2010-08-17_gtc_26': 1}, '2010-08-17_pc_26': {'2010-08-17_gtc_27': 1}, '2010-08-17_pc_27': {'2010-08-17_gtc_15': 4, '2010-08-17_gtc_21': 2, '2010-08-17_gtc_31': 1}, '2010-08-17_pc_28': {'2010-08-17_gtc_28': 1}, '2010-08-17_pc_29': {'2010-08-17_gtc_30': 1}, '2010-08-17_pc_30': {'2010-08-17_gtc_32': 1, '2010-08-17_gtc_44': 1, '2010-08-17_gtc_77': 1, '2010-08-17_gtc_80': 1, '2010-08-17_gtc_84': 1, '2010-08-17_gtc_88': 1, '2010-08-17_gtc_89': 1, '2010-08-17_gtc_90': 1}, '2010-08-17_pc_31': {'2010-08-17_gtc_21': 1, '2010-08-17_gtc_24': 1, '2010-08-17_gtc_34': 2, '2010-08-17_gtc_38': 2}, '2010-08-17_pc_32': {'2010-08-17_gtc_34': 1}, '2010-08-17_pc_33': {'2010-08-17_gtc_35': 1}, '2010-08-17_pc_34': {'2010-08-17_gtc_36': 3, '2010-08-17_gtc_61': 3, '2010-08-17_gtc_64': 2}, '2010-08-17_pc_35': {'2010-08-17_gtc_24': 4, '2010-08-17_gtc_37': 3}, '2010-08-17_pc_36': {'2010-08-17_gtc_37': 1}, '2010-08-17_pc_37': {'2010-08-17_gtc_38': 2, '2010-08-17_gtc_46': 2}, '2010-08-17_pc_38': {'2010-08-17_gtc_39': 1}, '2010-08-17_pc_39': {'2010-08-17_gtc_40': 1}, '2010-08-17_pc_40': {'2010-08-17_gtc_37': 5, '2010-08-17_gtc_40': 1}, '2010-08-17_pc_41': {'2010-08-17_gtc_41': 1}, '2010-08-17_pc_42': {'2010-08-17_gtc_37': 6}, '2010-08-17_pc_43': {'2010-08-17_gtc_43': 1}, '2010-08-17_pc_44': {'2010-08-17_gtc_45': 1, '2010-08-17_gtc_59': 1}, '2010-08-17_pc_45': {'2010-08-17_gtc_15': 1, '2010-08-17_gtc_62': 6}, '2010-08-17_pc_46': {'2010-08-17_gtc_47': 9}, '2010-08-17_pc_47': {'2010-08-17_gtc_48': 3}, '2010-08-17_pc_48': {'2010-08-17_gtc_49': 1}, '2010-08-17_pc_49': {'2010-08-17_gtc_50': 7}, '2010-08-17_pc_50': {'2010-08-17_gtc_51': 5}, '2010-08-17_pc_51': {'2010-08-17_gtc_53': 1}, '2010-08-17_pc_52': {'2010-08-17_gtc_55': 1}, '2010-08-17_pc_53': {'2010-08-17_gtc_15': 6, '2010-08-17_gtc_56': 1}, '2010-08-17_pc_54': {'2010-08-17_gtc_57': 4}, '2010-08-17_pc_55': {'2010-08-17_gtc_58': 4, '2010-08-17_gtc_92': 1}, '2010-08-17_pc_56': {'2010-08-17_gtc_51': 7}, '2010-08-17_pc_57': {'2010-08-17_gtc_58': 7}, '2010-08-17_pc_58': {'2010-08-17_gtc_60': 1}, '2010-08-17_pc_59': {'2010-08-17_gtc_58': 3, '2010-08-17_gtc_71': 1}, '2010-08-17_pc_60': {'2010-08-17_gtc_62': 1}, '2010-08-17_pc_61': {'2010-08-17_gtc_63': 3}, '2010-08-17_pc_62': {'2010-08-17_gtc_51': 1}, '2010-08-17_pc_63': {'2010-08-17_gtc_58': 3}, '2010-08-17_pc_64': {'2010-08-17_gtc_65': 5, '2010-08-17_gtc_75': 5}, '2010-08-17_pc_65': {'2010-08-17_gtc_58': 3}, '2010-08-17_pc_66': {'2010-08-17_gtc_66': 1}, '2010-08-17_pc_67': {'2010-08-17_gtc_67': 1}, '2010-08-17_pc_68': {'2010-08-17_gtc_68': 2}, '2010-08-17_pc_69': {'2010-08-17_gtc_69': 1, '2010-08-17_gtc_72': 2}, '2010-08-17_pc_70': {'2010-08-17_gtc_70': 4, '2010-08-17_gtc_78': 1}, '2010-08-17_pc_71': {'2010-08-17_gtc_73': 2}, '2010-08-17_pc_72': {'2010-08-17_gtc_73': 7}, '2010-08-17_pc_73': {'2010-08-17_gtc_74': 1, '2010-08-17_gtc_75': 2}, '2010-08-17_pc_74': {'2010-08-17_gtc_75': 8, '2010-08-17_gtc_92': 1}, '2010-08-17_pc_75': {'2010-08-17_gtc_70': 3}, '2010-08-17_pc_76': {'2010-08-17_gtc_76': 9}, '2010-08-17_pc_77': {'2010-08-17_gtc_79': 4}, '2010-08-17_pc_78': {'2010-08-17_gtc_75': 5}, '2010-08-17_pc_79': {'2010-08-17_gtc_79': 1, '2010-08-17_gtc_81': 4, '2010-08-17_gtc_82': 1}, '2010-08-17_pc_80': {'2010-08-17_gtc_81': 6}, '2010-08-17_pc_81': {'2010-08-17_gtc_81': 3}, '2010-08-17_pc_82': {'2010-08-17_gtc_81': 1}, '2010-08-17_pc_83': {'2010-08-17_gtc_83': 1}, '2010-08-17_pc_84': {'2010-08-17_gtc_86': 1}, '2010-08-17_pc_85': {'2010-08-17_gtc_87': 8}, '2010-08-17_pc_86': {'2010-08-17_gtc_76': 2}, '2010-08-17_pc_87': {'2010-08-17_gtc_91': 1, '2010-08-17_gtc_93': 1}, '2005-07-06_pc_1': {'2005-07-06_gtc_1': 1}, '2005-07-06_pc_2': {'2005-07-06_gtc_2': 1}, '2005-07-06_pc_3': {'2005-07-06_gtc_1': 4, '2005-07-06_gtc_5': 6}, '2005-07-06_pc_4': {'2005-07-06_gtc_3': 1, '2005-07-06_gtc_26': 3}, '2005-07-06_pc_5': {'2005-07-06_gtc_4': 1}, '2005-07-06_pc_6': {'2005-07-06_gtc_5': 4, '2005-07-06_gtc_131': 3}, '2005-07-06_pc_7': {'2005-07-06_gtc_5': 24, '2005-07-06_gtc_10': 3, '2005-07-06_gtc_16': 6, '2005-07-06_gtc_38': 2, '2005-07-06_gtc_52': 2}, '2005-07-06_pc_8': {'2005-07-06_gtc_6': 1}, '2005-07-06_pc_9': {'2005-07-06_gtc_7': 1, '2005-07-06_gtc_70': 1}, '2005-07-06_pc_10': {'2005-07-06_gtc_8': 1}, '2005-07-06_pc_11': {'2005-07-06_gtc_9': 1}, '2005-07-06_pc_12': {'2005-07-06_gtc_10': 2, '2005-07-06_gtc_16': 6}, '2005-07-06_pc_13': {'2005-07-06_gtc_11': 1}, '2005-07-06_pc_14': {'2005-07-06_gtc_12': 1}, '2005-07-06_pc_15': {'2005-07-06_gtc_13': 1, '2005-07-06_gtc_18': 1}, '2005-07-06_pc_16': {'2005-07-06_gtc_14': 1}, '2005-07-06_pc_17': {'2005-07-06_gtc_15': 1}, '2005-07-06_pc_18': {'2005-07-06_gtc_16': 2}, '2005-07-06_pc_19': {'2005-07-06_gtc_17': 1}, '2005-07-06_pc_20': {'2005-07-06_gtc_19': 1}, '2005-07-06_pc_21': {'2005-07-06_gtc_20': 1}, '2005-07-06_pc_22': {'2005-07-06_gtc_16': 4}, '2005-07-06_pc_23': {'2005-07-06_gtc_21': 1}, '2005-07-06_pc_24': {'2005-07-06_gtc_22': 1}, '2005-07-06_pc_25': {'2005-07-06_gtc_23': 1, '2005-07-06_gtc_40': 1}, '2005-07-06_pc_26': {'2005-07-06_gtc_24': 1, '2005-07-06_gtc_46': 1, '2005-07-06_gtc_110': 1, '2005-07-06_gtc_111': 1}, '2005-07-06_pc_27': {'2005-07-06_gtc_5': 3, '2005-07-06_gtc_118': 6}, '2005-07-06_pc_28': {'2005-07-06_gtc_25': 1}, '2005-07-06_pc_29': {'2005-07-06_gtc_26': 7, '2005-07-06_gtc_37': 2, '2005-07-06_gtc_38': 14, '2005-07-06_gtc_52': 1, '2005-07-06_gtc_58': 13}, '2005-07-06_pc_30': {'2005-07-06_gtc_27': 1}, '2005-07-06_pc_31': {'2005-07-06_gtc_28': 1}, '2005-07-06_pc_32': {'2005-07-06_gtc_29': 1, '2005-07-06_gtc_34': 1, '2005-07-06_gtc_44': 1}, '2005-07-06_pc_33': {'2005-07-06_gtc_30': 1}, '2005-07-06_pc_34': {'2005-07-06_gtc_31': 1, '2005-07-06_gtc_32': 1}, '2005-07-06_pc_35': {'2005-07-06_gtc_33': 1}, '2005-07-06_pc_36': {'2005-07-06_gtc_35': 1}, '2005-07-06_pc_37': {'2005-07-06_gtc_36': 1, '2005-07-06_gtc_38': 2, '2005-07-06_gtc_58': 1}, '2005-07-06_pc_38': {'2005-07-06_gtc_37': 6, '2005-07-06_gtc_38': 4, '2005-07-06_gtc_58': 8}, '2005-07-06_pc_39': {'2005-07-06_gtc_39': 1}, '2005-07-06_pc_40': {'2005-07-06_gtc_37': 13, '2005-07-06_gtc_62': 14, '2005-07-06_gtc_74': 2, '2005-07-06_gtc_79': 2}, '2005-07-06_pc_41': {'2005-07-06_gtc_41': 2}, '2005-07-06_pc_42': {'2005-07-06_gtc_42': 1}, '2005-07-06_pc_43': {'2005-07-06_gtc_43': 1}, '2005-07-06_pc_44': {'2005-07-06_gtc_45': 1}, '2005-07-06_pc_45': {'2005-07-06_gtc_47': 1}, '2005-07-06_pc_46': {'2005-07-06_gtc_48': 1}, '2005-07-06_pc_47': {'2005-07-06_gtc_49': 1}, '2005-07-06_pc_48': {'2005-07-06_gtc_38': 1}, '2005-07-06_pc_49': {'2005-07-06_gtc_37': 5, '2005-07-06_gtc_50': 1, '2005-07-06_gtc_62': 26, '2005-07-06_gtc_94': 1}, '2005-07-06_pc_50': {'2005-07-06_gtc_51': 1}, '2005-07-06_pc_51': {'2005-07-06_gtc_53': 1}, '2005-07-06_pc_52': {'2005-07-06_gtc_54': 1}, '2005-07-06_pc_53': {'2005-07-06_gtc_55': 1}, '2005-07-06_pc_54': {'2005-07-06_gtc_56': 1}, '2005-07-06_pc_55': {'2005-07-06_gtc_57': 1}, '2005-07-06_pc_56': {'2005-07-06_gtc_58': 10, '2005-07-06_gtc_71': 3, '2005-07-06_gtc_80': 6, '2005-07-06_gtc_84': 1}, '2005-07-06_pc_57': {'2005-07-06_gtc_59': 1}, '2005-07-06_pc_58': {'2005-07-06_gtc_58': 2}, '2005-07-06_pc_59': {'2005-07-06_gtc_60': 1}, '2005-07-06_pc_60': {'2005-07-06_gtc_61': 1}, '2005-07-06_pc_61': {'2005-07-06_gtc_63': 1}, '2005-07-06_pc_62': {'2005-07-06_gtc_64': 6}, '2005-07-06_pc_63': {'2005-07-06_gtc_65': 1}, '2005-07-06_pc_64': {'2005-07-06_gtc_66': 1, '2005-07-06_gtc_113': 1}, '2005-07-06_pc_65': {'2005-07-06_gtc_67': 1}, '2005-07-06_pc_66': {'2005-07-06_gtc_68': 1, '2005-07-06_gtc_128': 1}, '2005-07-06_pc_67': {'2005-07-06_gtc_69': 10, '2005-07-06_gtc_94': 4}, '2005-07-06_pc_68': {'2005-07-06_gtc_72': 1}, '2005-07-06_pc_69': {'2005-07-06_gtc_73': 8}, '2005-07-06_pc_70': {'2005-07-06_gtc_74': 2}, '2005-07-06_pc_71': {'2005-07-06_gtc_75': 1}, '2005-07-06_pc_72': {'2005-07-06_gtc_76': 10}, '2005-07-06_pc_73': {'2005-07-06_gtc_62': 3, '2005-07-06_gtc_76': 4, '2005-07-06_gtc_94': 1}, '2005-07-06_pc_74': {'2005-07-06_gtc_77': 1}, '2005-07-06_pc_75': {'2005-07-06_gtc_78': 1}, '2005-07-06_pc_76': {'2005-07-06_gtc_79': 1}, '2005-07-06_pc_77': {'2005-07-06_gtc_81': 1, '2005-07-06_gtc_91': 1, '2005-07-06_gtc_93': 1}, '2005-07-06_pc_78': {'2005-07-06_gtc_82': 1}, '2005-07-06_pc_79': {'2005-07-06_gtc_83': 1, '2005-07-06_gtc_90': 1}, '2005-07-06_pc_80': {'2005-07-06_gtc_84': 1, '2005-07-06_gtc_85': 2}, '2005-07-06_pc_81': {'2005-07-06_gtc_86': 1}, '2005-07-06_pc_82': {'2005-07-06_gtc_80': 1}, '2005-07-06_pc_83': {'2005-07-06_gtc_87': 1}, '2005-07-06_pc_84': {'2005-07-06_gtc_88': 1}, '2005-07-06_pc_85': {'2005-07-06_gtc_89': 1}, '2005-07-06_pc_86': {'2005-07-06_gtc_92': 7}, '2005-07-06_pc_87': {'2005-07-06_gtc_95': 1}, '2005-07-06_pc_88': {'2005-07-06_gtc_96': 1}, '2005-07-06_pc_89': {'2005-07-06_gtc_97': 1}, '2005-07-06_pc_90': {'2005-07-06_gtc_98': 1, '2005-07-06_gtc_125': 1}, '2005-07-06_pc_91': {'2005-07-06_gtc_99': 1}, '2005-07-06_pc_92': {'2005-07-06_gtc_100': 1}, '2005-07-06_pc_93': {'2005-07-06_gtc_101': 1}, '2005-07-06_pc_94': {'2005-07-06_gtc_102': 1}, '2005-07-06_pc_95': {'2005-07-06_gtc_103': 1}, '2005-07-06_pc_96': {'2005-07-06_gtc_104': 1, '2005-07-06_gtc_107': 1}, '2005-07-06_pc_97': {'2005-07-06_gtc_105': 1, '2005-07-06_gtc_126': 1}, '2005-07-06_pc_98': {'2005-07-06_gtc_106': 1}, '2005-07-06_pc_99': {'2005-07-06_gtc_37': 7, '2005-07-06_gtc_139': 1, '2005-07-06_gtc_143': 1}, '2005-07-06_pc_100': {'2005-07-06_gtc_108': 1, '2005-07-06_gtc_112': 1}, '2005-07-06_pc_101': {'2005-07-06_gtc_109': 1}, '2005-07-06_pc_102': {'2005-07-06_gtc_114': 1}, '2005-07-06_pc_103': {'2005-07-06_gtc_115': 1, '2005-07-06_gtc_116': 1, '2005-07-06_gtc_118': 4}, '2005-07-06_pc_104': {'2005-07-06_gtc_117': 1}, '2005-07-06_pc_105': {'2005-07-06_gtc_119': 1}, '2005-07-06_pc_106': {'2005-07-06_gtc_118': 1}, '2005-07-06_pc_107': {'2005-07-06_gtc_120': 1}, '2005-07-06_pc_108': {'2005-07-06_gtc_118': 1, '2005-07-06_gtc_139': 1}, '2005-07-06_pc_109': {'2005-07-06_gtc_121': 2}, '2005-07-06_pc_110': {'2005-07-06_gtc_92': 5, '2005-07-06_gtc_118': 5, '2005-07-06_gtc_139': 1}, '2005-07-06_pc_111': {'2005-07-06_gtc_116': 2}, '2005-07-06_pc_112': {'2005-07-06_gtc_122': 1}, '2005-07-06_pc_113': {'2005-07-06_gtc_123': 1}, '2005-07-06_pc_114': {'2005-07-06_gtc_124': 1}, '2005-07-06_pc_115': {'2005-07-06_gtc_127': 1}, '2005-07-06_pc_116': {'2005-07-06_gtc_129': 1, '2005-07-06_gtc_131': 1}, '2005-07-06_pc_117': {'2005-07-06_gtc_130': 1}, '2005-07-06_pc_118': {'2005-07-06_gtc_132': 1}, '2005-07-06_pc_119': {'2005-07-06_gtc_131': 9}, '2005-07-06_pc_120': {'2005-07-06_gtc_133': 1}, '2005-07-06_pc_121': {'2005-07-06_gtc_134': 1, '2005-07-06_gtc_137': 1}, '2005-07-06_pc_122': {'2005-07-06_gtc_135': 1}, '2005-07-06_pc_123': {'2005-07-06_gtc_136': 1}, '2005-07-06_pc_124': {'2005-07-06_gtc_138': 1, '2005-07-06_gtc_148': 1}, '2005-07-06_pc_125': {'2005-07-06_gtc_139': 8, '2005-07-06_gtc_155': 1}, '2005-07-06_pc_126': {'2005-07-06_gtc_37': 1, '2005-07-06_gtc_139': 4, '2005-07-06_gtc_142': 7, '2005-07-06_gtc_150': 5}, '2005-07-06_pc_127': {'2005-07-06_gtc_140': 1}, '2005-07-06_pc_128': {'2005-07-06_gtc_141': 1}, '2005-07-06_pc_129': {'2005-07-06_gtc_142': 1}, '2005-07-06_pc_130': {'2005-07-06_gtc_142': 3}, '2005-07-06_pc_131': {'2005-07-06_gtc_144': 1}, '2005-07-06_pc_132': {'2005-07-06_gtc_139': 6, '2005-07-06_gtc_152': 5}, '2005-07-06_pc_133': {'2005-07-06_gtc_145': 1}, '2005-07-06_pc_134': {'2005-07-06_gtc_146': 1}, '2005-07-06_pc_135': {'2005-07-06_gtc_142': 3, '2005-07-06_gtc_150': 2}, '2005-07-06_pc_136': {'2005-07-06_gtc_147': 1}, '2005-07-06_pc_137': {'2005-07-06_gtc_149': 1}, '2005-07-06_pc_138': {'2005-07-06_gtc_151': 1}, '2005-07-06_pc_139': {'2005-07-06_gtc_152': 2}, '2005-07-06_pc_140': {'2005-07-06_gtc_150': 1, '2005-07-06_gtc_153': 1}, '2005-07-06_pc_141': {'2005-07-06_gtc_154': 1}, '2014-06-18_pc_1': {'2014-06-18_gtc_1': 1, '2014-06-18_gtc_2': 16}, '2014-06-18_pc_2': {'2014-06-18_gtc_3': 1, '2014-06-18_gtc_19': 1, '2014-06-18_gtc_34': 1, '2014-06-18_gtc_40': 1, '2014-06-18_gtc_56': 1, '2014-06-18_gtc_64': 1}, '2014-06-18_pc_3': {'2014-06-18_gtc_4': 2}, '2014-06-18_pc_4': {'2014-06-18_gtc_5': 5}, '2014-06-18_pc_5': {'2014-06-18_gtc_5': 3}, '2014-06-18_pc_6': {'2014-06-18_gtc_6': 3, '2014-06-18_gtc_8': 1, '2014-06-18_gtc_9': 2}, '2014-06-18_pc_7': {'2014-06-18_gtc_6': 1, '2014-06-18_gtc_9': 4, '2014-06-18_gtc_10': 2}, '2014-06-18_pc_8': {'2014-06-18_gtc_6': 1, '2014-06-18_gtc_10': 1, '2014-06-18_gtc_12': 5, '2014-06-18_gtc_49': 1, '2014-06-18_gtc_51': 7}, '2014-06-18_pc_9': {'2014-06-18_gtc_7': 1}, '2014-06-18_pc_10': {'2014-06-18_gtc_10': 7, '2014-06-18_gtc_12': 24}, '2014-06-18_pc_11': {'2014-06-18_gtc_10': 1}, '2014-06-18_pc_12': {'2014-06-18_gtc_11': 1}, '2014-06-18_pc_13': {'2014-06-18_gtc_2': 1, '2014-06-18_gtc_12': 3}, '2014-06-18_pc_14': {'2014-06-18_gtc_12': 20}, '2014-06-18_pc_15': {'2014-06-18_gtc_13': 1, '2014-06-18_gtc_22': 1, '2014-06-18_gtc_23': 1, '2014-06-18_gtc_24': 1}, '2014-06-18_pc_16': {'2014-06-18_gtc_14': 1}, '2014-06-18_pc_17': {'2014-06-18_gtc_15': 4}, '2014-06-18_pc_18': {'2014-06-18_gtc_15': 32, '2014-06-18_gtc_16': 1}, '2014-06-18_pc_19': {'2014-06-18_gtc_17': 1}, '2014-06-18_pc_20': {'2014-06-18_gtc_18': 2}, '2014-06-18_pc_21': {'2014-06-18_gtc_12': 4}, '2014-06-18_pc_22': {'2014-06-18_gtc_20': 2, '2014-06-18_gtc_30': 2, '2014-06-18_gtc_31': 3}, '2014-06-18_pc_23': {'2014-06-18_gtc_21': 1}, '2014-06-18_pc_24': {'2014-06-18_gtc_15': 1, '2014-06-18_gtc_25': 1}, '2014-06-18_pc_25': {'2014-06-18_gtc_15': 3, '2014-06-18_gtc_25': 3, '2014-06-18_gtc_51': 4}, '2014-06-18_pc_26': {'2014-06-18_gtc_25': 25}, '2014-06-18_pc_27': {'2014-06-18_gtc_26': 1, '2014-06-18_gtc_27': 1}, '2014-06-18_pc_28': {'2014-06-18_gtc_28': 3}, '2014-06-18_pc_29': {'2014-06-18_gtc_29': 7}, '2014-06-18_pc_30': {'2014-06-18_gtc_32': 1, '2014-06-18_gtc_41': 1, '2014-06-18_gtc_44': 2, '2014-06-18_gtc_53': 7, '2014-06-18_gtc_54': 10}, '2014-06-18_pc_31': {'2014-06-18_gtc_33': 1}, '2014-06-18_pc_32': {'2014-06-18_gtc_25': 2, '2014-06-18_gtc_35': 7}, '2014-06-18_pc_33': {'2014-06-18_gtc_25': 1}, '2014-06-18_pc_34': {'2014-06-18_gtc_36': 1, '2014-06-18_gtc_37': 1}, '2014-06-18_pc_35': {'2014-06-18_gtc_12': 4, '2014-06-18_gtc_25': 2, '2014-06-18_gtc_45': 1, '2014-06-18_gtc_53': 9}, '2014-06-18_pc_36': {'2014-06-18_gtc_12': 5}, '2014-06-18_pc_37': {'2014-06-18_gtc_12': 1, '2014-06-18_gtc_38': 1, '2014-06-18_gtc_39': 4, '2014-06-18_gtc_45': 2, '2014-06-18_gtc_46': 1, '2014-06-18_gtc_53': 1}, '2014-06-18_pc_38': {'2014-06-18_gtc_41': 2, '2014-06-18_gtc_51': 2, '2014-06-18_gtc_53': 1, '2014-06-18_gtc_55': 2, '2014-06-18_gtc_59': 23, '2014-06-18_gtc_67': 3, '2014-06-18_gtc_71': 3}, '2014-06-18_pc_39': {'2014-06-18_gtc_42': 4, '2014-06-18_gtc_43': 1, '2014-06-18_gtc_50': 1, '2014-06-18_gtc_66': 1, '2014-06-18_gtc_69': 1}, '2014-06-18_pc_40': {'2014-06-18_gtc_41': 9, '2014-06-18_gtc_42': 2, '2014-06-18_gtc_47': 1, '2014-06-18_gtc_49': 1, '2014-06-18_gtc_51': 3, '2014-06-18_gtc_52': 2}, '2014-06-18_pc_41': {'2014-06-18_gtc_42': 5}, '2014-06-18_pc_42': {'2014-06-18_gtc_45': 1, '2014-06-18_gtc_53': 12}, '2014-06-18_pc_43': {'2014-06-18_gtc_44': 1, '2014-06-18_gtc_47': 2}, '2014-06-18_pc_44': {'2014-06-18_gtc_46': 1, '2014-06-18_gtc_47': 1}, '2014-06-18_pc_45': {'2014-06-18_gtc_48': 1}, '2014-06-18_pc_46': {'2014-06-18_gtc_42': 5}, '2014-06-18_pc_47': {'2014-06-18_gtc_49': 1}, '2014-06-18_pc_48': {'2014-06-18_gtc_51': 2, '2014-06-18_gtc_55': 8}, '2014-06-18_pc_49': {'2014-06-18_gtc_54': 8}, '2014-06-18_pc_50': {'2014-06-18_gtc_55': 22}, '2014-06-18_pc_51': {'2014-06-18_gtc_55': 17, '2014-06-18_gtc_67': 1}, '2014-06-18_pc_52': {'2014-06-18_gtc_51': 1}, '2014-06-18_pc_53': {'2014-06-18_gtc_57': 4}, '2014-06-18_pc_54': {'2014-06-18_gtc_58': 2}, '2014-06-18_pc_55': {'2014-06-18_gtc_60': 1}, '2014-06-18_pc_56': {'2014-06-18_gtc_61': 1, '2014-06-18_gtc_62': 27}, '2014-06-18_pc_57': {'2014-06-18_gtc_54': 5, '2014-06-18_gtc_55': 1, '2014-06-18_gtc_62': 1}, '2014-06-18_pc_58': {'2014-06-18_gtc_63': 1, '2014-06-18_gtc_65': 1, '2014-06-18_gtc_67': 2}, '2014-06-18_pc_59': {'2014-06-18_gtc_68': 1}, '2014-06-18_pc_60': {'2014-06-18_gtc_70': 1}, '2014-06-18_pc_61': {'2014-06-18_gtc_72': 1, '2014-06-18_gtc_73': 1, '2014-06-18_gtc_75': 1}, '2014-06-18_pc_62': {'2014-06-18_gtc_74': 1}, '2014-06-18_pc_63': {'2014-06-18_gtc_76': 4}, '2014-06-18_pc_64': {'2014-06-18_gtc_77': 1}, '2014-06-18_pc_65': {'2014-06-18_gtc_77': 2}, '2016-06-08_pc_1': {'2016-06-08_gtc_1': 1, '2016-06-08_gtc_4': 1, '2016-06-08_gtc_11': 1, '2016-06-08_gtc_91': 3}, '2016-06-08_pc_2': {'2016-06-08_gtc_1': 1, '2016-06-08_gtc_2': 9}, '2016-06-08_pc_3': {'2016-06-08_gtc_3': 3, '2016-06-08_gtc_43': 5}, '2016-06-08_pc_4': {'2016-06-08_gtc_5': 1, '2016-06-08_gtc_6': 1, '2016-06-08_gtc_11': 7}, '2016-06-08_pc_5': {'2016-06-08_gtc_7': 1, '2016-06-08_gtc_8': 1, '2016-06-08_gtc_9': 2}, '2016-06-08_pc_6': {'2016-06-08_gtc_10': 8}, '2016-06-08_pc_7': {'2016-06-08_gtc_12': 4, '2016-06-08_gtc_15': 6}, '2016-06-08_pc_8': {'2016-06-08_gtc_13': 1}, '2016-06-08_pc_9': {'2016-06-08_gtc_14': 1}, '2016-06-08_pc_10': {'2016-06-08_gtc_15': 7, '2016-06-08_gtc_17': 1, '2016-06-08_gtc_21': 6}, '2016-06-08_pc_11': {'2016-06-08_gtc_16': 1}, '2016-06-08_pc_12': {'2016-06-08_gtc_15': 4}, '2016-06-08_pc_13': {'2016-06-08_gtc_12': 2}, '2016-06-08_pc_14': {'2016-06-08_gtc_18': 1}, '2016-06-08_pc_15': {'2016-06-08_gtc_19': 1, '2016-06-08_gtc_25': 1, '2016-06-08_gtc_28': 1}, '2016-06-08_pc_16': {'2016-06-08_gtc_20': 1}, '2016-06-08_pc_17': {'2016-06-08_gtc_22': 7}, '2016-06-08_pc_18': {'2016-06-08_gtc_23': 1}, '2016-06-08_pc_19': {'2016-06-08_gtc_24': 1, '2016-06-08_gtc_26': 1, '2016-06-08_gtc_27': 8, '2016-06-08_gtc_29': 2, '2016-06-08_gtc_38': 4, '2016-06-08_gtc_48': 3, '2016-06-08_gtc_74': 2}, '2016-06-08_pc_20': {'2016-06-08_gtc_11': 1, '2016-06-08_gtc_22': 6, '2016-06-08_gtc_59': 1, '2016-06-08_gtc_60': 3}, '2016-06-08_pc_21': {'2016-06-08_gtc_22': 1, '2016-06-08_gtc_27': 1, '2016-06-08_gtc_38': 2}, '2016-06-08_pc_22': {'2016-06-08_gtc_27': 13, '2016-06-08_gtc_29': 4}, '2016-06-08_pc_23': {'2016-06-08_gtc_29': 7}, '2016-06-08_pc_24': {'2016-06-08_gtc_30': 1, '2016-06-08_gtc_31': 1, '2016-06-08_gtc_34': 1}, '2016-06-08_pc_25': {'2016-06-08_gtc_29': 2, '2016-06-08_gtc_32': 2}, '2016-06-08_pc_26': {'2016-06-08_gtc_32': 2}, '2016-06-08_pc_27': {'2016-06-08_gtc_33': 1, '2016-06-08_gtc_35': 1}, '2016-06-08_pc_28': {'2016-06-08_gtc_29': 3, '2016-06-08_gtc_33': 2}, '2016-06-08_pc_29': {'2016-06-08_gtc_36': 1, '2016-06-08_gtc_40': 1, '2016-06-08_gtc_41': 1, '2016-06-08_gtc_46': 1, '2016-06-08_gtc_64': 1}, '2016-06-08_pc_30': {'2016-06-08_gtc_27': 2, '2016-06-08_gtc_38': 1, '2016-06-08_gtc_94': 7}, '2016-06-08_pc_31': {'2016-06-08_gtc_37': 1}, '2016-06-08_pc_32': {'2016-06-08_gtc_27': 1}, '2016-06-08_pc_33': {'2016-06-08_gtc_29': 3, '2016-06-08_gtc_38': 4, '2016-06-08_gtc_39': 2, '2016-06-08_gtc_69': 1, '2016-06-08_gtc_72': 2, '2016-06-08_gtc_74': 35}, '2016-06-08_pc_34': {'2016-06-08_gtc_39': 1}, '2016-06-08_pc_35': {'2016-06-08_gtc_42': 2, '2016-06-08_gtc_45': 1}, '2016-06-08_pc_36': {'2016-06-08_gtc_42': 1}, '2016-06-08_pc_37': {'2016-06-08_gtc_44': 6}, '2016-06-08_pc_38': {'2016-06-08_gtc_44': 6}, '2016-06-08_pc_39': {'2016-06-08_gtc_44': 2}, '2016-06-08_pc_40': {'2016-06-08_gtc_43': 1}, '2016-06-08_pc_41': {'2016-06-08_gtc_47': 1, '2016-06-08_gtc_50': 1, '2016-06-08_gtc_51': 1, '2016-06-08_gtc_52': 1}, '2016-06-08_pc_42': {'2016-06-08_gtc_48': 9, '2016-06-08_gtc_49': 1, '2016-06-08_gtc_74': 17, '2016-06-08_gtc_80': 3}, '2016-06-08_pc_43': {'2016-06-08_gtc_49': 1}, '2016-06-08_pc_44': {'2016-06-08_gtc_48': 2, '2016-06-08_gtc_53': 1, '2016-06-08_gtc_54': 5, '2016-06-08_gtc_70': 1}, '2016-06-08_pc_45': {'2016-06-08_gtc_53': 3}, '2016-06-08_pc_46': {'2016-06-08_gtc_54': 9}, '2016-06-08_pc_47': {'2016-06-08_gtc_55': 4, '2016-06-08_gtc_69': 5, '2016-06-08_gtc_84': 1}, '2016-06-08_pc_48': {'2016-06-08_gtc_56': 1, '2016-06-08_gtc_57': 1, '2016-06-08_gtc_58': 1}, '2016-06-08_pc_49': {'2016-06-08_gtc_60': 2, '2016-06-08_gtc_98': 1, '2016-06-08_gtc_100': 8, '2016-06-08_gtc_104': 7}, '2016-06-08_pc_50': {'2016-06-08_gtc_60': 2}, '2016-06-08_pc_51': {'2016-06-08_gtc_61': 1, '2016-06-08_gtc_68': 1, '2016-06-08_gtc_73': 1, '2016-06-08_gtc_92': 2}, '2016-06-08_pc_52': {'2016-06-08_gtc_62': 1, '2016-06-08_gtc_63': 1}, '2016-06-08_pc_53': {'2016-06-08_gtc_65': 1}, '2016-06-08_pc_54': {'2016-06-08_gtc_66': 4}, '2016-06-08_pc_55': {'2016-06-08_gtc_65': 8}, '2016-06-08_pc_56': {'2016-06-08_gtc_65': 2, '2016-06-08_gtc_67': 3, '2016-06-08_gtc_92': 3}, '2016-06-08_pc_57': {'2016-06-08_gtc_69': 8}, '2016-06-08_pc_58': {'2016-06-08_gtc_69': 11, '2016-06-08_gtc_71': 2}, '2016-06-08_pc_59': {'2016-06-08_gtc_71': 1}, '2016-06-08_pc_60': {'2016-06-08_gtc_69': 1, '2016-06-08_gtc_84': 2}, '2016-06-08_pc_61': {'2016-06-08_gtc_74': 3}, '2016-06-08_pc_62': {'2016-06-08_gtc_75': 1}, '2016-06-08_pc_63': {'2016-06-08_gtc_76': 1}, '2016-06-08_pc_64': {'2016-06-08_gtc_74': 6, '2016-06-08_gtc_77': 2}, '2016-06-08_pc_65': {'2016-06-08_gtc_74': 22, '2016-06-08_gtc_82': 2}, '2016-06-08_pc_66': {'2016-06-08_gtc_78': 1, '2016-06-08_gtc_79': 1, '2016-06-08_gtc_83': 1, '2016-06-08_gtc_87': 1, '2016-06-08_gtc_88': 1, '2016-06-08_gtc_97': 1}, '2016-06-08_pc_67': {'2016-06-08_gtc_80': 5}, '2016-06-08_pc_68': {'2016-06-08_gtc_74': 4, '2016-06-08_gtc_81': 2, '2016-06-08_gtc_82': 4}, '2016-06-08_pc_69': {'2016-06-08_gtc_85': 1, '2016-06-08_gtc_89': 3, '2016-06-08_gtc_100': 1}, '2016-06-08_pc_70': {'2016-06-08_gtc_86': 1, '2016-06-08_gtc_89': 5, '2016-06-08_gtc_90': 1}, '2016-06-08_pc_71': {'2016-06-08_gtc_89': 2}, '2016-06-08_pc_72': {'2016-06-08_gtc_89': 1}, '2016-06-08_pc_73': {'2016-06-08_gtc_91': 10, '2016-06-08_gtc_98': 1, '2016-06-08_gtc_99': 1}, '2016-06-08_pc_74': {'2016-06-08_gtc_89': 1}, '2016-06-08_pc_75': {'2016-06-08_gtc_93': 1, '2016-06-08_gtc_98': 1}, '2016-06-08_pc_76': {'2016-06-08_gtc_95': 1}, '2016-06-08_pc_77': {'2016-06-08_gtc_96': 2}, '2016-06-08_pc_78': {'2016-06-08_gtc_96': 2}, '2016-06-08_pc_79': {'2016-06-08_gtc_98': 1}, '2016-06-08_pc_80': {'2016-06-08_gtc_100': 4, '2016-06-08_gtc_101': 1, '2016-06-08_gtc_104': 1}, '2016-06-08_pc_81': {'2016-06-08_gtc_102': 3}, '2016-06-08_pc_82': {'2016-06-08_gtc_103': 1}, '2016-06-08_pc_83': {'2016-06-08_gtc_104': 2}, '2016-06-08_pc_84': {'2016-06-08_gtc_105': 1}}
#counts_predicted_clusters = {'2015-03-18_pc_1': 12, '2015-03-18_pc_2': 5, '2015-03-18_pc_3': 36, '2015-03-18_pc_4': 46, '2015-03-18_pc_5': 58, '2015-03-18_pc_6': 1, '2015-03-18_pc_7': 8, '2015-03-18_pc_8': 5, '2015-03-18_pc_9': 19, '2015-03-18_pc_10': 3, '2015-03-18_pc_11': 4, '2015-03-18_pc_12': 18, '2015-03-18_pc_13': 1, '2015-03-18_pc_14': 2, '2015-03-18_pc_15': 3, '2015-03-18_pc_16': 14, '2015-03-18_pc_17': 1, '2015-03-18_pc_18': 6, '2015-03-18_pc_19': 10, '2015-03-18_pc_20': 14, '2015-03-18_pc_21': 6, '2015-03-18_pc_22': 2, '2015-03-18_pc_23': 1, '2015-03-18_pc_24': 3, '2015-03-18_pc_25': 17, '2015-03-18_pc_26': 1, '2015-03-18_pc_27': 10, '2015-03-18_pc_28': 1, '2015-03-18_pc_29': 3, '2015-03-18_pc_30': 4, '2015-03-18_pc_31': 1, '2015-03-18_pc_32': 1, '2015-03-18_pc_33': 3, '2015-03-18_pc_34': 20, '2015-03-18_pc_35': 1, '2015-03-18_pc_36': 14, '2015-03-18_pc_37': 9, '2015-03-18_pc_38': 3, '2015-03-18_pc_39': 4, '2015-03-18_pc_40': 18, '2015-03-18_pc_41': 2, '2015-03-18_pc_42': 3, '2015-03-18_pc_43': 10, '2015-03-18_pc_44': 7, '2015-03-18_pc_45': 1, '2015-03-18_pc_46': 6, '2015-03-18_pc_47': 3, '2015-03-18_pc_48': 1, '2015-03-18_pc_49': 1, '2015-03-18_pc_50': 9, '2015-03-18_pc_51': 2, '2015-03-18_pc_52': 10, '2015-03-18_pc_53': 1, '2015-03-18_pc_54': 1, '2015-03-18_pc_55': 19, '2015-03-18_pc_56': 2, '2015-03-18_pc_57': 8, '2015-03-18_pc_58': 2, '2015-03-18_pc_59': 1, '2015-03-18_pc_60': 3, '2015-03-18_pc_61': 8, '2015-03-18_pc_62': 2, '2015-03-18_pc_63': 5, '2015-03-18_pc_64': 2, '2015-03-18_pc_65': 3, '2013-09-01_pc_1': 9, '2013-09-01_pc_2': 17, '2013-09-01_pc_3': 8, '2013-09-01_pc_4': 2, '2013-09-01_pc_5': 6, '2013-09-01_pc_6': 34, '2013-09-01_pc_7': 2, '2013-09-01_pc_8': 1, '2013-09-01_pc_9': 33, '2013-09-01_pc_10': 9, '2013-09-01_pc_11': 2, '2013-09-01_pc_12': 10, '2013-09-01_pc_13': 19, '2013-09-01_pc_14': 1, '2013-09-01_pc_15': 4, '2013-09-01_pc_16': 5, '2013-09-01_pc_17': 3, '2013-09-01_pc_18': 15, '2013-09-01_pc_19': 5, '2013-09-01_pc_20': 1, '2013-09-01_pc_21': 7, '2013-09-01_pc_22': 3, '2013-09-01_pc_23': 5, '2013-09-01_pc_24': 2, '2013-09-01_pc_25': 2, '2013-09-01_pc_26': 2, '2013-09-01_pc_27': 1, '2013-09-01_pc_28': 10, '2013-09-01_pc_29': 2, '2013-09-01_pc_30': 11, '2013-09-01_pc_31': 17, '2013-09-01_pc_32': 3, '2013-09-01_pc_33': 1, '2013-09-01_pc_34': 11, '2013-09-01_pc_35': 24, '2013-09-01_pc_36': 3, '2013-09-01_pc_37': 1, '2013-09-01_pc_38': 1, '2013-09-01_pc_39': 3, '2013-09-01_pc_40': 3, '2013-09-01_pc_41': 20, '2013-09-01_pc_42': 10, '2013-09-01_pc_43': 2, '2013-09-01_pc_44': 1, '2013-09-01_pc_45': 8, '2013-09-01_pc_46': 7, '2013-09-01_pc_47': 11, '2013-09-01_pc_48': 1, '2013-09-01_pc_49': 1, '2013-09-01_pc_50': 1, '2013-09-01_pc_51': 2, '2013-09-01_pc_52': 5, '2013-09-01_pc_53': 5, '2013-09-01_pc_54': 1, '2013-09-01_pc_55': 6, '2013-09-01_pc_56': 4, '2013-09-01_pc_57': 3, '2013-09-01_pc_58': 2, '2013-09-01_pc_59': 10, '2013-09-01_pc_60': 12, '2013-09-01_pc_61': 7, '2013-09-01_pc_62': 1, '2013-09-01_pc_63': 2, '2013-09-01_pc_64': 34, '2013-09-01_pc_65': 1, '2013-09-01_pc_66': 21, '2013-09-01_pc_67': 14, '2013-09-01_pc_68': 8, '2013-09-01_pc_69': 2, '2010-08-17_pc_1': 20, '2010-08-17_pc_2': 16, '2010-08-17_pc_3': 1, '2010-08-17_pc_4': 17, '2010-08-17_pc_5': 1, '2010-08-17_pc_6': 1, '2010-08-17_pc_7': 2, '2010-08-17_pc_8': 3, '2010-08-17_pc_9': 13, '2010-08-17_pc_10': 1, '2010-08-17_pc_11': 6, '2010-08-17_pc_12': 28, '2010-08-17_pc_13': 6, '2010-08-17_pc_14': 30, '2010-08-17_pc_15': 6, '2010-08-17_pc_16': 6, '2010-08-17_pc_17': 9, '2010-08-17_pc_18': 1, '2010-08-17_pc_19': 1, '2010-08-17_pc_20': 18, '2010-08-17_pc_21': 56, '2010-08-17_pc_22': 16, '2010-08-17_pc_23': 1, '2010-08-17_pc_24': 2, '2010-08-17_pc_25': 1, '2010-08-17_pc_26': 1, '2010-08-17_pc_27': 7, '2010-08-17_pc_28': 1, '2010-08-17_pc_29': 1, '2010-08-17_pc_30': 8, '2010-08-17_pc_31': 6, '2010-08-17_pc_32': 1, '2010-08-17_pc_33': 1, '2010-08-17_pc_34': 8, '2010-08-17_pc_35': 7, '2010-08-17_pc_36': 1, '2010-08-17_pc_37': 4, '2010-08-17_pc_38': 1, '2010-08-17_pc_39': 1, '2010-08-17_pc_40': 6, '2010-08-17_pc_41': 1, '2010-08-17_pc_42': 6, '2010-08-17_pc_43': 1, '2010-08-17_pc_44': 2, '2010-08-17_pc_45': 7, '2010-08-17_pc_46': 9, '2010-08-17_pc_47': 3, '2010-08-17_pc_48': 1, '2010-08-17_pc_49': 7, '2010-08-17_pc_50': 5, '2010-08-17_pc_51': 1, '2010-08-17_pc_52': 1, '2010-08-17_pc_53': 7, '2010-08-17_pc_54': 4, '2010-08-17_pc_55': 5, '2010-08-17_pc_56': 7, '2010-08-17_pc_57': 7, '2010-08-17_pc_58': 1, '2010-08-17_pc_59': 4, '2010-08-17_pc_60': 1, '2010-08-17_pc_61': 3, '2010-08-17_pc_62': 1, '2010-08-17_pc_63': 3, '2010-08-17_pc_64': 10, '2010-08-17_pc_65': 3, '2010-08-17_pc_66': 1, '2010-08-17_pc_67': 1, '2010-08-17_pc_68': 2, '2010-08-17_pc_69': 3, '2010-08-17_pc_70': 5, '2010-08-17_pc_71': 2, '2010-08-17_pc_72': 7, '2010-08-17_pc_73': 3, '2010-08-17_pc_74': 9, '2010-08-17_pc_75': 3, '2010-08-17_pc_76': 9, '2010-08-17_pc_77': 4, '2010-08-17_pc_78': 5, '2010-08-17_pc_79': 6, '2010-08-17_pc_80': 6, '2010-08-17_pc_81': 3, '2010-08-17_pc_82': 1, '2010-08-17_pc_83': 1, '2010-08-17_pc_84': 1, '2010-08-17_pc_85': 8, '2010-08-17_pc_86': 2, '2010-08-17_pc_87': 2, '2005-07-06_pc_1': 1, '2005-07-06_pc_2': 1, '2005-07-06_pc_3': 10, '2005-07-06_pc_4': 4, '2005-07-06_pc_5': 1, '2005-07-06_pc_6': 7, '2005-07-06_pc_7': 37, '2005-07-06_pc_8': 1, '2005-07-06_pc_9': 2, '2005-07-06_pc_10': 1, '2005-07-06_pc_11': 1, '2005-07-06_pc_12': 8, '2005-07-06_pc_13': 1, '2005-07-06_pc_14': 1, '2005-07-06_pc_15': 2, '2005-07-06_pc_16': 1, '2005-07-06_pc_17': 1, '2005-07-06_pc_18': 2, '2005-07-06_pc_19': 1, '2005-07-06_pc_20': 1, '2005-07-06_pc_21': 1, '2005-07-06_pc_22': 4, '2005-07-06_pc_23': 1, '2005-07-06_pc_24': 1, '2005-07-06_pc_25': 2, '2005-07-06_pc_26': 4, '2005-07-06_pc_27': 9, '2005-07-06_pc_28': 1, '2005-07-06_pc_29': 37, '2005-07-06_pc_30': 1, '2005-07-06_pc_31': 1, '2005-07-06_pc_32': 3, '2005-07-06_pc_33': 1, '2005-07-06_pc_34': 2, '2005-07-06_pc_35': 1, '2005-07-06_pc_36': 1, '2005-07-06_pc_37': 4, '2005-07-06_pc_38': 18, '2005-07-06_pc_39': 1, '2005-07-06_pc_40': 31, '2005-07-06_pc_41': 2, '2005-07-06_pc_42': 1, '2005-07-06_pc_43': 1, '2005-07-06_pc_44': 1, '2005-07-06_pc_45': 1, '2005-07-06_pc_46': 1, '2005-07-06_pc_47': 1, '2005-07-06_pc_48': 1, '2005-07-06_pc_49': 33, '2005-07-06_pc_50': 1, '2005-07-06_pc_51': 1, '2005-07-06_pc_52': 1, '2005-07-06_pc_53': 1, '2005-07-06_pc_54': 1, '2005-07-06_pc_55': 1, '2005-07-06_pc_56': 20, '2005-07-06_pc_57': 1, '2005-07-06_pc_58': 2, '2005-07-06_pc_59': 1, '2005-07-06_pc_60': 1, '2005-07-06_pc_61': 1, '2005-07-06_pc_62': 6, '2005-07-06_pc_63': 1, '2005-07-06_pc_64': 2, '2005-07-06_pc_65': 1, '2005-07-06_pc_66': 2, '2005-07-06_pc_67': 14, '2005-07-06_pc_68': 1, '2005-07-06_pc_69': 8, '2005-07-06_pc_70': 2, '2005-07-06_pc_71': 1, '2005-07-06_pc_72': 10, '2005-07-06_pc_73': 8, '2005-07-06_pc_74': 1, '2005-07-06_pc_75': 1, '2005-07-06_pc_76': 1, '2005-07-06_pc_77': 3, '2005-07-06_pc_78': 1, '2005-07-06_pc_79': 2, '2005-07-06_pc_80': 3, '2005-07-06_pc_81': 1, '2005-07-06_pc_82': 1, '2005-07-06_pc_83': 1, '2005-07-06_pc_84': 1, '2005-07-06_pc_85': 1, '2005-07-06_pc_86': 7, '2005-07-06_pc_87': 1, '2005-07-06_pc_88': 1, '2005-07-06_pc_89': 1, '2005-07-06_pc_90': 2, '2005-07-06_pc_91': 1, '2005-07-06_pc_92': 1, '2005-07-06_pc_93': 1, '2005-07-06_pc_94': 1, '2005-07-06_pc_95': 1, '2005-07-06_pc_96': 2, '2005-07-06_pc_97': 2, '2005-07-06_pc_98': 1, '2005-07-06_pc_99': 9, '2005-07-06_pc_100': 2, '2005-07-06_pc_101': 1, '2005-07-06_pc_102': 1, '2005-07-06_pc_103': 6, '2005-07-06_pc_104': 1, '2005-07-06_pc_105': 1, '2005-07-06_pc_106': 1, '2005-07-06_pc_107': 1, '2005-07-06_pc_108': 2, '2005-07-06_pc_109': 2, '2005-07-06_pc_110': 11, '2005-07-06_pc_111': 2, '2005-07-06_pc_112': 1, '2005-07-06_pc_113': 1, '2005-07-06_pc_114': 1, '2005-07-06_pc_115': 1, '2005-07-06_pc_116': 2, '2005-07-06_pc_117': 1, '2005-07-06_pc_118': 1, '2005-07-06_pc_119': 9, '2005-07-06_pc_120': 1, '2005-07-06_pc_121': 2, '2005-07-06_pc_122': 1, '2005-07-06_pc_123': 1, '2005-07-06_pc_124': 2, '2005-07-06_pc_125': 9, '2005-07-06_pc_126': 17, '2005-07-06_pc_127': 1, '2005-07-06_pc_128': 1, '2005-07-06_pc_129': 1, '2005-07-06_pc_130': 3, '2005-07-06_pc_131': 1, '2005-07-06_pc_132': 11, '2005-07-06_pc_133': 1, '2005-07-06_pc_134': 1, '2005-07-06_pc_135': 5, '2005-07-06_pc_136': 1, '2005-07-06_pc_137': 1, '2005-07-06_pc_138': 1, '2005-07-06_pc_139': 2, '2005-07-06_pc_140': 2, '2005-07-06_pc_141': 1, '2014-06-18_pc_1': 17, '2014-06-18_pc_2': 6, '2014-06-18_pc_3': 2, '2014-06-18_pc_4': 5, '2014-06-18_pc_5': 3, '2014-06-18_pc_6': 6, '2014-06-18_pc_7': 7, '2014-06-18_pc_8': 15, '2014-06-18_pc_9': 1, '2014-06-18_pc_10': 31, '2014-06-18_pc_11': 1, '2014-06-18_pc_12': 1, '2014-06-18_pc_13': 4, '2014-06-18_pc_14': 20, '2014-06-18_pc_15': 4, '2014-06-18_pc_16': 1, '2014-06-18_pc_17': 4, '2014-06-18_pc_18': 33, '2014-06-18_pc_19': 1, '2014-06-18_pc_20': 2, '2014-06-18_pc_21': 4, '2014-06-18_pc_22': 7, '2014-06-18_pc_23': 1, '2014-06-18_pc_24': 2, '2014-06-18_pc_25': 10, '2014-06-18_pc_26': 25, '2014-06-18_pc_27': 2, '2014-06-18_pc_28': 3, '2014-06-18_pc_29': 7, '2014-06-18_pc_30': 21, '2014-06-18_pc_31': 1, '2014-06-18_pc_32': 9, '2014-06-18_pc_33': 1, '2014-06-18_pc_34': 2, '2014-06-18_pc_35': 16, '2014-06-18_pc_36': 5, '2014-06-18_pc_37': 10, '2014-06-18_pc_38': 36, '2014-06-18_pc_39': 8, '2014-06-18_pc_40': 18, '2014-06-18_pc_41': 5, '2014-06-18_pc_42': 13, '2014-06-18_pc_43': 3, '2014-06-18_pc_44': 2, '2014-06-18_pc_45': 1, '2014-06-18_pc_46': 5, '2014-06-18_pc_47': 1, '2014-06-18_pc_48': 10, '2014-06-18_pc_49': 8, '2014-06-18_pc_50': 22, '2014-06-18_pc_51': 18, '2014-06-18_pc_52': 1, '2014-06-18_pc_53': 4, '2014-06-18_pc_54': 2, '2014-06-18_pc_55': 1, '2014-06-18_pc_56': 28, '2014-06-18_pc_57': 7, '2014-06-18_pc_58': 4, '2014-06-18_pc_59': 1, '2014-06-18_pc_60': 1, '2014-06-18_pc_61': 3, '2014-06-18_pc_62': 1, '2014-06-18_pc_63': 4, '2014-06-18_pc_64': 1, '2014-06-18_pc_65': 2, '2016-06-08_pc_1': 6, '2016-06-08_pc_2': 10, '2016-06-08_pc_3': 8, '2016-06-08_pc_4': 9, '2016-06-08_pc_5': 4, '2016-06-08_pc_6': 8, '2016-06-08_pc_7': 10, '2016-06-08_pc_8': 1, '2016-06-08_pc_9': 1, '2016-06-08_pc_10': 14, '2016-06-08_pc_11': 1, '2016-06-08_pc_12': 4, '2016-06-08_pc_13': 2, '2016-06-08_pc_14': 1, '2016-06-08_pc_15': 3, '2016-06-08_pc_16': 1, '2016-06-08_pc_17': 7, '2016-06-08_pc_18': 1, '2016-06-08_pc_19': 21, '2016-06-08_pc_20': 11, '2016-06-08_pc_21': 4, '2016-06-08_pc_22': 17, '2016-06-08_pc_23': 7, '2016-06-08_pc_24': 3, '2016-06-08_pc_25': 4, '2016-06-08_pc_26': 2, '2016-06-08_pc_27': 2, '2016-06-08_pc_28': 5, '2016-06-08_pc_29': 5, '2016-06-08_pc_30': 10, '2016-06-08_pc_31': 1, '2016-06-08_pc_32': 1, '2016-06-08_pc_33': 47, '2016-06-08_pc_34': 1, '2016-06-08_pc_35': 3, '2016-06-08_pc_36': 1, '2016-06-08_pc_37': 6, '2016-06-08_pc_38': 6, '2016-06-08_pc_39': 2, '2016-06-08_pc_40': 1, '2016-06-08_pc_41': 4, '2016-06-08_pc_42': 30, '2016-06-08_pc_43': 1, '2016-06-08_pc_44': 9, '2016-06-08_pc_45': 3, '2016-06-08_pc_46': 9, '2016-06-08_pc_47': 10, '2016-06-08_pc_48': 3, '2016-06-08_pc_49': 18, '2016-06-08_pc_50': 2, '2016-06-08_pc_51': 5, '2016-06-08_pc_52': 2, '2016-06-08_pc_53': 1, '2016-06-08_pc_54': 4, '2016-06-08_pc_55': 8, '2016-06-08_pc_56': 8, '2016-06-08_pc_57': 8, '2016-06-08_pc_58': 13, '2016-06-08_pc_59': 1, '2016-06-08_pc_60': 3, '2016-06-08_pc_61': 3, '2016-06-08_pc_62': 1, '2016-06-08_pc_63': 1, '2016-06-08_pc_64': 8, '2016-06-08_pc_65': 24, '2016-06-08_pc_66': 6, '2016-06-08_pc_67': 5, '2016-06-08_pc_68': 10, '2016-06-08_pc_69': 5, '2016-06-08_pc_70': 7, '2016-06-08_pc_71': 2, '2016-06-08_pc_72': 1, '2016-06-08_pc_73': 12, '2016-06-08_pc_74': 1, '2016-06-08_pc_75': 2, '2016-06-08_pc_76': 1, '2016-06-08_pc_77': 2, '2016-06-08_pc_78': 2, '2016-06-08_pc_79': 1, '2016-06-08_pc_80': 6, '2016-06-08_pc_81': 3, '2016-06-08_pc_82': 1, '2016-06-08_pc_83': 2, '2016-06-08_pc_84': 1}
#counts_gt_clusters = {'2015-03-18_gtc_1': 141, '2015-03-18_gtc_2': 3, '2015-03-18_gtc_3': 1, '2015-03-18_gtc_4': 1, '2015-03-18_gtc_5': 1, '2015-03-18_gtc_6': 1, '2015-03-18_gtc_7': 9, '2015-03-18_gtc_8': 1, '2015-03-18_gtc_9': 2, '2015-03-18_gtc_10': 9, '2015-03-18_gtc_11': 6, '2015-03-18_gtc_12': 1, '2015-03-18_gtc_13': 34, '2015-03-18_gtc_14': 1, '2015-03-18_gtc_15': 1, '2015-03-18_gtc_16': 1, '2015-03-18_gtc_17': 1, '2015-03-18_gtc_18': 5, '2015-03-18_gtc_19': 1, '2015-03-18_gtc_20': 1, '2015-03-18_gtc_21': 1, '2015-03-18_gtc_22': 1, '2015-03-18_gtc_23': 1, '2015-03-18_gtc_24': 1, '2015-03-18_gtc_25': 1, '2015-03-18_gtc_26': 1, '2015-03-18_gtc_27': 1, '2015-03-18_gtc_28': 1, '2015-03-18_gtc_29': 1, '2015-03-18_gtc_30': 1, '2015-03-18_gtc_31': 1, '2015-03-18_gtc_32': 1, '2015-03-18_gtc_33': 10, '2015-03-18_gtc_34': 4, '2015-03-18_gtc_35': 3, '2015-03-18_gtc_36': 1, '2015-03-18_gtc_37': 1, '2015-03-18_gtc_38': 28, '2015-03-18_gtc_39': 4, '2015-03-18_gtc_40': 1, '2015-03-18_gtc_41': 14, '2015-03-18_gtc_42': 22, '2015-03-18_gtc_43': 3, '2015-03-18_gtc_44': 1, '2015-03-18_gtc_45': 21, '2015-03-18_gtc_46': 2, '2015-03-18_gtc_47': 3, '2015-03-18_gtc_48': 10, '2015-03-18_gtc_49': 1, '2015-03-18_gtc_50': 6, '2015-03-18_gtc_51': 1, '2015-03-18_gtc_52': 4, '2015-03-18_gtc_53': 1, '2015-03-18_gtc_54': 1, '2015-03-18_gtc_55': 1, '2015-03-18_gtc_56': 4, '2015-03-18_gtc_57': 1, '2015-03-18_gtc_58': 15, '2015-03-18_gtc_59': 5, '2015-03-18_gtc_60': 1, '2015-03-18_gtc_61': 1, '2015-03-18_gtc_62': 3, '2015-03-18_gtc_63': 1, '2015-03-18_gtc_64': 8, '2015-03-18_gtc_65': 1, '2015-03-18_gtc_66': 1, '2015-03-18_gtc_67': 1, '2015-03-18_gtc_68': 1, '2015-03-18_gtc_69': 22, '2015-03-18_gtc_70': 1, '2015-03-18_gtc_71': 2, '2015-03-18_gtc_72': 1, '2015-03-18_gtc_73': 17, '2015-03-18_gtc_74': 1, '2015-03-18_gtc_75': 3, '2015-03-18_gtc_76': 1, '2015-03-18_gtc_77': 1, '2015-03-18_gtc_78': 4, '2015-03-18_gtc_79': 2, '2015-03-18_gtc_80': 6, '2015-03-18_gtc_81': 6, '2015-03-18_gtc_82': 1, '2015-03-18_gtc_83': 10, '2015-03-18_gtc_84': 2, '2015-03-18_gtc_85': 1, '2013-09-01_gtc_1': 1, '2013-09-01_gtc_2': 36, '2013-09-01_gtc_3': 1, '2013-09-01_gtc_4': 1, '2013-09-01_gtc_5': 1, '2013-09-01_gtc_6': 1, '2013-09-01_gtc_7': 1, '2013-09-01_gtc_8': 3, '2013-09-01_gtc_9': 3, '2013-09-01_gtc_10': 42, '2013-09-01_gtc_11': 4, '2013-09-01_gtc_12': 1, '2013-09-01_gtc_13': 2, '2013-09-01_gtc_14': 19, '2013-09-01_gtc_15': 1, '2013-09-01_gtc_16': 3, '2013-09-01_gtc_17': 3, '2013-09-01_gtc_18': 9, '2013-09-01_gtc_19': 1, '2013-09-01_gtc_20': 7, '2013-09-01_gtc_21': 2, '2013-09-01_gtc_22': 1, '2013-09-01_gtc_23': 1, '2013-09-01_gtc_24': 2, '2013-09-01_gtc_25': 31, '2013-09-01_gtc_26': 1, '2013-09-01_gtc_27': 8, '2013-09-01_gtc_28': 1, '2013-09-01_gtc_29': 13, '2013-09-01_gtc_30': 1, '2013-09-01_gtc_31': 6, '2013-09-01_gtc_32': 1, '2013-09-01_gtc_33': 1, '2013-09-01_gtc_34': 1, '2013-09-01_gtc_35': 7, '2013-09-01_gtc_36': 53, '2013-09-01_gtc_37': 3, '2013-09-01_gtc_38': 8, '2013-09-01_gtc_39': 12, '2013-09-01_gtc_40': 2, '2013-09-01_gtc_41': 1, '2013-09-01_gtc_42': 8, '2013-09-01_gtc_43': 3, '2013-09-01_gtc_44': 21, '2013-09-01_gtc_45': 2, '2013-09-01_gtc_46': 1, '2013-09-01_gtc_47': 5, '2013-09-01_gtc_48': 1, '2013-09-01_gtc_49': 9, '2013-09-01_gtc_50': 6, '2013-09-01_gtc_51': 6, '2013-09-01_gtc_52': 3, '2013-09-01_gtc_53': 1, '2013-09-01_gtc_54': 1, '2013-09-01_gtc_55': 7, '2013-09-01_gtc_56': 6, '2013-09-01_gtc_57': 65, '2013-09-01_gtc_58': 8, '2013-09-01_gtc_59': 1, '2013-09-01_gtc_60': 9, '2013-09-01_gtc_61': 32, '2013-09-01_gtc_62': 1, '2013-09-01_gtc_63': 5, '2013-09-01_gtc_64': 2, '2013-09-01_gtc_65': 1, '2010-08-17_gtc_1': 29, '2010-08-17_gtc_2': 1, '2010-08-17_gtc_3': 18, '2010-08-17_gtc_4': 1, '2010-08-17_gtc_5': 1, '2010-08-17_gtc_6': 1, '2010-08-17_gtc_7': 10, '2010-08-17_gtc_8': 1, '2010-08-17_gtc_9': 1, '2010-08-17_gtc_10': 2, '2010-08-17_gtc_11': 2, '2010-08-17_gtc_12': 1, '2010-08-17_gtc_13': 4, '2010-08-17_gtc_14': 5, '2010-08-17_gtc_15': 102, '2010-08-17_gtc_16': 3, '2010-08-17_gtc_17': 6, '2010-08-17_gtc_18': 2, '2010-08-17_gtc_19': 1, '2010-08-17_gtc_20': 6, '2010-08-17_gtc_21': 19, '2010-08-17_gtc_22': 1, '2010-08-17_gtc_23': 5, '2010-08-17_gtc_24': 24, '2010-08-17_gtc_25': 1, '2010-08-17_gtc_26': 1, '2010-08-17_gtc_27': 1, '2010-08-17_gtc_28': 1, '2010-08-17_gtc_29': 1, '2010-08-17_gtc_30': 1, '2010-08-17_gtc_31': 1, '2010-08-17_gtc_32': 1, '2010-08-17_gtc_33': 1, '2010-08-17_gtc_34': 3, '2010-08-17_gtc_35': 1, '2010-08-17_gtc_36': 3, '2010-08-17_gtc_37': 15, '2010-08-17_gtc_38': 5, '2010-08-17_gtc_39': 1, '2010-08-17_gtc_40': 15, '2010-08-17_gtc_41': 1, '2010-08-17_gtc_42': 1, '2010-08-17_gtc_43': 1, '2010-08-17_gtc_44': 1, '2010-08-17_gtc_45': 1, '2010-08-17_gtc_46': 2, '2010-08-17_gtc_47': 9, '2010-08-17_gtc_48': 3, '2010-08-17_gtc_49': 1, '2010-08-17_gtc_50': 7, '2010-08-17_gtc_51': 13, '2010-08-17_gtc_52': 1, '2010-08-17_gtc_53': 3, '2010-08-17_gtc_54': 11, '2010-08-17_gtc_55': 1, '2010-08-17_gtc_56': 1, '2010-08-17_gtc_57': 4, '2010-08-17_gtc_58': 20, '2010-08-17_gtc_59': 1, '2010-08-17_gtc_60': 1, '2010-08-17_gtc_61': 3, '2010-08-17_gtc_62': 7, '2010-08-17_gtc_63': 3, '2010-08-17_gtc_64': 2, '2010-08-17_gtc_65': 5, '2010-08-17_gtc_66': 1, '2010-08-17_gtc_67': 1, '2010-08-17_gtc_68': 2, '2010-08-17_gtc_69': 1, '2010-08-17_gtc_70': 7, '2010-08-17_gtc_71': 2, '2010-08-17_gtc_72': 2, '2010-08-17_gtc_73': 9, '2010-08-17_gtc_74': 1, '2010-08-17_gtc_75': 20, '2010-08-17_gtc_76': 11, '2010-08-17_gtc_77': 1, '2010-08-17_gtc_78': 1, '2010-08-17_gtc_79': 5, '2010-08-17_gtc_80': 1, '2010-08-17_gtc_81': 14, '2010-08-17_gtc_82': 1, '2010-08-17_gtc_83': 1, '2010-08-17_gtc_84': 1, '2010-08-17_gtc_85': 1, '2010-08-17_gtc_86': 1, '2010-08-17_gtc_87': 8, '2010-08-17_gtc_88': 1, '2010-08-17_gtc_89': 1, '2010-08-17_gtc_90': 1, '2010-08-17_gtc_91': 1, '2010-08-17_gtc_92': 2, '2010-08-17_gtc_93': 1, '2005-07-06_gtc_1': 5, '2005-07-06_gtc_2': 1, '2005-07-06_gtc_3': 1, '2005-07-06_gtc_4': 1, '2005-07-06_gtc_5': 37, '2005-07-06_gtc_6': 1, '2005-07-06_gtc_7': 1, '2005-07-06_gtc_8': 1, '2005-07-06_gtc_9': 1, '2005-07-06_gtc_10': 5, '2005-07-06_gtc_11': 1, '2005-07-06_gtc_12': 1, '2005-07-06_gtc_13': 1, '2005-07-06_gtc_14': 1, '2005-07-06_gtc_15': 1, '2005-07-06_gtc_16': 18, '2005-07-06_gtc_17': 1, '2005-07-06_gtc_18': 1, '2005-07-06_gtc_19': 1, '2005-07-06_gtc_20': 1, '2005-07-06_gtc_21': 1, '2005-07-06_gtc_22': 1, '2005-07-06_gtc_23': 1, '2005-07-06_gtc_24': 1, '2005-07-06_gtc_25': 1, '2005-07-06_gtc_26': 10, '2005-07-06_gtc_27': 1, '2005-07-06_gtc_28': 1, '2005-07-06_gtc_29': 1, '2005-07-06_gtc_30': 1, '2005-07-06_gtc_31': 1, '2005-07-06_gtc_32': 1, '2005-07-06_gtc_33': 1, '2005-07-06_gtc_34': 1, '2005-07-06_gtc_35': 1, '2005-07-06_gtc_36': 1, '2005-07-06_gtc_37': 34, '2005-07-06_gtc_38': 23, '2005-07-06_gtc_39': 1, '2005-07-06_gtc_40': 1, '2005-07-06_gtc_41': 2, '2005-07-06_gtc_42': 1, '2005-07-06_gtc_43': 1, '2005-07-06_gtc_44': 1, '2005-07-06_gtc_45': 1, '2005-07-06_gtc_46': 1, '2005-07-06_gtc_47': 1, '2005-07-06_gtc_48': 1, '2005-07-06_gtc_49': 1, '2005-07-06_gtc_50': 1, '2005-07-06_gtc_51': 1, '2005-07-06_gtc_52': 3, '2005-07-06_gtc_53': 1, '2005-07-06_gtc_54': 1, '2005-07-06_gtc_55': 1, '2005-07-06_gtc_56': 1, '2005-07-06_gtc_57': 1, '2005-07-06_gtc_58': 34, '2005-07-06_gtc_59': 1, '2005-07-06_gtc_60': 1, '2005-07-06_gtc_61': 1, '2005-07-06_gtc_62': 43, '2005-07-06_gtc_63': 1, '2005-07-06_gtc_64': 6, '2005-07-06_gtc_65': 1, '2005-07-06_gtc_66': 1, '2005-07-06_gtc_67': 1, '2005-07-06_gtc_68': 1, '2005-07-06_gtc_69': 10, '2005-07-06_gtc_70': 1, '2005-07-06_gtc_71': 3, '2005-07-06_gtc_72': 1, '2005-07-06_gtc_73': 8, '2005-07-06_gtc_74': 4, '2005-07-06_gtc_75': 1, '2005-07-06_gtc_76': 14, '2005-07-06_gtc_77': 1, '2005-07-06_gtc_78': 1, '2005-07-06_gtc_79': 3, '2005-07-06_gtc_80': 7, '2005-07-06_gtc_81': 1, '2005-07-06_gtc_82': 1, '2005-07-06_gtc_83': 1, '2005-07-06_gtc_84': 2, '2005-07-06_gtc_85': 2, '2005-07-06_gtc_86': 1, '2005-07-06_gtc_87': 1, '2005-07-06_gtc_88': 1, '2005-07-06_gtc_89': 1, '2005-07-06_gtc_90': 1, '2005-07-06_gtc_91': 1, '2005-07-06_gtc_92': 12, '2005-07-06_gtc_93': 1, '2005-07-06_gtc_94': 6, '2005-07-06_gtc_95': 1, '2005-07-06_gtc_96': 1, '2005-07-06_gtc_97': 1, '2005-07-06_gtc_98': 1, '2005-07-06_gtc_99': 1, '2005-07-06_gtc_100': 1, '2005-07-06_gtc_101': 1, '2005-07-06_gtc_102': 1, '2005-07-06_gtc_103': 1, '2005-07-06_gtc_104': 1, '2005-07-06_gtc_105': 1, '2005-07-06_gtc_106': 1, '2005-07-06_gtc_107': 1, '2005-07-06_gtc_108': 1, '2005-07-06_gtc_109': 1, '2005-07-06_gtc_110': 1, '2005-07-06_gtc_111': 1, '2005-07-06_gtc_112': 1, '2005-07-06_gtc_113': 1, '2005-07-06_gtc_114': 1, '2005-07-06_gtc_115': 1, '2005-07-06_gtc_116': 3, '2005-07-06_gtc_117': 1, '2005-07-06_gtc_118': 17, '2005-07-06_gtc_119': 1, '2005-07-06_gtc_120': 1, '2005-07-06_gtc_121': 2, '2005-07-06_gtc_122': 1, '2005-07-06_gtc_123': 1, '2005-07-06_gtc_124': 1, '2005-07-06_gtc_125': 1, '2005-07-06_gtc_126': 1, '2005-07-06_gtc_127': 1, '2005-07-06_gtc_128': 1, '2005-07-06_gtc_129': 1, '2005-07-06_gtc_130': 1, '2005-07-06_gtc_131': 13, '2005-07-06_gtc_132': 1, '2005-07-06_gtc_133': 1, '2005-07-06_gtc_134': 1, '2005-07-06_gtc_135': 1, '2005-07-06_gtc_136': 1, '2005-07-06_gtc_137': 1, '2005-07-06_gtc_138': 1, '2005-07-06_gtc_139': 21, '2005-07-06_gtc_140': 1, '2005-07-06_gtc_141': 1, '2005-07-06_gtc_142': 14, '2005-07-06_gtc_143': 1, '2005-07-06_gtc_144': 1, '2005-07-06_gtc_145': 1, '2005-07-06_gtc_146': 1, '2005-07-06_gtc_147': 1, '2005-07-06_gtc_148': 1, '2005-07-06_gtc_149': 1, '2005-07-06_gtc_150': 8, '2005-07-06_gtc_151': 1, '2005-07-06_gtc_152': 7, '2005-07-06_gtc_153': 1, '2005-07-06_gtc_154': 1, '2005-07-06_gtc_155': 1, '2014-06-18_gtc_1': 1, '2014-06-18_gtc_2': 17, '2014-06-18_gtc_3': 1, '2014-06-18_gtc_4': 2, '2014-06-18_gtc_5': 8, '2014-06-18_gtc_6': 5, '2014-06-18_gtc_7': 1, '2014-06-18_gtc_8': 1, '2014-06-18_gtc_9': 6, '2014-06-18_gtc_10': 11, '2014-06-18_gtc_11': 1, '2014-06-18_gtc_12': 66, '2014-06-18_gtc_13': 1, '2014-06-18_gtc_14': 1, '2014-06-18_gtc_15': 40, '2014-06-18_gtc_16': 1, '2014-06-18_gtc_17': 1, '2014-06-18_gtc_18': 2, '2014-06-18_gtc_19': 1, '2014-06-18_gtc_20': 2, '2014-06-18_gtc_21': 1, '2014-06-18_gtc_22': 1, '2014-06-18_gtc_23': 1, '2014-06-18_gtc_24': 1, '2014-06-18_gtc_25': 34, '2014-06-18_gtc_26': 1, '2014-06-18_gtc_27': 1, '2014-06-18_gtc_28': 3, '2014-06-18_gtc_29': 7, '2014-06-18_gtc_30': 2, '2014-06-18_gtc_31': 3, '2014-06-18_gtc_32': 1, '2014-06-18_gtc_33': 1, '2014-06-18_gtc_34': 1, '2014-06-18_gtc_35': 7, '2014-06-18_gtc_36': 1, '2014-06-18_gtc_37': 1, '2014-06-18_gtc_38': 1, '2014-06-18_gtc_39': 4, '2014-06-18_gtc_40': 1, '2014-06-18_gtc_41': 12, '2014-06-18_gtc_42': 16, '2014-06-18_gtc_43': 1, '2014-06-18_gtc_44': 3, '2014-06-18_gtc_45': 4, '2014-06-18_gtc_46': 2, '2014-06-18_gtc_47': 4, '2014-06-18_gtc_48': 1, '2014-06-18_gtc_49': 3, '2014-06-18_gtc_50': 1, '2014-06-18_gtc_51': 19, '2014-06-18_gtc_52': 2, '2014-06-18_gtc_53': 30, '2014-06-18_gtc_54': 23, '2014-06-18_gtc_55': 50, '2014-06-18_gtc_56': 1, '2014-06-18_gtc_57': 4, '2014-06-18_gtc_58': 2, '2014-06-18_gtc_59': 23, '2014-06-18_gtc_60': 1, '2014-06-18_gtc_61': 1, '2014-06-18_gtc_62': 28, '2014-06-18_gtc_63': 1, '2014-06-18_gtc_64': 1, '2014-06-18_gtc_65': 1, '2014-06-18_gtc_66': 1, '2014-06-18_gtc_67': 6, '2014-06-18_gtc_68': 1, '2014-06-18_gtc_69': 1, '2014-06-18_gtc_70': 1, '2014-06-18_gtc_71': 3, '2014-06-18_gtc_72': 1, '2014-06-18_gtc_73': 1, '2014-06-18_gtc_74': 1, '2014-06-18_gtc_75': 1, '2014-06-18_gtc_76': 4, '2014-06-18_gtc_77': 3, '2016-06-08_gtc_1': 2, '2016-06-08_gtc_2': 9, '2016-06-08_gtc_3': 3, '2016-06-08_gtc_4': 1, '2016-06-08_gtc_5': 1, '2016-06-08_gtc_6': 1, '2016-06-08_gtc_7': 1, '2016-06-08_gtc_8': 1, '2016-06-08_gtc_9': 2, '2016-06-08_gtc_10': 8, '2016-06-08_gtc_11': 9, '2016-06-08_gtc_12': 6, '2016-06-08_gtc_13': 1, '2016-06-08_gtc_14': 1, '2016-06-08_gtc_15': 17, '2016-06-08_gtc_16': 1, '2016-06-08_gtc_17': 1, '2016-06-08_gtc_18': 1, '2016-06-08_gtc_19': 1, '2016-06-08_gtc_20': 1, '2016-06-08_gtc_21': 6, '2016-06-08_gtc_22': 14, '2016-06-08_gtc_23': 1, '2016-06-08_gtc_24': 1, '2016-06-08_gtc_25': 1, '2016-06-08_gtc_26': 1, '2016-06-08_gtc_27': 25, '2016-06-08_gtc_28': 1, '2016-06-08_gtc_29': 21, '2016-06-08_gtc_30': 1, '2016-06-08_gtc_31': 1, '2016-06-08_gtc_32': 4, '2016-06-08_gtc_33': 3, '2016-06-08_gtc_34': 1, '2016-06-08_gtc_35': 1, '2016-06-08_gtc_36': 1, '2016-06-08_gtc_37': 1, '2016-06-08_gtc_38': 11, '2016-06-08_gtc_39': 3, '2016-06-08_gtc_40': 1, '2016-06-08_gtc_41': 1, '2016-06-08_gtc_42': 3, '2016-06-08_gtc_43': 6, '2016-06-08_gtc_44': 14, '2016-06-08_gtc_45': 1, '2016-06-08_gtc_46': 1, '2016-06-08_gtc_47': 1, '2016-06-08_gtc_48': 14, '2016-06-08_gtc_49': 2, '2016-06-08_gtc_50': 1, '2016-06-08_gtc_51': 1, '2016-06-08_gtc_52': 1, '2016-06-08_gtc_53': 4, '2016-06-08_gtc_54': 14, '2016-06-08_gtc_55': 4, '2016-06-08_gtc_56': 1, '2016-06-08_gtc_57': 1, '2016-06-08_gtc_58': 1, '2016-06-08_gtc_59': 1, '2016-06-08_gtc_60': 7, '2016-06-08_gtc_61': 1, '2016-06-08_gtc_62': 1, '2016-06-08_gtc_63': 1, '2016-06-08_gtc_64': 1, '2016-06-08_gtc_65': 11, '2016-06-08_gtc_66': 4, '2016-06-08_gtc_67': 3, '2016-06-08_gtc_68': 1, '2016-06-08_gtc_69': 26, '2016-06-08_gtc_70': 1, '2016-06-08_gtc_71': 3, '2016-06-08_gtc_72': 2, '2016-06-08_gtc_73': 1, '2016-06-08_gtc_74': 89, '2016-06-08_gtc_75': 1, '2016-06-08_gtc_76': 1, '2016-06-08_gtc_77': 2, '2016-06-08_gtc_78': 1, '2016-06-08_gtc_79': 1, '2016-06-08_gtc_80': 8, '2016-06-08_gtc_81': 2, '2016-06-08_gtc_82': 6, '2016-06-08_gtc_83': 1, '2016-06-08_gtc_84': 3, '2016-06-08_gtc_85': 1, '2016-06-08_gtc_86': 1, '2016-06-08_gtc_87': 1, '2016-06-08_gtc_88': 1, '2016-06-08_gtc_89': 12, '2016-06-08_gtc_90': 1, '2016-06-08_gtc_91': 13, '2016-06-08_gtc_92': 5, '2016-06-08_gtc_93': 1, '2016-06-08_gtc_94': 7, '2016-06-08_gtc_95': 1, '2016-06-08_gtc_96': 4, '2016-06-08_gtc_97': 1, '2016-06-08_gtc_98': 4, '2016-06-08_gtc_99': 1, '2016-06-08_gtc_100': 13, '2016-06-08_gtc_101': 1, '2016-06-08_gtc_102': 3, '2016-06-08_gtc_103': 1, '2016-06-08_gtc_104': 10, '2016-06-08_gtc_105': 1}

calculate_variation_of_information(contingency_table, counts_predicted_clusters, counts_gt_clusters)

81.69   1 - Scaled VI


## One-to-One Overlap

In [16]:
from ortools.graph import pywrapgraph

def one_to_one(contingency, row_sums, col_sums):
    row_to_num = {}
    col_to_num = {}
    num_to_row = []
    num_to_col = []
    for row_num, row in enumerate(row_sums):
        row_to_num[row] = row_num
        num_to_row.append(row)
    for col_num, col in enumerate(col_sums):
        col_to_num[col] = col_num
        num_to_col.append(col)

    min_cost_flow = pywrapgraph.SimpleMinCostFlow()
    start_nodes = []
    end_nodes = []
    capacities = []
    costs = []
    source = len(num_to_row) + len(num_to_col)
    sink = len(num_to_row) + len(num_to_col) + 1
    supplies = []
    tasks = min(len(num_to_row), len(num_to_col))
    for row, row_num in row_to_num.items():
        start_nodes.append(source)
        end_nodes.append(row_num)
        capacities.append(1)
        costs.append(0)
        supplies.append(0)
    for col, col_num in col_to_num.items():
        start_nodes.append(col_num + len(num_to_row))
        end_nodes.append(sink)
        capacities.append(1)
        costs.append(0)
        supplies.append(0)
    supplies.append(tasks)
    supplies.append(-tasks)
    for row, row_num in row_to_num.items():
        for col, col_num in col_to_num.items():
            cost = 0
            if col in contingency[row]:
                cost = - contingency[row][col]
            start_nodes.append(row_num)
            end_nodes.append(col_num + len(num_to_row))
            capacities.append(1)
            costs.append(cost)

    # Add each arc.
    for i in range(len(start_nodes)):
        min_cost_flow.AddArcWithCapacityAndUnitCost(start_nodes[i], end_nodes[i],
                                                    capacities[i], costs[i])
  
    # Add node supplies.
    for i in range(len(supplies)):
        min_cost_flow.SetNodeSupply(i, supplies[i])

    # Find the minimum cost flow.
    min_cost_flow.Solve()

    # Score.
    total_count = sum(v for _, v in row_sums.items())
    overlap = 0
    for arc in range(min_cost_flow.NumArcs()):
        # Can ignore arcs leading out of source or into sink.
        if min_cost_flow.Tail(arc)!=source and min_cost_flow.Head(arc)!=sink:
            # Arcs in the solution have a flow value of 1. Their start and end nodes
            # give an assignment of worker to task.
            if min_cost_flow.Flow(arc) > 0:
                row_num = min_cost_flow.Tail(arc)
                col_num = min_cost_flow.Head(arc)
                col = num_to_col[col_num - len(num_to_row)]
                row = num_to_row[row_num]
                if col in contingency[row]:
                    overlap += contingency[row][col]
    print("{:5.2f}   one-to-one".format(overlap * 100 / total_count))

In [19]:
one_to_one(contingency_table, counts_predicted_clusters, counts_gt_clusters)

52.88   one-to-one


## Exact Match score(P/R/F)

In [17]:
def exact_match(gold, auto, skip_single=True):
    # P/R/F over complete clusters
    total_gold = 0
    total_matched = 0
    for filename in gold:
        for cluster in gold[filename].values():
            if skip_single and len(cluster) == 1:
                continue
            total_gold += 1
            matched = False
            for ocluster in auto[filename].values():
                if len(set(ocluster).symmetric_difference(set(cluster))) == 0:
                    matched = True
                    break
            if matched:
                total_matched += 1
    match = []
    subsets = []
    supersets = []
    other = []
    prefix = []
    suffix = []
    gap_free = []
    match_counts = []
    subsets_counts = []
    supersets_counts = []
    other_counts = []
    prefix_counts = []
    suffix_counts = []
    gap_free_counts = []
    total_auto = 0
    for filename in auto:
        for cluster in auto[filename].values():
            if skip_single and len(cluster) == 1:
                continue
            total_auto += 1
            most_overlap = 0
            fraction = 0
            count = 0
            is_subset = False
            is_superset = False
            is_prefix = False
            is_suffix = False
            is_gap_free = False
            is_match = False
            for ocluster in gold[filename].values():
                if len(set(ocluster).symmetric_difference(set(cluster))) == 0:
                    is_match = True
                    break

                overlap = len(set(ocluster).intersection(set(cluster)))
                if overlap > most_overlap:
                    most_overlap = overlap
                    gaps = False
                    for v in ocluster:
                        if min(cluster) <= v <= max(cluster):
                            if v not in cluster:
                                gaps = True
                    fraction = 1 - (overlap / len(set(ocluster).union(set(cluster))))
                    count = len(set(ocluster).union(set(cluster))) - overlap

                    is_subset = (overlap == len(cluster))
                    is_superset = (overlap == len(ocluster))
                    if overlap == len(cluster) and (not gaps):
                        is_gap_free = True
                        if min(ocluster) == min(cluster):
                            is_prefix = True
                        if max(ocluster) == max(cluster):
                            is_suffix = True
            if is_match:
                match.append(fraction)
                match_counts.append(count)
            elif is_superset:
                supersets.append(fraction)
                supersets_counts.append(count)
            elif is_subset:
                subsets.append(fraction)
                subsets_counts.append(count)
                if is_prefix:
                    prefix.append(fraction)
                    prefix_counts.append(count)
                elif is_suffix:
                    suffix.append(fraction)
                    suffix_counts.append(count)
                elif is_gap_free:
                    gap_free.append(fraction)
                    gap_free_counts.append(count)
            else:
                other.append(fraction)
                other_counts.append(count)
    print("Property, Proportion, Av Frac, Av Count, Max Count, Min Count")
    if len(match) > 0:
        print("Match        {:5.2f} {:5.2f} {:5.2f}".format(100 * len(match) / total_auto, 100 * sum(match) / len(match), sum(match_counts) / len(match)), max(match_counts), min(match_counts))
    if len(supersets) > 0:
        print("Super        {:5.2f} {:5.2f} {:5.2f}".format(100 * len(supersets) / total_auto, 100 * sum(supersets) / len(supersets), sum(supersets_counts) / len(supersets)), max(supersets_counts), min(supersets_counts))
    if len(subsets) > 0:
        print("Sub          {:5.2f} {:5.2f} {:5.2f}".format(100 * len(subsets) / total_auto, 100 * sum(subsets) / len(subsets), sum(subsets_counts) / len(subsets)), max(subsets_counts), min(subsets_counts))
    if len(prefix) > 0:
        print("Sub-Prefix   {:5.2f} {:5.2f} {:5.2f}".format(100 * len(prefix) / total_auto, 100 * sum(prefix) / len(prefix), sum(prefix_counts) / len(prefix)))
    if len(suffix) > 0:
        print("Sub-Suffix   {:5.2f} {:5.2f} {:5.2f}".format(100 * len(suffix) / total_auto, 100 * sum(suffix) / len(suffix), sum(suffix_counts) / len(suffix)))
    if len(gap_free) > 0:
        print("Sub-GapFree  {:5.2f} {:5.2f} {:5.2f}".format(100 * len(gap_free) / total_auto, 100 * sum(gap_free) / len(gap_free), sum(gap_free_counts) / len(gap_free)))
    if len(other) > 0:
        print("Other        {:5.2f} {:5.2f} {:5.2f}".format(100 * len(other) / total_auto, 100 * sum(other) / len(other), sum(other_counts) / len(other)))

    p, r, f = 0.0, 0.0, 0.0
    if total_auto > 0:
        p = 100 * total_matched / total_auto
    if total_gold > 0:
        r = 100 * total_matched / total_gold
    if total_matched > 0:
        f = 2 * p * r / (p + r)
    print("{:5.2f}   Matched clusters precision".format(p))
    print("{:5.2f}   Matched clusters recall".format(r))
    print("{:5.2f}   Matched clusters f-score".format(f))

In [18]:
exact_match(all_ground_truth_clusters_as_numbers, all_predicted_clusters_as_numbers, skip_single=True)

Property, Proportion, Av Frac, Av Count, Max Count, Min Count
Match         9.38  0.00  0.00 0 0
Super        20.08 54.06  2.44 17 1
Sub          27.95 63.71 24.77 188 1
Sub-Prefix    5.63 57.57  6.43
Sub-Suffix    4.50 55.31  7.17
Sub-GapFree   3.38 75.36 24.28
Other        42.59 72.82 32.70
 9.38   Matched clusters precision
14.01   Matched clusters recall
11.24   Matched clusters f-score
