In [1]:
import numpy as np
import torch
import torch.nn as nn
from IPython.display import display, HTML
from transformers import DistilBertModel, DistilBertTokenizer, logging
import matplotlib
import matplotlib.pyplot as plt
from captum.attr import visualization as viz
from captum.attr import IntegratedGradients, LayerActivation, LayerIntegratedGradients 
from captum.attr import configure_interpretable_embedding_layer, remove_interpretable_embedding_layer
from captum.attr import Occlusion, FeatureAblation, ShapleyValueSampling
from captum.attr import LayerFeatureAblation
from captum.attr import LayerGradientXActivation
from captum._utils.models.linear_model import SkLearnLinearRegression, SkLearnLasso
import time
import pickle
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
import spacy
from spacy import displacy
import seaborn as sns
import pandas as pd
import numpy as np
import collections
from bs4 import BeautifulSoup
import requests
nlp = spacy.load('en_core_web_trf')
logging.set_verbosity_error()
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load model and set seed

In [3]:
# Bert mode
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

class BERT(nn.Module):
    def __init__(self, bert):
        
        super(BERT, self).__init__()
        
        # Distil Bert model
        self.bert = bert
        ## Additional layers
        # Dropout layer
        self.dropout = nn.Dropout(0.3)
        # Relu 
        self.relu =  nn.ReLU()
        # Linear I 
        self.fc1 = nn.Linear(768, 512)
        # Linear II (Out)
        #self.fc2 = nn.Linear(512, 170)
        self.fc2 = nn.Linear(512, 1881)
        # Softmax
        self.softmax = nn.LogSoftmax(dim=1)


    # Forward pass
    def forward(self, **kwargs):

        # Pass data trough bert and extract 
        cls_hs = self.bert(**kwargs)
        # Extract hidden state
        hidden_state = cls_hs.last_hidden_state
        # Only first is needed for classification
        pooler = hidden_state[:, 0]
        
        # Dense layer 1        
        x = self.fc1(pooler)
        # ReLU activation
        x = self.relu(x)
        # Drop out
        x = self.dropout(x)
        # Dense layer 2
        x = self.fc2(x)
        # Activation
        x = self.softmax(x)

        return x
    
# Load the entire model
model = BERT(bert)

# Load trained model (colab)
try:
    try:
        model_save_name = 'saved_weights_CHUNKS_BIRDS_1881.pt'
        path = F"/content/gdrive/My Drive/{model_save_name}"
        model.load_state_dict(torch.load(path))
        print('Google Success')

    except:
        model_save_name = 'saved_weights_BIRDS_1881.pt'
        path = "../../models/" + model_save_name
        model.load_state_dict(torch.load(path, 
                                         map_location=torch.device('cpu')))
        print('Local Success')
except:
    print('No pretrained model found.')
    
model.to(device)
model.eval()
model.zero_grad()

Local Success


In [4]:
# Set seeds
torch.manual_seed(333)
np.random.seed(333)

### Function and tokens

In [5]:
ref_token_id = tokenizer.pad_token_id # A token used for generating token reference
sep_token_id = tokenizer.sep_token_id # A token used as a separator between question and text and it is also added to the end of the text.
cls_token_id = tokenizer.cls_token_id # A token used for prepending to the concatenated question-text word sequence

In [6]:
# Modify the prediction output and define a custom forward
def predict(inputs, attentions):
    return model(input_ids=inputs, attention_mask=attentions)[0]

def custom_forward(inputs, attentions):
    preds = predict(inputs, attentions)
    return torch.exp(preds)

In [7]:
# Tokenize functions
def construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id):

    text_ids = tokenizer.encode(text, add_special_tokens=False)
    # construct input token ids
    input_ids = [cls_token_id] + text_ids + [sep_token_id]
    # construct reference token ids 
    ref_input_ids = [cls_token_id] + [ref_token_id] * len(text_ids) + [sep_token_id]

    return torch.tensor([input_ids], device=device), torch.tensor([ref_input_ids], device=device), len(text_ids)

def construct_input_ref_token_type_pair(input_ids, sep_ind=0):
    seq_len = input_ids.size(1)
    token_type_ids = torch.tensor([[0 if i <= sep_ind else 1 for i in range(seq_len)]], device=device)
    ref_token_type_ids = torch.zeros_like(token_type_ids, device=device)# * -1
    return token_type_ids, ref_token_type_ids

def construct_input_ref_pos_id_pair(input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
    # we could potentially also use random permutation with `torch.randperm(seq_length, device=device)`
    ref_position_ids = torch.zeros(seq_length, dtype=torch.long, device=device)

    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    ref_position_ids = ref_position_ids.unsqueeze(0).expand_as(input_ids)
    return position_ids, ref_position_ids
    
def construct_attention_mask(input_ids):
    return torch.ones_like(input_ids)

In [8]:
# Summarize and vis functions
def summarize_attributions_ig(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

def summarize_attributions_occ(attributions):
    return attributions.sum(axis=0)

def token_to_words(attribution, tokens):
    
    words = []
    attributes = []

    for attribute, word in zip(attribution, tokens):

        attribute = attribute.cpu().detach().numpy()
        if word == '[CLS]' or word == '[SEP]':
            words.append(word)
            attributes.append([attribute])
        elif not word.startswith('##'):
            words.append(word)
            attributes.append([attribute])
        elif word.startswith('##'):
            words[-1] += word.strip('##')
            attributes[-1] = np.append(attributes[-1], attribute)

    attribution = [np.sum(mean) for mean in attributes]
    return attribution, words

def colorize(attribution, tokens):
    
    template = """  
    <mark class="entity" style="
    background: {}; 
    padding: 0.4em 0.0em; 
    margin: 0.0em; 
    line-height: 2; 
    border-radius: 0.0em;
    ">{}<span style="
    font-size: 0.8em; 
    font-weight: bold; 
    line-height: 1; 
    border-radius: 0.0em;   
    text-align-last:center;
    vertical-align: middle;
    margin-left: 0rem;
    "></span></mark>
    """

    colored_string = ''
    normalized_and_mapped = matplotlib.cm.ScalarMappable(cmap=matplotlib.cm.Greens).to_rgba(attribution)
    for idx, (word, color) in enumerate(zip(tokens, normalized_and_mapped)):
        
        word = word + ' '
        color = matplotlib.colors.rgb2hex(color[:3])
        if word.strip() == '[CLS]' or word.strip() == '[SEP]': 
            color = '#ffffff'
        #print(color)
        colored_string += template.format(color, word)

    return colored_string

In [9]:
lig = LayerIntegratedGradients(custom_forward, model.bert.embeddings)
occ = Occlusion(custom_forward)
ablator = FeatureAblation(custom_forward)
#lfa = LayerFeatureAblation(custom_forward, model.bert.embeddings)
SVS = ShapleyValueSampling(custom_forward)
la = LayerActivation(custom_forward, model.bert.embeddings)
lgxa = LayerGradientXActivation(custom_forward, model.bert.embeddings)

## Single Example

In [10]:
datadict = pickle.load(open('../../data/description/04_TRAIN_0000000-0001881_TRAITS_CHUNK_BIRDS.pkl', 'rb'))
text_list = [data for data in datadict['Ovenbird']]

In [11]:
text = 'Orange legs with long nails.'

In [12]:
input_ids, ref_input_ids, sep_id = construct_input_ref_pair(text, ref_token_id, sep_token_id, cls_token_id)
token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
attention_mask = construct_attention_mask(input_ids)

#indices = input_ids[0].detach().tolist()
indices = input_ids[0].type(torch.LongTensor)
all_tokens = tokenizer.convert_ids_to_tokens(indices)

In [13]:
# Integrated Gradients
attribution_ig = lig.attribute(inputs=(input_ids, attention_mask),
                                baselines=(ref_input_ids, attention_mask),
                                n_steps=20,
                                internal_batch_size=1,
                                return_convergence_delta=False)

In [14]:
# Integrated Gradients
attribution_ig_100 = lig.attribute(inputs=(input_ids, attention_mask),
                                baselines=(ref_input_ids, attention_mask),
                                n_steps=50,
                                internal_batch_size=1,
                                return_convergence_delta=False)

In [15]:
# Occlusion maps
attribution_occ = occ.attribute(inputs=(input_ids, attention_mask),
                                sliding_window_shapes=(tuple([1,]), tuple([1,])), 
                                strides=None, 
                                baselines=(ref_input_ids, attention_mask), 
                                target=None, 
                                additional_forward_args=None, 
                                perturbations_per_eval=1, 
                                show_progress=True)

Occlusion attribution: 100%|████████████████████| 17/17 [00:00<00:00, 31.12it/s]


In [16]:
# Occlusion maps
attribution_occ2 = occ.attribute(inputs=(input_ids, attention_mask),
                                sliding_window_shapes=(tuple([3,]), tuple([3,])), 
                                strides=(2, 2), 
                                baselines=(ref_input_ids, attention_mask), 
                                target=None, 
                                additional_forward_args=None, 
                                perturbations_per_eval=1, 
                                show_progress=True)

Occlusion attribution: 100%|██████████████████████| 9/9 [00:00<00:00, 31.11it/s]


In [17]:
attribution_abl = ablator.attribute(inputs=(input_ids, attention_mask), 
                                    baselines=(ref_input_ids, attention_mask),)

In [18]:
attribution_SVS = SVS.attribute(inputs=(input_ids, attention_mask),
                                baselines=(ref_input_ids, attention_mask), 
                               show_progress=True)

Shapley Value Sampling attribution: 100%|█████| 401/401 [00:12<00:00, 31.92it/s]


In [19]:
attribution_la = la.attribute(inputs=(input_ids, attention_mask),)

In [20]:
attribution_lgxa = lgxa.attribute(inputs=input_ids,
                                 additional_forward_args=attention_mask,
                                 attribute_to_layer_input=False)

In [21]:
#attribution_lfa = lfa.attribute(inputs=input_ids,
#                                 additional_forward_args=attention_mask,
#                                attribute_to_layer_input=False)

In [22]:
# Sum the matrices
attributions_ig_sum = summarize_attributions_ig(attribution_ig)
attribution_ig_100_sum = summarize_attributions_ig(attribution_ig_100)
attributions_occ_sum = summarize_attributions_occ(attribution_occ[0])
attribution_occ2_sum = summarize_attributions_occ(attribution_occ2[0])
attributions_abl_sum = summarize_attributions_occ(attribution_abl[0])
attributions_SVS_sum = attribution_SVS[0][0]
attributions_la_sum = summarize_attributions_ig(attribution_la)
attributions_lgxa_sum = summarize_attributions_ig(attribution_lgxa)


In [23]:
# Token to words
# Words are the same in this case
attributions_ig_words, words = token_to_words(attributions_ig_sum, all_tokens)
attributions_ig_100_words, words = token_to_words(attribution_ig_100_sum, all_tokens)
attributions_occ_words, words = token_to_words(attributions_occ_sum, all_tokens)
attributions_occ2_words, words = token_to_words(attribution_occ2_sum, all_tokens)
attributions_abl_words, words = token_to_words(attributions_occ_sum, all_tokens)
attributions_SVS_words, words = token_to_words(attributions_SVS_sum, all_tokens)
attributions_la_words, words = token_to_words(attributions_la_sum, all_tokens)
attributions_lgxa_words, words = token_to_words(attributions_lgxa_sum, all_tokens)


In [24]:
# IG
print('Layer Integrated Gradients')
string = colorize(attributions_ig_words, words)
display(HTML(string))
# IG
print('Layer Integrated Gradients (100 steps)')
string = colorize(attributions_ig_100_words, words)
display(HTML(string))
# OCC
print('Occlusion')
string = colorize(attributions_occ_words, words)
display(HTML(string))
# OCC
print('Occlusion (Window 3, stride 2)')
string = colorize(attributions_occ2_words, words)
display(HTML(string))
# Ablator
print('Feature Ablation')
string = colorize(attributions_abl_words, words)
display(HTML(string))
# Ablator
print('Shapley Value Sampling')
string = colorize(attributions_SVS_words, words)
display(HTML(string))
# Ablator
print('Layer Activation')
string = colorize(attributions_la_words, words)
display(HTML(string))
# Layer Gradient X Activation
print('Layer Gradient X Activation')
string = colorize(attributions_lgxa_words, words)
display(HTML(string))

Layer Integrated Gradients


Layer Integrated Gradients (100 steps)


Occlusion


Occlusion (Window 3, stride 2)


Feature Ablation


Shapley Value Sampling


Layer Activation


Layer Gradient X Activation


In [25]:
def explain(word):
    
    data = collections.defaultdict(list)
       
    # tokenize
    input_ids, ref_input_ids, sep_id = construct_input_ref_pair(word, ref_token_id, sep_token_id, cls_token_id)
    token_type_ids, ref_token_type_ids = construct_input_ref_token_type_pair(input_ids, sep_id)
    position_ids, ref_position_ids = construct_input_ref_pos_id_pair(input_ids)
    attention_mask = construct_attention_mask(input_ids)
    indices = input_ids[0].type(torch.LongTensor)
    all_tokens = tokenizer.convert_ids_to_tokens(indices)
    
    #### Layer IG 20 steps
    attribution_ig = lig.attribute(inputs=(input_ids, attention_mask),
                                    baselines=(ref_input_ids, attention_mask),
                                    n_steps=20,
                                    internal_batch_size=1,
                                    return_convergence_delta=False)
    attributions_ig_sum = summarize_attributions_ig(attribution_ig)
    attributions_ig_words, words = token_to_words(attributions_ig_sum, all_tokens)
    data['Words'] = words
    data['IG_20'] = attributions_ig_words
    ####
    
    #### Layer IG 100 steps
    attribution_ig_100 = lig.attribute(inputs=(input_ids, attention_mask),
                                    baselines=(ref_input_ids, attention_mask),
                                    n_steps=50,
                                    internal_batch_size=1,
                                    return_convergence_delta=False)
    attribution_ig_100_sum = summarize_attributions_ig(attribution_ig_100)
    attributions_ig_100_words, _ = token_to_words(attribution_ig_100_sum, all_tokens)
    #print(attribution_ig_100_sum)
    data['IG_100'] = attributions_ig_100_words
    ####
    
    ### Occlusion maps 
    attribution_occ = occ.attribute(inputs=(input_ids, attention_mask),
                                sliding_window_shapes=(tuple([1,]), tuple([1,])), 
                                baselines=(ref_input_ids, attention_mask),)
    attributions_occ_sum = summarize_attributions_occ(attribution_occ[0])
    attributions_occ_words, _ = token_to_words(attributions_occ_sum, all_tokens)
    data['Occ_1-1'] = attributions_occ_words
    #### Occlusion maps (3, 2)
    attribution_occ2 = occ.attribute(inputs=(input_ids, attention_mask),
                                sliding_window_shapes=(tuple([3,]), tuple([3,])), 
                                strides=(2, 2), 
                                baselines=(ref_input_ids, attention_mask),)
    attribution_occ2_sum = summarize_attributions_occ(attribution_occ2[0])
    attributions_occ2_words, _ = token_to_words(attribution_occ2_sum, all_tokens)
    data['Occ_3-2'] = attributions_occ2_words
    ####
    
    #### Shapley Value Sampling
    attribution_SVS = SVS.attribute(inputs=(input_ids, attention_mask),
                                baselines=(ref_input_ids, attention_mask),)
    attributions_SVS_sum = attribution_SVS[0][0]
    attributions_SVS_words, _ = token_to_words(attributions_SVS_sum, all_tokens)
    data['SVS'] = attributions_SVS_words
    ####
    
    ### Layer Activation
    attribution_la = la.attribute(inputs=(input_ids, attention_mask),)
    attributions_la_sum = summarize_attributions_ig(attribution_la)
    attributions_la_words, _ = token_to_words(attributions_la_sum, all_tokens)
    data['LA'] = attributions_la_words
    
    return data

In [26]:
# URL
URL = 'https://en.wikipedia.org/wiki/Glossary_of_bird_terms'
# Get the page
page = requests.get(URL, timeout=5)
soup = BeautifulSoup(page.content, "lxml", from_encoding="iso-8859-1")   
# Find embedded glossary
glossaries = soup.find_all('dt', {'class': 'glossary'})
parts = [part.text.lower().strip() for part in glossaries]
# Get additional anchors ("also know as...")
glossaries_other = soup.find_all('span', {'class': 'anchor'})
parts_other = [part['id'].lower().strip() for part in glossaries_other]
# Append and drop duplicates
parts = list(set((parts + parts_other)))
# Replace underscore with space
parts = [part.replace('_', ' ') for part in parts]

In [None]:
text_list = ['The plumage is backish with blue stripes',
             'Orange legs with long nails.',
             'This is jibberish']

In [None]:
#explain('Orange legs with long nails.')

In [None]:
datadict = pickle.load(open('../../data/description/04_TRAIN_0000000-0001881_TRAITS_CHUNK_BIRDS.pkl', 'rb'))
text_list = [data for data in datadict['Ovenbird']]

In [None]:
data = collections.defaultdict(list)
for idx, text in enumerate(tqdm_notebook(text_list[0:5])):
    d = explain(text)
    d['Sentence'] = len(d['Words']) * [idx]
    for key in d.keys():
        data[key] += d[key]

In [None]:
#data

In [None]:
df = pd.DataFrame.from_dict(data)

In [None]:
df

In [None]:
data_random = []

for idx in tqdm_notebook(df['Sentence'].unique()):
#for idx in range(50, 51):
    doc = nlp(text_list[idx])
    if len(doc) <= 3:
        continue
    # Check single
    words = [chunk.root.lemma_.lower() for chunk in doc.noun_chunks] 
    # Check multiple
    words += [chunk.root.text.lower() for chunk in doc.noun_chunks]
    # Drop duplicate
    words = list(set(words))
    #print(words)
    #print(words)
    traits =  set(words) & set(parts)
    #print(traits)
    if traits:
        # Yield the traits
        trait_list = list(traits)
        #print(trait_list)
        for trait in trait_list:
            for column in df.columns[1:-1]:
                index = df[df['Sentence'] == idx][column].sort_values(ascending=False)
                data_random.append((idx, column, df.iloc[index.index[0]].Words, trait))
                #print(f'{idx} -- {column} -- {df.iloc[index.index[0]].Words}')
                

## Annotations

In [25]:
import re

In [26]:
# Open the text file
location = "../../data/external/CUB_200_2011/CUB_200_2011/classes.txt"
with open(location) as f:
    lines = f.readlines()
# Init regex
regex = r'\d+\s\d+\.'
# Clean the list
CUB = [re.sub(regex, '', line).rstrip().replace('_', ' ') for line in lines]

In [27]:
# Open the text file
location = "../../data/external/CUB_200_2011/attributes.txt"
with open(location) as f:
    lines = f.readlines()

In [28]:
attribute_list = []

# Create a DF with attributes
for line in lines:
    line = line.strip()
    attributes = line.split()[1].split('::')
    part = attributes[0]
    adjective = attributes[1]
    attribute_list.append((part, adjective))
    
# Create DF        
df_attributes = pd.DataFrame(attribute_list, columns=['Part', 'Adj'])
# Reset the index
df_attributes.index = np.arange(1, len(df_attributes)+1)

In [29]:
# Open the text file
location = "../../data/external/CUB_200_2011/CUB_200_2011/attributes/image_attribute_labels.txt"
with open(location) as f:
    lines = f.readlines()
# Open the text file
location = "../../data/external/CUB_200_2011/CUB_200_2011/images.txt"
with open(location) as f:
    img_ids = f.readlines()

In [30]:
# Create DF will all present parts
CUB_dict = collections.defaultdict(list)
# Loop over lines
for line in tqdm(lines):
    data = line.split()
    # Skip non present
    if not int(data[2]):
        continue
    # Skip uncertain things
    if int(data[3]) < 3:
        continue
    # Otherwise append
    #print(CUB[int(img_ids[int(data[0])].split()[1][0:3]) - 1], data)
    CUB_dict[CUB[int(img_ids[int(data[0])-1].split()[1][0:3]) - 1]].append((df_attributes.loc[int(data[1]), ][0], df_attributes.loc[int(data[1]), ][1]))

100%|██████████████████████████████| 3677856/3677856 [01:46<00:00, 34516.07it/s]


In [31]:
# Drop duplicates
for bird in CUB_dict.keys():
    CUB_dict[bird] = list(set(CUB_dict[bird]))

In [32]:
description_CUB_dict = collections.defaultdict(list)
# Rewrite the sentence
for (part, adjective) in CUB_dict['Ovenbird']:
    part = part[4:].replace('_', ' ')

    description_CUB_dict['Overbird'].append((part, adjective))

In [None]:
#description_CUB_dict

## Similarity

In [45]:
import sys
sys.path.insert(0, '../../src/models/')
sys.path.insert(0, '../../src/features/')

from build_features import similarity_matrix as vector_values
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# create DataFrame using data
df_random = pd.DataFrame(data_random, columns =['Sentence', 'Attribution', 'Adjective', 'Part'])
df_cub = pd.DataFrame(description_CUB_dict['Overbird'], columns =['Part', 'Adjective'])

# Create trait
df_random['Trait'] = df_random['Part'] + ' ' + df_random['Adjective']
df_cub['Trait'] = df_cub['Part'] + ' ' + df_cub['Adjective']

# Create similarity column
df_random['Similarity'] = np.nan

In [None]:
df_random['Part'].unique()

In [None]:
CUB_data =  df_cub[df_cub['Trait'].str.startswith('upperparts')]['Trait']
random_data = df_random[df_random['Trait'].str.startswith('upperparts')]['Trait']

In [None]:
vv_random = vector_values(random_data)
vv_cub    = vector_values(CUB_data)

In [None]:
matrix = cosine_similarity(vv_random, vv_cub)
# Clear memory
del vv_random
del vv_cub

In [None]:
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(len(random_data), len(CUB_data)))
    ax = sns.heatmap(matrix, 
                     #mask=mask, 
                     square=True,
                     annot=True,
                     cbar=False,
                     cmap=cmap,
                     linewidths=.5,)
    


## Loop it

In [None]:
# create DataFrame using data
df_random = pd.DataFrame(data_random, columns =['Sentence', 'Attribution', 'Adjective', 'Part'])
df_cub = pd.DataFrame(description_CUB_dict['Overbird'], columns =['Part', 'Adjective'])

# Create trait
df_random['Trait'] = df_random['Part'] + ' ' + df_random['Adjective']
df_cub['Trait'] = df_cub['Part'] + ' ' + df_cub['Adjective']

# Create similarity column
df_random['Similarity'] = np.nan

In [None]:
# Loop over parts:
for part in tqdm(df_random['Part'].unique()):
    
    # Extract parts
    CUB_data =  df_cub[df_cub['Trait'].str.startswith(part)]['Trait']
    random_data = df_random[df_random['Trait'].str.startswith(part)]['Trait']
    
    # Check lenghts
    if len(CUB_data) == 0:
        continue
    
    # Compute Vector values
    vv_random = vector_values(random_data)
    vv_cub    = vector_values(CUB_data)
    
    # Get Matrix
    matrix = cosine_similarity(vv_random, vv_cub)
    # Clear memory
    del vv_random
    del vv_cub
    
    # Update the dataframe
    df_random.loc[random_data.index, 'Similarity'] = np.mean(matrix, axis=1)

In [None]:
df_sim = df_random.groupby('Attribution')['Similarity'].apply(list).reset_index(name='Values')
df_sim = df_sim.set_index('Attribution')

In [None]:
df_random.groupby('Attribution')['Similarity'].apply(list).index


In [None]:
df_random.groupby('Attribution')['Similarity'].apply(list).hist()

# LOOP EVERYTHING

In [33]:
import glob
import re

### CUB

In [34]:
# Open the text file
location = "../../data/external/CUB_200_2011/CUB_200_2011/classes.txt"
with open(location) as f:
    lines = f.readlines()

# Init regex
regex = r'\d+\s\d+\.'
# Clean the list
CUB = [re.sub(regex, '', line).rstrip().replace('_', ' ') for line in lines]
# Read the BOWO list to compare names
BOWO_folder_list = glob.glob('../../data/raw/BOW/*')
# Clean the lists
BOW_all = [bird[19:] for bird in BOWO_folder_list]
# Intersect common names
common_birds = list(set(CUB) & set(BOW_all))



### Descriptions

In [35]:
datadict = pickle.load(open('../../data/description/04_TRAIN_0000000-0001881_TRAITS_CHUNK_BIRDS.pkl', 'rb'))

In [None]:
attribution = collections.defaultdict(list)
idx = 0

# Loop over common birds
for bird in tqdm_notebook(common_birds[0:4], desc='Bird'):
    # Get descriptions
    text_list = [data for data in datadict[bird]]
    # Get attribution values
    for text in tqdm_notebook(text_list, desc='Sentences', leave=False):
        d = explain(text)
        d['Sentence'] = len(d['Words']) * [idx]
        d['Bird'] = len(d['Words']) * [bird]
        for key in d.keys():
            attribution[key] += d[key]
        
        idx += 1

In [None]:
with open('TEMP_ATTRIBUTION.pkl', 'wb') as f:
    pickle.dump(attribution, f)

In [37]:
attribution = pickle.load(open('TEMP_ATTRIBUTION.pkl', 'rb'))

In [38]:
df = pd.DataFrame.from_dict(attribution)

In [41]:
data_random = []

for idx in tqdm_notebook(df['Sentence'].unique()):
#for idx in range(50, 51):
    #doc = nlp(text_list[idx])
    doc = nlp(' '.join(df[df['Sentence'] == idx]['Words']))
    if len(doc) <= 3:
        continue
    # Check single
    words = [chunk.root.lemma_.lower() for chunk in doc.noun_chunks] 
    # Check multiple
    words += [chunk.root.text.lower() for chunk in doc.noun_chunks]
    # Drop duplicate
    words = list(set(words))
    #print(words)
    #print(words)
    traits =  set(words) & set(parts)
    #print(traits)
    if traits:
        # Yield the traits
        trait_list = list(traits)
        #print(trait_list)
        for trait in trait_list:
            for column in df.columns[1:-1]:
                index = df[df['Sentence'] == idx][column].sort_values(ascending=False)
                data_random.append((idx, column, df.iloc[index.index[0]].Words, trait, df.iloc[index.index[0]].Bird))
                #print(f'{idx} -- {column} -- {df.iloc[index.index[0]].Words}')

  0%|          | 0/497 [00:00<?, ?it/s]

In [42]:
description_CUB_dict = []
# Rewrite the sentence
for bird in df.Bird.unique(): 
    for (part, adjective) in CUB_dict[bird]:
        part = part[4:].replace('_', ' ')

        description_CUB_dict.append((part, adjective, bird))

In [43]:
# create DataFrame using data
df_random = pd.DataFrame(data_random, columns =['Sentence', 'Attribution', 'Adjective', 'Part', 'Bird'])
df_cub = pd.DataFrame(description_CUB_dict, columns =['Part', 'Adjective', 'Bird'])

# Create trait
df_random['Trait'] = df_random['Part'] + ' ' + df_random['Adjective']
df_cub['Trait'] = df_cub['Part'] + ' ' + df_cub['Adjective']

# Create similarity column
df_random['Similarity'] = np.nan

In [105]:
#del vv_random
#del vv_cub

In [None]:
for bird in tqdm_notebook(df_random['Bird'].unique(), desc='Bird'):

    # Loop over parts:
    for part in tqdm_notebook(df_random['Part'].unique(), desc='Trait', leave=False):

        # Extract parts
        CUB_data =  df_cub[(df_cub['Trait'].str.startswith(part)) & (df_cub['Bird'] == bird)]['Trait']
        random_data = df_random[(df_random['Trait'].str.startswith(part)) & (df_random['Bird'] == bird)]['Trait']
        

        # SKip short sentences
        if len(CUB_data) <=7:
            continue
        if len(random_data) <= 7:
            continue

        # Compute Vector values
        vv_random = vector_values(random_data)
        vv_cub    = vector_values(CUB_data)

        # Get Matrix
        matrix = cosine_similarity(vv_random, vv_cub)
        # Clear memory
        del vv_random
        del vv_cub

        # Update the dataframe
        df_random.loc[random_data.index, 'Similarity'] = np.mean(matrix, axis=1)

Bird:   0%|          | 0/4 [00:00<?, ?it/s]

Trait:   0%|          | 0/74 [00:00<?, ?it/s]

Trait:   0%|          | 0/74 [00:00<?, ?it/s]

Trait:   0%|          | 0/74 [00:00<?, ?it/s]

Trait:   0%|          | 0/74 [00:00<?, ?it/s]

In [102]:
df_random['Similarity']

0       0.774500
1       0.774500
2       0.821627
3       0.740408
4       0.780489
          ...   
6694         NaN
6695         NaN
6696         NaN
6697         NaN
6698         NaN
Name: Similarity, Length: 6699, dtype: float64

In [82]:
df_random[(df_random['Trait'].str.startswith('rem')) & (df_random['Bird'] == bird)]['Trait']

Series([], Name: Trait, dtype: object)

# Plotting

In [None]:
import seaborn as sns

In [None]:
plotdata = collections.defaultdict(list)

In [None]:
for key in df_random.groupby('Attribution')['Similarity'].apply(list).keys():
    plotdata[key] = df_random.groupby('Attribution')['Similarity'].apply(list)[key]

In [None]:
melt = pd.melt(df.from_dict(plotdata))

In [None]:
sns.ecdfplot(data=melt, x="value", hue="variable")