In [84]:
# change file structure: make the parent directory importable
import sys
sys.path.append('../')

# import packages
import random
import glob
import pickle
import os
import sys
import argparse
from itertools import combinations
from scipy import spatial
from metrics import rsa
from data import one_hot, generate_dataset
from tqdm import tqdm
import numpy as np

### Similarity functions

In [26]:
def compute_jaccard_distance_score(x, y):
    """
    Jaccard distance between two collections, compared as sets.

    Jaccard Similarity J (A,B) = | Intersection (A,B) | /
                                    | Union (A,B) |
    and the value returned here is the distance 1 - J (A,B).

    Both arguments are converted with ``set()``, so duplicates and element
    order are ignored.

    Args:
        x: iterable of hashable items.
        y: iterable of hashable items.

    Returns:
        A float in [0, 1]: 0.0 when both sets are equal, 1.0 when they share
        no element. Two empty inputs are treated as identical (0.0) instead
        of dividing by zero.
    """
    set_x, set_y = set(x), set(y)
    union_cardinality = len(set_x | set_y)

    # Both collections empty: the union is empty and the ratio is undefined.
    # They are indistinguishable, so report zero distance.
    if union_cardinality == 0:
        return 0.0

    intersection_cardinality = len(set_x & set_y)

    # 1 - result since we want the distance
    return 1 - (intersection_cardinality / float(union_cardinality))

def levenshtein_ratio_and_distance(s, t):
    """ levenshtein_ratio_and_distance:
        Calculates a normalised Levenshtein distance between two sequences.

        Insertions and deletions cost 1 and a substitution costs 2 (the
        convention used for the Levenshtein *ratio*). For all i and j,
        distance[i, j] contains the edit distance between the first i
        items of s and the first j items of t.

        Args:
            s: sequence supporting len() and integer indexing (string, list, ...).
            t: sequence supporting len() and integer indexing.

        Returns:
            1 - ratio, where
            ratio = (len(s) + len(t) - distance) / (len(s) + len(t)).
            The result is 0.0 for identical sequences and 1.0 when nothing
            can be kept. Two empty sequences give 0.0.
    """
    rows = len(s) + 1
    cols = len(t) + 1
    total_len = len(s) + len(t)

    # Two empty sequences are identical; this also avoids dividing by zero.
    if total_len == 0:
        return 0.0

    # Initialize matrix of zeros
    distance = np.zeros((rows, cols), dtype=int)

    # First column / first row: cost of deleting i items / inserting k items.
    # Each border is filled independently so it is still set when the other
    # sequence is empty.
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    # Iterate over the matrix to compute the cost of deletions, insertions and/or substitutions
    for col in range(1, cols):
        for row in range(1, rows):
            # Equal items cost nothing; a substitution costs 2 for the ratio.
            cost = 0 if s[row - 1] == t[col - 1] else 2

            distance[row, col] = min(distance[row - 1, col] + 1,         # Cost of deletions
                                     distance[row, col - 1] + 1,         # Cost of insertions
                                     distance[row - 1, col - 1] + cost)  # Cost of substitutions

    # Read the final cell by explicit index rather than through the loop
    # variables, which are unbound when either sequence is empty.
    ratio = (total_len - distance[rows - 1, cols - 1]) / total_len
    return 1 - ratio

#  Compositional language

In [177]:
shapes, colors = 13, 13

# create vocabulary: consecutive lowercase letters starting at 'a',
# one per shape value followed by one per color value
vocab = [chr(ord('a') + offset) for offset in range(shapes + colors)]

# create example dataset
data = generate_dataset([shapes, colors])

# create compositional messages: the first symbol comes from the first
# `shapes` letters, the second from the letters that follow them
messages = [
    [vocab[shape_idx], vocab[shapes + color_idx]]
    for shape_idx in range(shapes)
    for color_idx in range(colors)
]

print(data)
print(messages)

[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
[['a', 'n'], ['a', 'o'], ['a', 'p'], ['a', 'q'], ['a', 'r'], ['a', 's'], ['a', 't'], ['a', 'u'], ['a', 'v'], ['a', 'w'], ['a', 'x'], ['a', 'y'], ['a', 'z'], ['b', 'n'], ['b', 'o'], ['b', 'p'], ['b', 'q'], ['b', 'r'], ['b', 's'], ['b', 't'], ['b', 'u'], ['b', 'v'], ['b', 'w'], ['b', 'x'], ['b', 'y'], ['b', 'z'], ['c', 'n'], ['c', 'o'], ['c', 'p'], ['c', 'q'], ['c', 'r'], ['c', 's'], ['c', 't'], ['c', 'u'], ['c', 'v'], ['c', 'w'], ['c', 'x'], ['c', 'y'], ['c', 'z'], ['d', 'n'], ['d', 'o'], ['d', 'p'], ['d', 'q'], ['d', 'r'], ['d', 's'], ['d', 't'], ['d', 'u'], ['d', 'v'], ['d', 'w'], ['d', 'x'], ['d', 'y'], ['d', 'z'], ['e', 'n'], ['e', 'o'], ['e', 'p'], ['e', 'q'], ['e', 'r'], ['e', 's'], ['e', 't'], ['e', 'u'], ['e', 'v'], ['e', 'w'], ['e', 'x'], ['e', 'y'], ['e', 'z'], ['f', 'n'], ['f', 'o'], ['f', 'p'], ['f', 'q'], ['f', 'r'], ['f', 's'], ['f', 't'], ['f', 'u'], ['f

In [178]:
# Score the compositional language once per message-distance function;
# scipy's Hamming distance is passed as the third argument both times.
for message_distance in (compute_jaccard_distance_score,
                         levenshtein_ratio_and_distance):
    r = rsa(data, messages, spatial.distance.hamming, message_distance)
    print(r)


1.0
1.0


# Example language #1

In [183]:
shapes, colors = 10, 10

# create example dataset
data = generate_dataset([shapes, colors])

# create example language: for every (i, j) pair the message is
# (i + 1) 'x' tokens followed by (j + 1) 'y' tokens
messages = [
    ['x'] * (n_x + 1) + ['y'] * (n_y + 1)
    for n_x in range(shapes)
    for n_y in range(colors)
]
    
# print(data)
# print(messages)

In [184]:
# Score example language #1 once per message-distance function;
# scipy's Hamming distance is passed as the third argument both times.
for message_distance in (compute_jaccard_distance_score,
                         levenshtein_ratio_and_distance):
    r = rsa(data, messages, spatial.distance.hamming, message_distance)
    print(r)

  rho = scipy.stats.pearsonr(sim_x, sim_y)[0]


0.0
0.37391293238082607


# Example language #2

In [172]:
# need language that combines the shape vectors:
# every symbol stands for a *pair* of active attribute indices, and each
# message keeps all but one (randomly chosen) of an object's pairs
shapes = 3
colors = 3
size = 2

attributes = [shapes, colors, size]

# Dedicated, seeded generator so Restart & Run All reproduces the same
# messages without touching the global `random` state.
MESSAGE_SEED = 0
rng = random.Random(MESSAGE_SEED)

# create example dataset
data = generate_dataset(attributes)
print('Number of objects in the dataset: ', len(data))

# create vocabulary: the 26 lowercase letters
vocab = [chr(ord('a') + k) for k in range(26)]


def attribute_pairs(obj):
    """Return every 2-combination of the indices at which `obj` equals 1."""
    active = np.where(obj == 1)[0]
    return list(combinations(active, 2))


# collect every attribute pair that occurs in the dataset; sorting fixes
# which letter each pair receives
poss_attr = sorted({pair for obj in data for pair in attribute_pairs(obj)})

if len(poss_attr) > len(vocab):
    raise ValueError(
        'vocabulary has %d symbols but %d attribute pairs need one'
        % (len(vocab), len(poss_attr))
    )

# pair -> symbol lookup (replaces a linear list search per pair)
symbol_of = {pair: vocab[k] for k, pair in enumerate(poss_attr)}

# create messages
messages = []

# loop through the dataset and translate each object's pairs into symbols
for obj in data:

    # create message
    message = [symbol_of[pair] for pair in attribute_pairs(obj)]

    # randomly remove an element; the index range follows the actual message
    # length instead of assuming exactly three pairs
    if message:
        del message[rng.randrange(len(message))]

    messages.append(message)

Number of objects in the dataset:  18


In [173]:
# Score example language #2 once per message-distance function;
# scipy's Hamming distance is passed as the third argument both times.
for message_distance in (compute_jaccard_distance_score,
                         levenshtein_ratio_and_distance):
    r = rsa(data, messages, spatial.distance.hamming, message_distance)
    print(r)


0.4588314677411236
0.4588314677411236
