In [1]:
from os import path
import pandas as pd
from gensim.models import KeyedVectors
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("http://data.odeuropa.eu/repositories/odeuropa")
sparql.setReturnFormat(JSON)

In [2]:
def label(uri):
    if uri is None or type(uri) != str or len(uri) == 0:
        return None
    if 'flavornet' in uri and len(uri.split('/')) == 6:
        uri = uri.replace('flavornet','flavornet/odors')
        
    q = '''
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX onto: <http://www.ontotext.com/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?label
        FROM onto:explicit
        WHERE {
            <%s> skos:prefLabel|rdfs:label ?label
        }
    ''' % uri
    sparql.setQuery(q)
    ret = sparql.queryAndConvert()
    data = [l['label'] for l in ret['results']['bindings']]
    if len(data) < 1:
        return "smell"
    data.sort(key=lambda l: ('aaa' if l['xml:lang']== 'en' else l['xml:lang']) if 'xml:lang' in l else 0)
    return data[0]['value']

def clean(array):
    return [r for r in array if not pd.isna(r)]
    
def do_nothing(inp):
    return inp

def to_labels(array):
    return [label(r) if str(r).startswith('http') else r for r in array if not pd.isna(r)]

Load all resources

In [3]:
root = './embeddings/new'
all_data = pd.read_csv('all_data_sorted.csv')

print('Loading voc.kv')
voc_emb = KeyedVectors.load(path.join(root, 'voc.kv'))

Loading voc.kv


  all_data = pd.read_csv('all_data_sorted.csv')


In [4]:
print('Loading smells.kv')
smell_emb_path = path.join(root, 'smells.kv')
print(smell_emb_path)
smell_emb = KeyedVectors.load(smell_emb_path)

def get(smell, with_labels=False): 
    res = all_data[all_data['smell'] == smell]
    proc = to_labels if with_labels else do_nothing
    return {
        'source': proc(clean(res['source'].unique().tolist())),
        'carrier': proc(clean(res['carrier'].unique().tolist())),
        'quality': proc(clean(res['quality'].unique().tolist())),
        'quality_type': proc(clean(res['quality_type'].unique().tolist())),
        'place': proc(clean(res['place'].unique().tolist())),
        'place_type': proc(clean(res['place_type'].unique().tolist())),
        'gesture': proc(clean(res['gesture'].unique().tolist())),
        'emotion': proc(clean(res['emotion'].unique().tolist())),
        'time': proc(clean(res['time'].unique().tolist()))
}

Loading smells.kv
./embeddings/new/smells.kv


In the following, we have a pandas dataframe containing all values for the most important properties of the graph, smell by smell

In [5]:
all_data

Unnamed: 0,smell,source,carrier,quality,quality_type,place,place_type,gesture,emotion,time
0,http://data.odeuropa.eu/smell/c52b0781-e906-53...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,http://data.odeuropa.eu/vocabulary/drom/spicy,http://data.odeuropa.eu/attribute-type/character,,,,,1903
1,http://data.odeuropa.eu/smell/c52b0781-e906-53...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,http://data.odeuropa.eu/vocabulary/historic-bo...,http://data.odeuropa.eu/attribute-type/character,,,,,1903
2,http://data.odeuropa.eu/smell/ae8a7f2c-eacb-58...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,,,,1916
3,http://data.odeuropa.eu/smell/54d263a5-1015-5c...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,,,,1916
4,http://data.odeuropa.eu/smell/f61895e3-110e-51...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,,,,1867
...,...,...,...,...,...,...,...,...,...,...
241505,http://data.odeuropa.eu/smell/ba377729-4697-5e...,,,,,,,,,1862.0
241506,http://data.odeuropa.eu/smell/cebb7b35-b8a4-5c...,,,,,,,,,1862.0
241507,http://data.odeuropa.eu/smell/1c72f57e-1170-50...,,,,,,,,,1862.0
241508,http://data.odeuropa.eu/smell/f6463053-02ec-59...,,,,,,,,,1862.0


In [6]:
only_obj= [l for l in voc_emb.key_to_index.keys() if l.startswith('http://data.odeuropa.eu/vocabulary/olfactory-objects')]
X = voc_emb[only_obj]

In [7]:
n_clusters = 20

In [8]:
from sklearn import cluster
from sklearn import metrics
kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

Cluster id labels for inputted data
[ 0  0 12  8 18 14 14 14 14 14 14 14 14 14  6  1 14 14 14 14 14 14  5 14
 14 14 14 14 14 14 14 14 14 14 14 14 14  7 14 14  7 14 14 14 14 14 14 14
 14  6 14 14  6 14 14 14 14 14 14 14 14 14 14 14 14  6 14 14 14 14 14 14
 18 14 12 14  2  2 14 14  5 14 14  2 17  2 14 14 14 14  5 10 10 12  8 15
  3 14 12  8 14 17 10  3  5 10 12  7  4  5 17  7  8  7  2 17 12  7  8  8
  3  8 14  7  7  7  7  7  7  7  7  7  9  9  9  9  9  9 13 10 14 12  2  2
  8  3  3 12 12 12  4 17  2 15  3 12  4  4  4  4 14 12  3  8 12  8 12  8
 18  3  8  5  2  3  3  3 15  8  8  2 14 17  2  2  2  2  2  2  2  2  8 12
  2  8 16  8  8 12  2  2  2  2 12 12 14 14 14 14 14 14 16 16 16 16 11 16
 16 16 16 16 16  7 16 16 16 16  7  7  7  7  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  7  0  7  0  0  7  7  7  4  4  7  4  4  4  4  7  7  8  4
  4  7  7 12 16  7  7  7  7  7  7  0 13  7  7  7  7  7  8 15 15 18 15  3
 15 15  3 15 15  8 15  7  7  7 17 17 17 17 17 15 11 17 17  2  2  2  2  2
  2  2  8 16  3

  super()._check_params_vs_input(X, default_n_init=10)


In [9]:
new_centroids = []
for i, x in enumerate(centroids):
    top = voc_emb.most_similar_cosmul(x, topn=5)
    for y in top:
        print(y, label(y[0]))

    new_centroids.append(top[0][0])
    print('%%%%%%%%%%%%%%%')

('http://data.odeuropa.eu/vocabulary/olfactory-objects/306', 3.910881280899048) Horse radish
('http://data.odeuropa.eu/vocabulary/olfactory-objects/308', 3.902214527130127) Celery
('http://data.odeuropa.eu/vocabulary/olfactory-objects/311', 3.8711423873901367) Cabbage
('http://data.odeuropa.eu/vocabulary/olfactory-objects/304', 3.8469653129577637) Bellpepper
('http://data.odeuropa.eu/vocabulary/olfactory-objects/309', 3.8126728534698486) Parsnip
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/43', 4.02849006652832) Cigarette
('http://data.odeuropa.eu/vocabulary/olfactory-objects/42', 4.005650043487549) Cigar-holder
('http://data.odeuropa.eu/vocabulary/olfactory-objects/40', 4.001083850860596) Cigar-box
('http://data.odeuropa.eu/vocabulary/olfactory-objects/41', 3.9922971725463867) Cigar-case
('http://data.odeuropa.eu/vocabulary/olfactory-objects/122', 3.987865447998047) Pipe
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/262', 1.9273463487

In [10]:
# len(only_obj)

import pandas as pd
top_concepts = pd.read_csv('embeddings//top-concepts.csv')
concepts = list(set(top_concepts['sub'].to_list()))

In [11]:
from sklearn.metrics import homogeneity_completeness_v_measure

import numpy as np
from collections import Counter

def flatten_concatenation(matrix):
     flat_list = []
     for row in matrix:
         flat_list += row
     return flat_list

def get_top_concept(t):
    return top_concepts[top_concepts['sub'] == t]['top'].to_list()

classes = [get_top_concept(t) for t in only_obj]

priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.8971609626051533, 0.6120032611941473, 0.7276417321213319)

In [12]:
%%time

from sklearn.manifold import TSNE

tsne = TSNE(random_state=1, n_iter=15000, metric="cosine")

embs = tsne.fit_transform(X)

CPU times: user 16.7 s, sys: 8.31 s, total: 25 s
Wall time: 3.67 s


In [13]:
df = pd.DataFrame()
df['id'] = only_obj
# df['label'] = df['id'].apply(label)
df['x'] = embs[:, 0]
df['y'] = embs[:, 1]

In [14]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline


FS = (10, 8)
fig, ax = plt.subplots(figsize=FS)
# Make points translucent so we can visually identify regions with a high density of overlapping points
ax.scatter(embs[:,0], embs[:,1], c=labels, alpha=.1)

for x in new_centroids:
    idx = voc_emb.key_to_index[x]
    if idx not in embs:
        continue
    coords = embs[idx]
    ax.text(coords[0], coords[1], label(x))

In [15]:
def get_uri_by_label():
    q = '''
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
        PREFIX onto: <http://www.ontotext.com/>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

        SELECT ?uri
        FROM onto:explicit
        WHERE {
            ?uri skos:prefLabel|rdfs:label "%s"@en
        }
    ''' % uri
    sparql.setQuery(q)
    ret = sparql.queryAndConvert()
    data = [l['uri'] for l in ret['results']['bindings']]
    if len(data) < 1:
        return None
    return data[0]['value']

In [16]:
# Some helper functions for plotting annotated t-SNE visualizations

# TODO: adjust_text not available in kernels
try:
    from adjustText import adjust_text
except ImportError:
    def adjust_text(*args, **kwargs):
        pass

def adjust_text(*args, **kwargs):
    pass

def plot_bg(bg_alpha=.01, figsize=(13, 9), emb_2d=None):
    """Create and return a plot of all our movie embeddings with very low opacity.
    (Intended to be used as a basis for further - more prominent - plotting of a 
    subset of movies. Having the overall shape of the map space in the background is
    useful for context.)
    """
    if emb_2d is None:
        emb_2d = embs
    fig, ax = plt.subplots(figsize=figsize)
    X = emb_2d[:, 0]
    Y = emb_2d[:, 1]
    ax.scatter(X, Y, alpha=bg_alpha)
    return ax

def annotate_sample(n, n_ratings_thresh=0):
    """Plot our embeddings with a random sample of n movies annotated.
    Only selects movies where the number of ratings is at least n_ratings_thresh.
    """
    sample = mainstream_movies[mainstream_movies.n_ratings >= n_ratings_thresh].sample(
        n, random_state=1)
    plot_with_annotations(sample.index)

def plot_by_label_pattern(pattern, **kwargs):
    """Plot all movies whose labels match the given regex pattern.
    """
    match = df[df.label.str.contains(pattern)]
    return plot_with_annotations(match.index, **kwargs)

def add_annotations(ax, label_indices, emb_2d=None, **kwargs):
    if emb_2d is None:
        emb_2d = embs
    X = emb_2d[label_indices, 0]
    Y = emb_2d[label_indices, 1]
    ax.scatter(X, Y, **kwargs)

def plot_with_annotations(label_indices, text=True, labels=None, alpha=1, **kwargs):
    ax = plot_bg(**kwargs)
    Xlabeled = embs[label_indices, 0]
    Ylabeled = embs[label_indices, 1]
    if labels is not None:
        for x, y, label in zip(Xlabeled, Ylabeled, labels):
            ax.scatter(x, y, alpha=alpha, label=label, marker='1',
                       s=90,
                      )
        fig.legend()
    else:
        ax.scatter(Xlabeled, Ylabeled, alpha=alpha, color='green')
    
    if text:
        # TODO: Add abbreviated label column
        labels = mainstream_movies.loc[label_indices, 'label'].values
        texts = []
        for label, x, y in zip(labels, Xlabeled, Ylabeled):
            t = ax.annotate(label, xy=(x, y))
            texts.append(t)
        adjust_text(texts, 
                    #expand_text=(1.01, 1.05),
                    arrowprops=dict(arrowstyle='->', color='red'),
                   )
    return ax

FS = (13, 9)
def plot_region(x0, x1, y0, y1, text=True):
    """Plot the region of the mapping space bounded by the given x and y limits.
    """
    fig, ax = plt.subplots(figsize=FS)
    pts = df[
        (df.x >= x0) & (df.x <= x1)
        & (df.y >= y0) & (df.y <= y1)
    ]
    ax.scatter(pts.x, pts.y, alpha=.6)
    ax.set_xlim(x0, x1)
    ax.set_ylim(y0, y1)
    if text:
        texts = []
        for id, x, y in zip(pts.id.values, pts.x.values, pts.y.values):
            t = ax.annotate(label(id), xy=(x, y))
            texts.append(t)
        adjust_text(texts, expand_text=(1.01, 1.05))
    return ax

def plot_region_around(label, margin=5, **kwargs):
    """Plot the region of the mapping space in the neighbourhood of the the movie with
    the given label. The margin parameter controls the size of the neighbourhood around
    the movie.
    """
    xmargin = ymargin = margin
    match = df[df['id'] == label]
    #assert len(match) == 1
    row = match.iloc[0]
    return plot_region(row.x-xmargin, row.x+xmargin, row.y-ymargin, row.y+ymargin, **kwargs)

In [17]:
plot_region_around('http://data.odeuropa.eu/vocabulary/olfactory-objects/267')

<Axes: >

In [18]:
# Same with Dist Mult
kv = KeyedVectors.load_word2vec_format(path.join(root,'distmult_entity.bin'), binary=True)
only_obj= [l for l in kv.key_to_index.keys() if l.startswith('http://data.odeuropa.eu/vocabulary/olfactory-objects')]
CATS = ['http://data.odeuropa.eu/vocabulary/olfactory-objects/artifact','http://data.odeuropa.eu/vocabulary/olfactory-objects/carrier','http://data.odeuropa.eu/vocabulary/olfactory-objects/smell-source','http://data.odeuropa.eu/vocabulary/olfactory-objects/smell_carrier']
only_obj = [l for l in only_obj if l not in CATS]
        
X = kv[only_obj]

In [19]:
kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

  super()._check_params_vs_input(X, default_n_init=10)


Cluster id labels for inputted data
[ 0 12  0  4  4  5 18 18 12 11 19  5 12  2 12  7 16 17 14 17 14 17 12  6
 17 12  0 12 12 17  3 12 14 14  8 12 12 12 12 12  3 14 14 18  1  4 14 12
 17 14 14 11  4  3 14 12  7  1 14  0  4 16  7 10 12 14 12  7 11  7  1 14
  7  7 11  5  0  7 11 18 13 13 19 14 18  0 10  7 17 18 10  7  2  7  7  7
  7  7  4  4 12  4  4  4  4  4  4 14 16  3  3  7 17 15 15 15 16 16 16  7
 11  9 15 10 15  5  7  7  7  7  0 16 15  1  7 16  1 16  1 15 15  1  0 16
 11  8  9 13 13  0 13 13  1 18 19 15 13 12 11  9  9  9  3  3  9  9  3  0
  1 17 16 16 19  1 10 18 18  5  9 12  3  3  3 14  9  9 16  1  5 18 11  0
 14  0 12  0  5  7 10 10 12 10 10 10  7  7 10 10 10  7 10 12 10 10  2 10
  7  7  7 10  2  7 12 14  2  2  2  2  2  2  7  2  2  2 14  2  2  7  2 10
  7  7  7  7  7  7  7  7  7  7  7  7  7  7  2  1  9  7 10 13  7 16 10  7
  7  7  7 12  7  7  8  8  7 16  7  2  2 10  7  1 15 15 13 15 15 15 15 15
 15  7 15 15  1  2  7  2  0 11 11 11 16 15 11 15 12 17 11 11 19 19  3 12
  3  3  3  3  1

In [20]:
new_centroids = []
for i, x in enumerate(centroids):
    top = kv.most_similar_cosmul(x, topn=5)
    for y in top:
        print(y, label(y[0]))

    new_centroids.append(top[0][0])
    print('%%%%%%%%%%%%%%%')

('http://data.odeuropa.eu/vocabulary/olfactory-objects/67', 0.8280248045921326) Fireplace
('http://data.odeuropa.eu/vocabulary/olfactory-objects/1', 0.827919602394104) Amulet
('http://data.odeuropa.eu/vocabulary/olfactory-objects/271', 0.8255181908607483) Ink
('http://data.odeuropa.eu/vocabulary/olfactory-objects/69', 0.8246099948883057) Flacon
('http://data.odeuropa.eu/vocabulary/olfactory-objects/39', 0.8172957301139832) Cigar
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/482', 0.8600473999977112) Dahlia
('http://data.odeuropa.eu/vocabulary/olfactory-objects/329', 0.8575294017791748) Woodruff
('http://data.odeuropa.eu/vocabulary/olfactory-objects/569', 0.8515503406524658) Dandelion
('http://data.odeuropa.eu/vocabulary/olfactory-objects/82', 0.850074052810669) Garland
('http://data.odeuropa.eu/vocabulary/olfactory-objects/545', 0.8500646352767944) Mimosa
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/495', 0.8771436810493469) Avocado
('

In [21]:
classes = [get_top_concept(t) for t in only_obj]

priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')
            print(only_obj[i])

homogeneity_completeness_v_measure(classes_flat, labels)

(0.9627349826289522, 0.6461523604308032, 0.7732965072768323)

In [22]:
# Same with TransE
kv = KeyedVectors.load_word2vec_format(path.join(root, 'transe_entity.bin'), binary=True)
only_obj= [l for l in kv.key_to_index.keys() if l.startswith('http://data.odeuropa.eu/vocabulary/olfactory-objects')]
only_obj = [l for l in only_obj if l not in CATS]
X = kv[only_obj]

In [23]:
kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)
 
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
print ("Silhouette_score: ")
print (silhouette_score)

  super()._check_params_vs_input(X, default_n_init=10)


Cluster id labels for inputted data
[ 1 18 13 10 10  5 14  7  4 12  3  2  1 11  4 11 13 18  1 18  4  1  4 10
  1 18 15  1  1  1 12  4  1 18 10 13 18  1 18 18 16  1  1 14  8 10  4 15
  4  4  4 12 10 16 18  1 17 14 15  2 10 13  9 17 15  4 13  9 12 17 14 18
 17  2 12  2 12  2  2 14  8 14  3 18  8  2  2 17  4  7  2 17  9  9  9 17
 11  9 10 10  1 10 10 10 10 10 10  1 13 16 16 17  4  2  8  8 13 13 13  2
 13 19 14  9 14  5  9 11 11  9  2 13  8  7  9 13 14 19 14  8  8 14  7 13
 16  2 19  5  2 12  8  8 14 14  3  8  8  1 12  5 19 19 16 16 19 19 16  4
 14  1 13 13  0 14 17 14 14 13 19  1 16 16 16  4 19 19 13  7 13 14 12  4
  4  4  4  5  4  9 17 17 18 17  2  2  2  2 17 17 17  9  0  4 17  0 17 17
 17  9 17  9 11 11 18  4  9 17 11 11 17 11 11 17 11 11  1 11 11 17 11 11
 11 11 11 11 17  2  9  9  9  9 11  2  9  9  9 14 19  9  9 11 11 13 17 11
  2  2 17 15 11 11  9 10 11 13  9 11  9 11 17 14  8  5  8  8 14 14  8  8
  6 11  6  8  8  9 17 11  2 12 12 12 13  2 12 14 15 15 12 12  3  8 16  4
 16 16 16 16  7

In [24]:
new_centroids = []
for i, x in enumerate(centroids):
    top = kv.most_similar_cosmul(x, topn=5)
    for y in top:
        print(y, label(y[0]))

    new_centroids.append(top[0][0])
    print('%%%%%%%%%%%%%%%')

('http://data.odeuropa.eu/vocabulary/olfactory-objects/289', 0.7604581117630005) Pineapple
('http://data.odeuropa.eu/vocabulary/olfactory-objects/291', 0.7325263023376465) Watermelon
('http://data.odeuropa.eu/vocabulary/olfactory-objects/254', 0.7013605237007141) Halitosis
('http://data.odeuropa.eu/vocabulary/olfactory-objects/295', 0.6535303592681885) Peanut
('http://data.odeuropa.eu/vocabulary/olfactory-objects/511', 0.6521593332290649) Apricot
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/51', 0.737899661064148) Head cone
('http://data.odeuropa.eu/vocabulary/olfactory-objects/125', 0.7374388575553894) Pomander
('http://data.odeuropa.eu/vocabulary/olfactory-objects/240', 0.7329413294792175) Chamber pot
('http://data.odeuropa.eu/vocabulary/olfactory-objects/120', 0.7304530739784241) Perfume flacon
('http://data.odeuropa.eu/vocabulary/olfactory-objects/97', 0.725816011428833) Lodereindoos
%%%%%%%%%%%%%%%
('http://data.odeuropa.eu/vocabulary/olfactory-objects/10

In [25]:
classes = [get_top_concept(t) for t in only_obj]

priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.861964398359632, 0.5880959092467961, 0.69916779865305)

In [26]:
all_data = pd.read_csv('all_props.csv', sep=',', index_col=0)
SOURCE = 'od_F3_had_source%20%2F%20ecrm_P137_exemplifies'
all_data

  all_data = pd.read_csv('all_props.csv', sep=',', index_col=0)


Unnamed: 0,smell,od_F6_evoked%20%2F%20ecrm_P137_exemplifies,od_F5_involved_gesture%20%2F%20ecrm_P137_exemplifies,od_F4_had_carrier%20%2F%20ecrm_P137_exemplifies,%5Eecrm_P140_assigned_attribute_to%20%2F%20ecrm_P141_assigned,od_F3_had_source%20%2F%20ecrm_P137_exemplifies,ecrm_P7_took_place_at%20%2F%20ecrm_P137_exemplifies
0,http://data.odeuropa.eu/experience/5a4850db-7f...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,
1,http://data.odeuropa.eu/experience/ad644f44-d1...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,
2,http://data.odeuropa.eu/experience/21b16c3c-d3...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,
3,http://data.odeuropa.eu/experience/28ab7efb-2d...,http://data.odeuropa.eu/vocabulary/olfactory-o...,,,,,
4,http://data.odeuropa.eu/experience/3ab179a4-f0...,http://data.odeuropa.eu/vocabulary/fragrant-sp...,,,,,
...,...,...,...,...,...,...,...
1173906,http://data.odeuropa.eu/smell/13df9c06-507c-55...,,,,,,http://data.odeuropa.eu/vocabulary/fragrant-sp...
1173907,http://data.odeuropa.eu/experience/a2102894-d6...,,,,,,http://data.odeuropa.eu/vocabulary/fragrant-sp...
1173908,http://data.odeuropa.eu/experience/d406b1d8-eb...,,,,,,http://data.odeuropa.eu/vocabulary/fragrant-sp...
1173909,http://data.odeuropa.eu/smell/e4f25cb1-1f09-58...,,,,,,http://data.odeuropa.eu/vocabulary/fragrant-sp...


In [27]:
filtered_all_data = all_data[all_data[SOURCE].notna()]
only_smells= [l for l in smell_emb.key_to_index.keys()] # if l.startswith('http://data.odeuropa.eu/vocabulary/olfactory-objects')]
onl_obj = set(only_smells).intersection(set(filtered_all_data['smell']))

X = smell_emb[onl_obj]

len(onl_obj)

116719

In [66]:
from tqdm.notebook import tqdm

all_classes = {}
for i, x in tqdm(filtered_all_data.iterrows(), total=len(filtered_all_data)):
    sm = x['smell']
    source = x[SOURCE]
    if sm not in all_classes:
        all_classes[sm] = []
    all_classes[sm].append(source)

  0%|          | 0/663054 [00:00<?, ?it/s]

In [67]:
classes = [all_classes[t] for t in tqdm(onl_obj)]
top_classes = []
for c in tqdm(classes):
    top_classes.append(flatten_concatenation([get_top_concept(s) for s in c]))

  0%|          | 0/1926 [00:00<?, ?it/s]

  0%|          | 0/1926 [00:00<?, ?it/s]

In [73]:
n_clusters=20

In [74]:
kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

  super()._check_params_vs_input(X, default_n_init=10)


Cluster id labels for inputted data
[ 6  8  1 ... 11  9 13]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-1549.106201171875


In [75]:
def get_source(k, single=True):
    sources =  all_data[all_data['smell']==k][SOURCE]
    if not single:
        return sources.tolist()
    if len(sources) > 0:
        return sources.tolist()[0]
    else:
        return ''

def get_prop(k, p):
    sources =  all_data[all_data['smell']==k][p]
    if len(sources) > 0:
        return sources.tolist()[0]
    else:
        return ''

In [76]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.32569487299859645, 0.4962259930118029, 0.3932696405605955)

In [34]:
classes[126]

['http://data.odeuropa.eu/vocabulary/olfactory-objects/227',
 'http://data.odeuropa.eu/vocabulary/olfactory-objects/416']

In [35]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([top_classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(top_classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            classes_flat.append('other')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.7305063921064572, 0.4568834047392312, 0.5621679561269542)

In [36]:
# new_centroids = []
# for i, x in enumerate(centroids):
    
#     print(f'Cluster {i} - {labels.tolist().count(i)} elements ')
#     top = smell_emb.most_similar(x, topn=5)
#     for y in top:
#         print(y, label(y[0]), '|' ,  label(get_source(y[0])))
#         print(label(get_prop(y[0], '%5Eecrm_P140_assigned_attribute_to%20%2F%20ecrm_P141_assigned')),
#              '|' , label(get_prop(y[0], 'od_F4_had_carrier%20%2F%20ecrm_P137_exemplifies')),
#              '|' , label(get_prop(y[0], 'od_F5_involved_gesture%20%2F%20ecrm_P137_exemplifies')))

#     new_centroids.append(top[0][0])
#     print('%%%%%%%%%%%%%%%')

In [37]:
# new_centroids = []
# for i, x in enumerate(centroids):
    
#     print(f'Cluster {i} - {labels.tolist().count(i)} elements ')
#     top = smell_emb.most_similar(x, topn=5)
#     for y in top:
#         print(y, label(y[0]), '|' ,  label(get_source(y[0])))
#         print(label(get_prop(y[0], '%5Eecrm_P140_assigned_attribute_to%20%2F%20ecrm_P141_assigned')),
#              '|' , label(get_prop(y[0], 'od_F4_had_carrier%20%2F%20ecrm_P137_exemplifies')),
#              '|' , label(get_prop(y[0], 'od_F5_involved_gesture%20%2F%20ecrm_P137_exemplifies')))

#     new_centroids.append(top[0][0])
#     print('%%%%%%%%%%%%%%%')

In [38]:
# %%time

# from sklearn.manifold import TSNE

# tsne = TSNE(random_state=1, n_iter=1500, metric="cosine")

# embs = tsne.fit_transform(X)

In [39]:
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# %matplotlib inline


# FS = (10, 8)
# fig, ax = plt.subplots(figsize=FS)
# # Make points translucent so we can visually identify regions with a high density of overlapping points
# ax.scatter(embs[:,0], embs[:,1], c=labels, alpha=.1)

# for x in new_centroids:
#     coords = embs[smell_emb.key_to_index[x]]
#     ax.text(coords[0], coords[1], label(get_source(x)))

In [40]:
# Same with DistMult 
kv = KeyedVectors.load_word2vec_format(path.join(root,'distmult_entity.bin'), binary=True)

In [41]:
only_smells = [l for l in kv.key_to_index.keys() if l.startswith('http://data.odeuropa.eu/smell')]
len(only_smells)

10000

In [42]:
onl_obj = set(only_smells).intersection(set(filtered_all_data['smell']))

X = kv[onl_obj]

kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

  super()._check_params_vs_input(X, default_n_init=10)


Cluster id labels for inputted data
[15  0 14 ... 14  7  9]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-1270.963623046875


In [43]:
classes = [all_classes[t] for t in tqdm(onl_obj)]
top_classes = []
for c in tqdm(classes):
    top_classes.append(flatten_concatenation([get_top_concept(s) for s in c]))

  0%|          | 0/1926 [00:00<?, ?it/s]

  0%|          | 0/1926 [00:00<?, ?it/s]

In [44]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.346701913331363, 0.5351520913510358, 0.4207913169516226)

In [45]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([top_classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(top_classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            classes_flat.append('other')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.44252573520294763, 0.2685094257344911, 0.3342235027461087)

In [46]:
# Same with TransE 
kv = KeyedVectors.load_word2vec_format(path.join(root,'transe_entity.bin'), binary=True)

In [47]:
only_smells = [l for l in kv.key_to_index.keys() if l.startswith('http://data.odeuropa.eu/smell')]

In [48]:
onl_obj = set(only_smells).intersection(set(filtered_all_data['smell']))

X = kv[onl_obj]

kmeans = cluster.KMeans(n_clusters=n_clusters)
kmeans.fit(X)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_
 
print ("Cluster id labels for inputted data")
print (labels)
# print ("Centroids data")
# print (centroids)
 
print ("Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):")
print (kmeans.score(X))
 
# silhouette_score = metrics.silhouette_score(X, labels, metric='euclidean')
 
# print ("Silhouette_score: ")
# print (silhouette_score)

  super()._check_params_vs_input(X, default_n_init=10)


Cluster id labels for inputted data
[ 7 19 14 ... 18  0 10]
Score (Opposite of the value of X on the K-means objective which is Sum of distances of samples to their closest cluster center):
-1548.9083251953125


In [49]:
classes = [all_classes[t] for t in tqdm(onl_obj)]
top_classes = []
for c in tqdm(classes):
    top_classes.append(flatten_concatenation([get_top_concept(s) for s in c]))

  0%|          | 0/1926 [00:00<?, ?it/s]

  0%|          | 0/1926 [00:00<?, ?it/s]

In [50]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            print('error')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.33716285662560824, 0.5155211579474414, 0.4076881548003032)

In [51]:
priorities = []
for clust in set(labels):
    ps = np.where(labels==clust)[0]
    members = flatten_concatenation([top_classes[x] for x in ps])
    counting = dict(Counter(members))
    priority = sorted([id for id in counting.items()],key=lambda a: a[1], reverse= True)
    priority = [a[0] for a in priority]
    priorities.append(priority)
    
classes_flat = []
for i, x in enumerate(top_classes):
    if len(x) == 1:
        classes_flat.append(x[0])
    else:
        done = False
        
        for p in priorities[labels[i]]:
            if p in x:
                classes_flat.append(p)
                done = True
                break
        if not done:
            classes_flat.append('other')

homogeneity_completeness_v_measure(classes_flat, labels)

(0.2710873167996885, 0.15912656201156286, 0.20053835941540832)