In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import numpy.matlib
import pickle
import random
import seaborn as sns
import nbimporter

from matplotlib import pyplot as plt
from scipy import linalg, sparse, stats
from tqdm.notebook import tqdm, trange
from sklearn.preprocessing import normalize, scale
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from sklearn import linear_model
from KEMA import gen_eig, kernel_manifold_alignment

Importing Jupyter notebook from KEMA.ipynb


Initialize an English dictionary to filter words from the GloVe dataset. The GloVe dataset used was built by parsing Wikipedia and therefore contains a lot of noise. This noise is filtered out using a spell checker.

In [2]:
# Build the Hunspell spell checker from the en_US dictionary (.dic) and affix (.aff)
# files; `spellchecker.spell(word)` is used further down to filter the GloVe words.
import hunspell
spellchecker = hunspell.HunSpell('../data/hunspell/en_US.dic',
                                 '../data/hunspell/en_US.aff')

In [3]:
# Input/cache locations used throughout the notebook.
# NOTE(review): GLOVE_PATH and TEXTURE_PATH are absolute, machine-specific paths.
# Raw GloVe text file, parsed only when the cached embeddings are missing
GLOVE_PATH = '/mnt/guanabana/raid/data/datasets/GloVe/pretrained/glove.6B.300d.txt'
# Pickled dictionary of Broden concept activation vectors
CAVS_PATH = '../data/filtered_broden_cavs.pickle'
# Pickle cache of the spell-checked word -> vector dictionary
EMBEDDING_PATH = '../data/word_embedding_6B_300D.pickle'
# CSV whose 'name' column lists the Broden texture concepts
TEXTURE_PATH = "/raid/data/datasets/broden1_384/c_texture.csv"

In [4]:
# Read the texture table (first CSV column becomes the index) and keep the
# texture names as a plain list; it is used below to drop textures from the concepts.
textures = pd.read_csv(TEXTURE_PATH, index_col = 0)
texture_list = list(textures['name'])

Read the concept activation vectors from the Broden dataset as a dictionary

In [5]:
# cavs_broden maps a concept name to a record whose 'cav' entry is a 2-D array
# (it is indexed as cavs_broden[name]['cav'] and its .shape[1] is read below).
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(CAVS_PATH, 'rb') as handle:
        cavs_broden = pickle.load(handle)

### Import GloVe

Extract the word embeddings from the GloVe dataset. The word embeddings are stored both in a dictionary, with the word as key and the vector as value, and in a matrix (n_samples x n_features). The original dataset contains about 400K words. Every word is run through a spell checker; if a word is not present in the English dictionary that is used, it is removed. In total about 280,000 words are removed from the GloVe dataset.

In [6]:
# Load the spell-checked GloVe vocabulary from the on-disk cache when it is complete;
# otherwise parse the raw GloVe text file, filter it and (re)write the cache.
# Results:
#   embedding_dict         : word -> (1, n_features) float64 vector
#   glove_words            : kept words, in the same order as the matrix rows
#   glove_embedding_matrix : (n_kept_words, n_features) array
GLOVE_MATRIX_PATH = '../data/glove_embedding_matrix.npy'

# Both cache files are required; a half-written cache falls through to a re-parse.
if os.path.exists(EMBEDDING_PATH) and os.path.exists(GLOVE_MATRIX_PATH):
    # NOTE: pickle.load can execute arbitrary code - only load caches written by this notebook.
    with open(EMBEDDING_PATH, 'rb') as handle:
        embedding_dict = pickle.load(handle)

    glove_embedding_matrix = np.load(GLOVE_MATRIX_PATH)
    glove_words = list(embedding_dict.keys())

else:
    embedding_dict = {}
    glove_words = []
    kept_vectors = []

    # parse through the GloVe data
    with open(GLOVE_PATH, 'r', encoding="utf-8") as f:
        for line in tqdm(f):

            values = line.split()
            word = values[0]

            # apply a spell check; only the encoding failure is tolerated, any other
            # error is a real problem and must not be silently swallowed
            try:
                is_known_word = spellchecker.spell(word)
            except UnicodeEncodeError:
                # presumably the word cannot be encoded for the dictionary - skip it
                continue
            if not is_known_word:
                continue

            vector = np.asarray(values[1:], 'float64').reshape(1,-1)
            embedding_dict[word] = vector
            glove_words.append(word)
            kept_vectors.append(vector)

    # Stack exactly the kept rows instead of pre-allocating a fixed 400K-row matrix
    if kept_vectors:
        glove_embedding_matrix = np.concatenate(kept_vectors, axis=0)
    else:
        glove_embedding_matrix = np.zeros((0, 300))

    with open(EMBEDDING_PATH, 'wb') as handle:
        pickle.dump(embedding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    np.save(GLOVE_MATRIX_PATH, glove_embedding_matrix)

In [7]:
print('The amount of words left in the GloVe dataset:', glove_embedding_matrix.shape[0])

The amount of words left in the GloVe dataset: 118327


### Preprocess CAVs

Select the concepts from the Broden dataset, which are also available as word embedding. Several concepts in the Broden dataset end with _'-s'_. This is removed to get more concepts linked with an embedding. 

In [8]:
# Names of every Broden concept for which a CAV is stored
all_broden_concepts = list(cavs_broden.keys())

# Keep a concept only when no '<concept>-s' variant is stored next to it (so of such
# a pair only the '-s' entry survives) and when it is not one of the textures.
no_dups_concepts = []
for concept in all_broden_concepts:
    if concept + '-s' in all_broden_concepts:
        continue
    if concept in texture_list:
        continue
    no_dups_concepts.append(concept)

In [9]:
print('Amount of CAVs left:', len(no_dups_concepts))

Amount of CAVs left: 649


In [10]:
# Stack the CAV of every kept concept into one (n_concepts, n_features) matrix;
# the feature count is read from the second dimension of the first CAV.
n_cav_features = cavs_broden[no_dups_concepts[0]]['cav'].shape[1]
cav_matrix = np.zeros((len(no_dups_concepts), n_cav_features))
for row, concept_name in enumerate(no_dups_concepts):
    cav_matrix[row] = cavs_broden[concept_name]['cav']

# Strip a trailing '-s' so the names can be matched against plain words
broden_concepts = [name[:-2] if name.endswith('-s') else name for name in no_dups_concepts]

In [11]:
cav_matrix.shape

(649, 2048)

Extract concepts which have a CAV and a correspondence in the GloVe data. The index is stored to remove the unavailable concepts from the CAV matrix

In [12]:
# Single pass over the Broden concepts: keep (row index, name) for every concept that
# also has a GloVe embedding. The indices address rows of cav_matrix.
matched_pairs = [(ix, c) for ix, c in enumerate(broden_concepts) if c in embedding_dict]
embedding_idxs = [ix for ix, _ in matched_pairs]
embedding_concepts = [c for _, c in matched_pairs]

In [13]:
print("The amount of CAVs with a corresponding word embedding: ", len(embedding_idxs))

The amount of CAVs with a corresponding word embedding:  363


In [14]:
# Extract the selected CAVs: pick the rows of cav_matrix listed in embedding_idxs,
# so row k of the result belongs to embedding_concepts[k].
cavs_with_embedding = cav_matrix[embedding_idxs]

In [15]:
cavs_with_embedding.shape

(363, 2048)

In [16]:
# Sanity check of the subsetting: each row of cavs_with_embedding must equal the stored
# CAV of the concept it was taken from. The cav_matrix row index is printed for every
# mismatch, so NO output means the subsetting is correct.
for subset_row, source_row in enumerate(embedding_idxs):
    stored_cav = cavs_broden[no_dups_concepts[source_row]]['cav']
    subset_cav = cavs_with_embedding[subset_row].reshape(1,-1)
    if not np.array_equal(stored_cav, subset_cav):
        print(source_row)

Create matrix with CAVs which do not have a matching word embedding

In [17]:
# Rows of cav_matrix without a GloVe match, and the concept names in the same order
cav_no_glove = np.delete(cav_matrix, embedding_idxs, axis=0)
cav_no_glove_concepts = list(filter(lambda name: name not in embedding_concepts, broden_concepts))

In [18]:
# To check if the subsetting was done correctly, the rows of cav_no_glove are compared
# with the original CAVs. When correct, this prints "Matching vectors"; otherwise the
# (row, concept) of every mismatch is printed and collected in no_match.
no_match = []
for iz, zz in enumerate(cav_no_glove_concepts):
    # The '-s' suffix was stripped from the names, so the CAV is stored either under
    # '<name>-s' or under '<name>'. Choose the key explicitly instead of relying on a
    # bare `except`, which would also hide unrelated errors.
    stored_key = zz + '-s' if zz + '-s' in cavs_broden else zz
    if not np.array_equal(cavs_broden[stored_key]['cav'], cav_no_glove[iz].reshape(1,-1)):
        no_match.append([iz,zz])
        print(iz, zz)

if len(no_match) == 0:
        print('Matching vectors')

Matching vectors


Normalize the CAV matrices to norm-1

In [19]:
# Scale every row (axis=1) to unit length with sklearn's `normalize`
# (its default norm is L2), separately for matched and unmatched CAVs.
cavs_norm = normalize(cavs_with_embedding,axis=1)
cav_no_glove_norm = normalize(cav_no_glove, axis=1)

Add the matrices back together, the first 363 samples have a correspondence in the GloVe dataset, while the other part is used to better capture the structure of the CAV manifold

In [20]:
# Matched CAVs first, unmatched CAVs after them; the name list follows the same order
cavs_sorted = np.vstack((cavs_norm, cav_no_glove_norm))
cavs_sorted_concept = [*embedding_concepts, *cav_no_glove_concepts]

In [21]:
print(cavs_sorted.shape[0] == len(cavs_sorted_concept))

True


### Include SoN image in the Manifold Alignment

In [22]:
# Read SoN info, where the ID equals the image name in the folder structure.
# The first CSV column becomes the DataFrame index; the 'Average' and 'ID' columns
# are used further down.
son_info = pd.read_csv('../data/son_votes.csv', index_col = 0)

# son_tensors is keyed by a stringified integer; entry [0] is compared against the
# image ID and entry [2] is a tensor converted with .numpy() below.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('../data/son_tensors.pickle', 'rb') as handle:
        son_tensors = pickle.load(handle)

From each score range (0-1, 1-2, ..., 9-10) 100 images are randomly sampled

In [23]:
# Draw 100 random images from every score bin (i, i+1] for i = 0..9.
# A dedicated, seeded generator makes the selection reproducible across kernel
# restarts without touching the global `random` state.
SON_SAMPLE_SEED = 42
son_sample_rng = random.Random(SON_SAMPLE_SEED)

son_manifold_imgs = []
for i in range(10):
    son_subset_idxs = list(son_info.query('Average > %s & Average <= %s' % (str(i), str(i+1))).index)
    # raises ValueError when a bin holds fewer than 100 images
    son_random_idxs = son_sample_rng.sample(son_subset_idxs, 100)
    son_manifold_imgs.extend(son_random_idxs)

In [24]:
len(son_manifold_imgs)

1000

Create a matrix of the tensors of the random selected images

In [25]:
# One 2048-d row per sampled image. A row stays all-zero (and its position is printed)
# when the ID in son_info does not match the name stored with the tensor.
son_manifold_matrix = np.zeros((len(son_manifold_imgs), 2048))
for row, df_ix in enumerate(son_manifold_imgs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # The tensor dictionary keys are shifted by one from 52642 on and by one more from
    # 201047 on - presumably two entries missing from son_info; TODO confirm.
    tensor_key = df_ix
    if tensor_key >= 52642:
        tensor_key += 1
    if tensor_key >= 201047:
        tensor_key += 1

    tensor_entry = son_tensors[str(tensor_key)]
    if expected_name == tensor_entry[0]:
        son_manifold_matrix[row] = tensor_entry[2].numpy()
    else:
        print(row)
    

Normalize the data to unit length

In [26]:
# Rescale every image vector (row) to unit length; the name is rebound in place.
son_manifold_matrix = normalize(son_manifold_matrix, axis=1)

Compute the neighbors for each image

In [27]:
# For every sampled image store the positions of its 10 most cosine-similar images.
# Rank 0 is skipped - it is assumed to be the image itself.
son_imgs_neighbors_dict = {}

for img_pos in trange(len(son_manifold_imgs)):
    query_vec = son_manifold_matrix[img_pos].reshape(1,-1)
    similarities = cosine_similarity(son_manifold_matrix, query_vec).flatten()
    ranked_desc = similarities.argsort()[::-1]
    son_imgs_neighbors_dict[str(img_pos)] = ranked_desc[1:11]

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




### Add SoN to CAVs

In [28]:
# Append the SoN image vectors below the CAV rows (row-wise concatenation), so the
# images occupy the rows from len(cavs_sorted_concept) onwards.
cavs_sorted_son = np.concatenate((cavs_sorted, son_manifold_matrix))

In [29]:
cavs_sorted_son.shape

(1649, 2048)

### Preprocess GloVe

Create a matrix of the word embeddings for the concepts which also have a CAV.

In [30]:
# Row k holds the 300-d GloVe vector of embedding_concepts[k]
embeddings_with_cav = np.zeros((len(embedding_idxs), 300))
for row, concept in enumerate(embedding_concepts):
    embeddings_with_cav[row] = embedding_dict[concept].reshape(1,-1)

In [31]:
embeddings_with_cav.shape

(363, 300)

Create matrix with GloVe embeddings which do not have a matching CAV

In [32]:
# Collect every GloVe word that has no matching CAV, together with its vector.
# A set makes the membership test O(1) instead of scanning the concept list for
# each of the dictionary's words.
matched_words = set(embedding_concepts)

# Initiate a matrix with exactly one row per unmatched word (every matched concept
# is a key of embedding_dict, so the difference is the number of rows filled below)
glove_no_cav = np.zeros((len(embedding_dict) - len(matched_words), 300))

# Also store the GloVe concept names (which do not have a match in CAV)
glove_no_cav_concept = []

idx = 0
for concpt, concpt_vec in embedding_dict.items():
    if concpt in matched_words:
        continue
    glove_no_cav_concept.append(concpt)
    glove_no_cav[idx] = concpt_vec
    idx += 1

In [33]:
glove_no_cav.shape

(117964, 300)

Normalize all data to norm-1

In [34]:
# Scale every embedding (row) to unit length with sklearn's `normalize`
# (default norm is L2), separately for matched and unmatched words.
glove_norm = normalize(embeddings_with_cav,axis=1)
glove_no_cav_norm = normalize(glove_no_cav, axis=1)

Find the nearest neighbors of the GloVe embeddings which do have correspondence with CAVs. These are used in the manifold alignment to preserve the structure of the GloVe data

Create a list of unique indices of the nearest neighbors of the GloVe embeddings which have a correspondence with a CAV

In [35]:
# takes about 3min to run
# For every matched embedding take its 10 most cosine-similar unmatched words and
# collect their row indices (into glove_no_cav_norm) without duplicates, in order
# of first appearance.
glove_neighbors = []
for k in trange(embeddings_with_cav.shape[0]):
    similarities = cosine_similarity(glove_no_cav_norm, glove_norm[k].reshape(1,-1))
    nearest_rows = similarities.argsort(axis=0)[::-1][:10].ravel().tolist()
    for neighbor_row in nearest_rows:
        if neighbor_row in glove_neighbors:
            continue
        glove_neighbors.append(neighbor_row)


HBox(children=(IntProgress(value=0, max=363), HTML(value='')))




In [36]:
print('The number of unique neighours found:', len(glove_neighbors))

The number of unique neighours found: 2646


Extract the nearest neighbours from the data

In [37]:
# Normalised vectors and names of the selected neighbour words, in the same order
glove_no_cav_nneigh = glove_no_cav_norm[glove_neighbors]
glove_no_cav_concept_nneigh = []
for neighbor_row in glove_neighbors:
    glove_no_cav_concept_nneigh.append(glove_no_cav_concept[neighbor_row])

In [38]:
# Sanity check: the un-normalised vector of every selected neighbour must equal the
# vector stored for that word in embedding_dict. Nothing is printed when all match.
for neighbor_row, neighbor_word in zip(glove_neighbors, glove_no_cav_concept_nneigh):
    stored_vec = embedding_dict[neighbor_word]
    selected_vec = glove_no_cav[neighbor_row].reshape(1,-1)
    if not np.array_equal(stored_vec, selected_vec):
        print('The data do not correspond for concept:', neighbor_word)

Concatenate the GloVe data together; the first 363 rows have a correspondence in the CAV dataset. The other data is used to preserve the structure of the GloVe manifold during the alignment

In [39]:
# Matched embeddings first, their neighbour words after them; names in the same order
glove_sorted = np.vstack((glove_norm, glove_no_cav_nneigh))
glove_sorted_concept = [*embedding_concepts, *glove_no_cav_concept_nneigh]

## Validate the sorted datasets

In [40]:
# Validate glove_sorted: every row must equal the normalised row of the same word in
# the full GloVe matrix. The word is printed for each mismatch (no output = valid).
glove_embedding_matrix_norm = normalize(glove_embedding_matrix)

# word -> first row index, built once; replaces a linear list.index() scan per word
glove_word_row = {}
for row, word in enumerate(glove_words):
    glove_word_row.setdefault(word, row)

for ip, p in enumerate(glove_sorted_concept):
    # a word missing from glove_words raises KeyError here
    origin_vec = glove_embedding_matrix_norm[glove_word_row[p]].reshape(1,-1)
    if not np.array_equal(origin_vec, glove_sorted[ip].reshape(1,-1)):
        print(p)

---

In [41]:
# Transpose both data sets to (n_features, n_samples); this is the layout passed to
# the alignment routines below.
cavs_sorted_t = cavs_sorted_son.T
glove_sorted_t = glove_sorted.T

In [None]:
%store cavs_sorted_t
%store glove_sorted_t

In [None]:
print('Amount of CAVs used:', cavs_sorted_son.shape[0])
print('Amount of GloVes used:', glove_sorted.shape[0])

## Manifold alignment using Linear Kernel

Function to solve the generalized eigenvalue decomposition, which is a copy of the MatLab implementation of Devis Tuia

To perform manifold alignment, the MATLAB code from Devis Tuia is followed: https://github.com/dtuia/KEMA/blob/master/general_routine/KMA.m

In [None]:
# Align the CAV(+SoN) and GloVe data. The number of corresponding samples is derived
# from the matched concept list (the first rows of both data sets) instead of being
# hard-coded, so it stays correct when the inputs change.
# NOTE: `lanbda` is the keyword name used by this call and is kept as is.
x1, x2, eigenvectors, eigenvalues = kernel_manifold_alignment(cavs_sorted_t, glove_sorted_t, mu = 0.9,
                                                              lanbda = 0.5, n_neighbors = 10, n_eigs = 2,
                                                              n_correspondence = len(embedding_concepts))

---

## Manifold Alignment Wang '11

In [42]:
# Project the CAV(+SoN) samples onto 50 principal components; cavs_sorted_t is
# transposed back so that rows are samples.
# NOTE(review): no random_state is passed, so the result may vary between runs if a
# randomized solver is selected - confirm whether that matters here.
pca = PCA(n_components=50)
X1_trans = pca.fit_transform(cavs_sorted_t.T)

In [43]:
X1_trans.shape

(1649, 50)

In [None]:
# NOTE(review): manifold_alignment_wang is neither defined nor imported in this
# notebook - it must be brought into scope before this cell can run.
# The sample counts are derived from the data instead of being hard-coded:
#   n_correspondence = number of concepts present in both domains
#   n_cavs           = number of CAV rows that precede the SoN images
z1, z2, eigenvectors, eigenvalues = manifold_alignment_wang(cavs_sorted_t, glove_sorted_t, mu = 0.9,
                                                            lanbda = 0.5, n_neighbors = 10,
                                                            n_eigs = 50,
                                                            n_correspondence = len(embedding_concepts),
                                                            n_cavs = len(cavs_sorted_concept))

The accuracy of different alignment is stored:
- dimensions keep track of the different dimensions used
- accuracy: is the accuracy of nearest neighbor alignment
- variables: 
    - T1: mu = 0.5
    - T2: mu = 0.6
    - T3: mu = 0.7
    - T4: mu = 0.8
    - T5: mu = 0.9

In [46]:
# Resume from the stored accuracy log when it exists, otherwise start an empty one
if not os.path.exists('../data/manifold_alignment_accuracy.pickle'):
    manifold_alignment_accuracy = {'dimensions':[],
                                  'CAV_accuracy': [],
                                  'variables': [],
                                  'GloVe_accuracy':[]}
else:
    with open('../data/manifold_alignment_accuracy.pickle', 'rb') as handle:
        manifold_alignment_accuracy = pickle.load(handle)
    

In [49]:
# NOTE(review): x1_graph is not defined anywhere in this notebook - presumably a
# neighbourhood graph over the rows of cavs_sorted_son; confirm where it comes from.
# Slice: rows after the CAV concepts x columns of the CAV concepts.
son_imgs_neighbors_matrix = x1_graph[len(cavs_sorted_concept):, :len(cavs_sorted_concept)]
son_imgs_neighbors_matrix.shape

(1000, 649)

In [50]:
# # Print neighbors in the glove dataset
# glove_neighbors_dict = {}
# for ix, c in enumerate(glove_sorted_concept):
# #     glove_cos_neighbors = np.where(x2_graph[ix] ==1)[0]
#     glove_neighbors_dict[c] = []

#     print(c)
#     print('---------')
#     for n in np.nditer(glove_cos_neighbors):
#         print(glove_sorted_concept[n.item()])
#         glove_neighbors_dict[c].append(glove_sorted_concept[n.item()])
#     print('\n')
# #     break

## Accuracy of alignment

### CAV domain to CAV common space comparison

In [None]:
# Share (in %) of each CAV's 10 nearest common-space CAVs that are also marked as its
# neighbours (value 1) in the CAV block of x1_graph.
# NOTE(review): x1_graph, XT1toF and n_eigs are not defined in this notebook.
n_cav_rows = len(cavs_sorted_concept)
cav_cav_nn = x1_graph[:n_cav_rows, :n_cav_rows]

cav_cav_common_accuracy = 0
cav_cav_total_neighbours = 0

for c, v in tqdm(enumerate(cavs_sorted_concept)):
    query_vec = XT1toF.T[c].reshape(-1,n_eigs)
    cossim_cav = cosine_similarity(XT1toF.T[:n_cav_rows], query_vec)
    # rank 0 is skipped (assumed to be the query itself)
    cossim_cav_idxs = cossim_cav.argsort(axis=0)[::-1][1:11].flatten()
    cav_cav_total_neighbours += len(cossim_cav_idxs)

    cav_nn_cavdomain = np.where(cav_cav_nn[c] == 1)[0].flatten()
    correct_cav_nn = sum(1 for z in np.nditer(cossim_cav_idxs) if z.item() in cav_nn_cavdomain)

    cav_cav_common_accuracy += correct_cav_nn
cav_cav_common_accuracy = cav_cav_common_accuracy / cav_cav_total_neighbours * 100

print('CAV to CAV accuracy:', round(cav_cav_common_accuracy, 3),'%')

Calculate an accuracy metric of the manifold alignment. The nearest neighbors of the transformed CAVs are calculated and compared with the nearest neighbors of this concept in the GloVe domain. The comparison is based on exact matches and not on semantically similar words

In [None]:
# Fraction of the transformed-CAV neighbours (ranks 1..10 among the transformed GloVe
# vectors) whose word is also a GloVe-domain neighbour of the same concept.
# NOTE(review): XT1toF, XT2toF, n_eigs and glove_neighbors_dict are not defined in
# this notebook.
average_nn_accuracy = 0
total_neighbours_cav = 0

for p, w in tqdm(enumerate(embedding_concepts)):
    query_vec = XT1toF.T[p].reshape(-1,n_eigs)
    ranked_desc = cosine_similarity(XT2toF.T, query_vec).argsort(axis=0)[::-1]
    top_10_nn = ranked_desc[1:11]
    total_neighbours_cav += len(top_10_nn)

    glove_domain_nn = glove_neighbors_dict[w]
    correct_nn = sum(1 for e in top_10_nn if glove_sorted_concept[e.item()] in glove_domain_nn)
    average_nn_accuracy += correct_nn

average_nn_accuracy /= total_neighbours_cav
# manifold_alignment_accuracy['CAV_accuracy'].append(average_nn_accuracy)

print("CAV to GloVe accuracy: ", round(average_nn_accuracy,3) * 100)

Compare the GloVe neighbors in the common feature space to the GloVe neighbors in the GloVe domain. The accuracy is based on exact matches and not on semantically similar words

In [None]:
# Fraction of each transformed GloVe vector's neighbours (ranks 1..10 in the common
# space) whose word is also a neighbour of the same concept in the GloVe domain.
# NOTE(review): XT2toF, n_eigs and glove_neighbors_dict are not defined in this notebook.
glove_glove_accuracy = 0
total_neighbours_glove = 0

for p, w in tqdm(enumerate(embedding_concepts)):
    query_vec = XT2toF.T[p].reshape(-1,n_eigs)
    ranked_desc = cosine_similarity(XT2toF.T, query_vec).argsort(axis=0)[::-1]
    top_10_nn = ranked_desc[1:11]
    total_neighbours_glove += len(top_10_nn)

    glove_domain_nn = glove_neighbors_dict[w]
    correct_nn = sum(1 for e in top_10_nn if glove_sorted_concept[e.item()] in glove_domain_nn)
    glove_glove_accuracy += correct_nn

glove_glove_accuracy /= total_neighbours_glove
# manifold_alignment_accuracy['GloVe_accuracy'].append(glove_glove_accuracy)

print("GloVe to GloVe accuracy: ", round(glove_glove_accuracy,3) * 100)

In [None]:
# manifold_alignment_accuracy['dimensions']

In [None]:
# Line plot of the stored CAV -> GloVe accuracy against the number of dimensions,
# one line per entry in 'variables'.
sns.set()
# despite its name, `fig` holds whatever sns.lineplot returns (an Axes), which is
# why axis labels can be set on it below
fig = sns.lineplot(x='dimensions', y='CAV_accuracy', hue='variables', data=manifold_alignment_accuracy)
plt.title('CAV to GloVe accuracy in common feature space')
fig.set(xlabel='Number of dimensions', ylabel='Accuracy')
plt.show()

In [None]:
# Line plot of the stored GloVe -> GloVe accuracy against the number of dimensions,
# one line per entry in 'variables'.
sns.set()
ax = sns.lineplot(x='dimensions', y='GloVe_accuracy', hue = 'variables', data=manifold_alignment_accuracy)
plt.title('Glove to Glove accuracy in the common feature space')
# label THIS plot's axes (the original labelled the axes object of the previous cell)
ax.set(xlabel = 'Number of dimensions', ylabel= 'Accuracy')
plt.show()

In [None]:
# Persist the accuracy log so the next session can resume from it (see the loading
# cell that reads the same path).
with open('../data/manifold_alignment_accuracy.pickle', 'wb') as handle:
        pickle.dump(manifold_alignment_accuracy, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Print, for every transformed CAV concept, the 10 closest transformed GloVe concepts
# in the common feature space.
# NOTE(review): n_eigs, MUW, LAMBDAW, n_samples, XT1toF and XT2toF are not defined in
# this notebook.
print("Dim:", n_eigs)
print('MU:', MUW)
print('LANBDA:', LAMBDAW,'\n')
for i in range(n_samples):
    cosim = cosine_similarity(XT2toF.T, XT1toF.T[i].reshape(-1,n_eigs))
    ixs = cosim.argsort(axis=0)[::-1][0:10]
    print('Transformed CAV concept:',embedding_concepts[i])
    print('Closest aligned transformed GloVe concepts:')
    print('-------')
    for j in ixs:
        try:
            print(glove_sorted_concept[j.item()])
        except IndexError:
            # best effort: skip rows that have no concept name; other errors propagate
            pass
    print('\n')


In [None]:
# Print, for every transformed GloVe concept, the 10 closest transformed GloVe
# concepts in the common feature space.
# NOTE(review): n_eigs, MUW, LAMBDAW, n_samples and XT2toF are not defined in this
# notebook.
print("Dim:", n_eigs)
print('MU:', MUW)
print('LANBDA:', LAMBDAW,'\n')
for i in range(n_samples):
    cosim = cosine_similarity(XT2toF.T, XT2toF.T[i].reshape(-1,n_eigs))
    ixs = cosim.argsort(axis=0)[::-1][0:10]
    print('Transformed GloVe concept:',embedding_concepts[i])
    print('Closest aligned transformed GloVe concepts:')
    print('-------')
    for j in ixs:
        try:
            print(glove_sorted_concept[j.item()])
        except IndexError:
            # best effort: skip rows that have no concept name; other errors propagate
            pass
    print('\n')

In [None]:
# For every SoN image compare its neighbours in x1_graph with its nearest neighbours
# in the common space, separately for neighbouring images and neighbouring CAVs.
# Fixes over the previous version:
#   * the stray `break` that stopped after the first image is removed,
#   * the CAV hits are now added to cav_accuracy (it always stayed 0),
#   * cavs_neigh is taken from the row of image i instead of the whole image block.
# NOTE(review): x1_graph, XT1toF and n_eigs are not defined in this notebook.
n_cav_rows = len(cavs_sorted_concept)

img_accuracy = 0
cav_accuracy = 0

for i in range(len(son_manifold_imgs)):
    img_row = n_cav_rows + i

    # non-zero entries of this image's graph row, split into image and CAV columns
    imgs_neigh = np.where(x1_graph[img_row, n_cav_rows:])[0]
    cavs_neigh = np.where(x1_graph[img_row, :n_cav_rows])[0]

    query_vec = XT1toF.T[img_row].reshape(-1, n_eigs)
    cos_imgs = cosine_similarity(XT1toF.T[n_cav_rows:], query_vec)
    cos_cavs = cosine_similarity(XT1toF.T[:n_cav_rows], query_vec)

    # rank 0 among the images is skipped (assumed to be the image itself); among the
    # CAVs all top-10 ranks are kept
    imgs_nn_ixs = cos_imgs.argsort(axis=0)[::-1][1:11].flatten()
    cavs_nn_ixs = cos_cavs.argsort(axis=0)[::-1][:10].flatten()

    correct_img = sum(1 for j in imgs_nn_ixs.tolist() if j in imgs_neigh)
    correct_cav = sum(1 for q in cavs_nn_ixs.tolist() if q in cavs_neigh)

    img_accuracy += correct_img
    cav_accuracy += correct_cav

img_accuracy = img_accuracy/(len(son_manifold_imgs) * 10)
cav_accuracy = cav_accuracy/(len(son_manifold_imgs) * 10)
print('Accuracy of neighbouring images:', img_accuracy*100)
print('Accuracy of neighbouring CAVs:', cav_accuracy*100)
    

In [None]:
len(imgs_neigh)

## Explore new concepts related to scenicness

Select the 1000 most scenic images in the ScenicOrNot dataset

In [None]:
# Rank all SoN images from the highest to the lowest 'Average' score
sorted_son_info = son_info.sort_values(by='Average', ascending=False)

# Index labels of the 1000 top-ranked and of the 1000 bottom-ranked images
ranked_labels = sorted_son_info.index
most_scenic_ixs = np.asarray(ranked_labels[:1000])
least_scenic_ixs = np.asarray(ranked_labels[-1000:])

In [None]:
# One 2048-d row per most-scenic image. A row stays all-zero (and its position is
# printed) when the ID in son_info does not match the name stored with the tensor.
most_scenic_matrix = np.zeros((len(most_scenic_ixs), 2048))
for row, df_ix in enumerate(most_scenic_ixs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # tensor keys are shifted by one from 52642 on and by one more from 201047 on
    # (presumably entries missing from son_info - TODO confirm)
    tensor_key = df_ix
    if tensor_key >= 52642:
        tensor_key += 1
    if tensor_key >= 201047:
        tensor_key += 1

    tensor_entry = son_tensors[str(tensor_key)]
    if expected_name == tensor_entry[0]:
        most_scenic_matrix[row] = tensor_entry[2].numpy()
    else:
        print(row)

In [None]:
# One 2048-d row per least-scenic image. A row stays all-zero (and its position is
# printed) when the ID in son_info does not match the name stored with the tensor.
least_scenic_matrix = np.zeros((len(least_scenic_ixs), 2048))
for row, df_ix in enumerate(least_scenic_ixs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # tensor keys are shifted by one from 52642 on and by one more from 201047 on
    # (presumably entries missing from son_info - TODO confirm)
    tensor_key = df_ix
    if tensor_key >= 52642:
        tensor_key += 1
    if tensor_key >= 201047:
        tensor_key += 1

    tensor_entry = son_tensors[str(tensor_key)]
    if expected_name == tensor_entry[0]:
        least_scenic_matrix[row] = tensor_entry[2].numpy()
    else:
        print(row)

In [None]:
# Rescale every image vector to unit length (sklearn `normalize` defaults: row-wise,
# L2 norm); both names are rebound in place.
most_scenic_matrix = normalize(most_scenic_matrix)
least_scenic_matrix = normalize(least_scenic_matrix)

### Check for CAV concepts in the CAV domain for most scenic images

In [None]:
# For every most-scenic image collect the names of its 10 most cosine-similar CAVs
# (CAV domain); a concept appears once per image it is close to.
scenic_cav_concepts = []
for im in trange(most_scenic_matrix.shape[0]):
    image_vec = most_scenic_matrix[im].reshape(1,-1)
    cos_sim_cav_idxs = cosine_similarity(cavs_sorted, image_vec).flatten()
    top10_cav = cos_sim_cav_idxs.argsort()[::-1][:10] # select the closest 10 neighbours to every image
    # add the concept to the list
    for ix in np.nditer(top10_cav):
        concept_name = cavs_sorted_concept[ix.item()]
        scenic_cav_concepts.append(concept_name)

In [None]:
# Frequency table: how often each CAV concept occurs among the collected neighbours
# of the most scenic images, most frequent first.
unique_scenic_cav_concepts = list(set(scenic_cav_concepts))
cav_concepts_dict = {
    'Concepts': cavs_sorted_concept,
    'Frequency': [scenic_cav_concepts.count(name) for name in cavs_sorted_concept],
}

cav_concepts_df = pd.DataFrame(cav_concepts_dict).sort_values(by='Frequency', ascending=False)

In [None]:
cav_concepts_df.head(10)

### Transform the images to the common space

In [None]:
# Transpose to (n_features, n_images), the layout multiplied with E1.T below
most_scenic_matrix_t = most_scenic_matrix.T
least_scenic_matrix_t = least_scenic_matrix.T

Transform the images to the common feature space

In [None]:
# Project the image vectors with E1.T.
# NOTE(review): E1 is not defined anywhere in this notebook - presumably the
# projection of the CAV domain obtained from the alignment; confirm its origin.
scenic_imgs_transform = np.matmul(E1.T, most_scenic_matrix_t)
unscenic_imgs_transform = np.matmul(E1.T, least_scenic_matrix_t)

In [None]:
# Standardise the projected images with the mean (m1) and standard deviation (s1) of
# the training data: subtract m1 and divide by s1 for every image, then transpose
# back to (n_dims, n_images).
# Broadcasting replaces the explicit np.matlib.repmat tiling; the per-set statistics
# (m3, m4, s3, s4) that were computed but never used have been removed.
# NOTE(review): m1 and s1 are not defined in this notebook - they are assumed to hold
# one value per common-space dimension; confirm.
scenic_imgs = ((scenic_imgs_transform.T - m1) / s1).T

unscenic_imgs = ((unscenic_imgs_transform.T - m1) / s1).T

### Check for CAV concepts in the common space for the most scenic images

In [None]:
# For every most-scenic image collect the names of the 10 nearest transformed CAVs in
# the common space. The CAV rows are selected with len(cavs_sorted_concept) instead of
# the hard-coded 649, so the slice follows the data.
# NOTE(review): XT1toF is not defined in this notebook.
new_cav_concept_commonspace = []
for i in trange(scenic_imgs.shape[1]):
    sc_idxs = cosine_similarity(XT1toF.T[:len(cavs_sorted_concept)], scenic_imgs.T[i].reshape(1,-1)).flatten()
    sc_idxs = sc_idxs.argsort()[::-1][:10]
    for ix in np.nditer(sc_idxs):
        cav_commonspace_concept = cavs_sorted_concept[ix.item()]
        new_cav_concept_commonspace.append(cav_commonspace_concept)


In [None]:
# Frequency table of the CAV concepts found near the most scenic images in the common
# space, most frequent first.
unique_new_cav_concepts = list(set(new_cav_concept_commonspace))
cav_commonspace_dict = {
    'Concept': cavs_sorted_concept,
    'Frequency': [new_cav_concept_commonspace.count(name) for name in cavs_sorted_concept],
}

cav_commonspace_df = pd.DataFrame(cav_commonspace_dict).sort_values(by='Frequency', ascending=False)

In [None]:
cav_commonspace_df.head(10)

### Check for new concepts in the Glove data in the common space for the most scenic images

In [None]:
# For every most-scenic image collect the words of the 10 nearest transformed GloVe
# vectors in the common space.
# NOTE(review): XT2toF is not defined in this notebook.
new_glove_concept_commonspace = []
for i in trange(len(most_scenic_ixs)):
    image_vec = scenic_imgs.T[i].reshape(1,-1)
    similarities = cosine_similarity(XT2toF.T, image_vec)
    sc_idxs = similarities.argsort(axis=0)[::-1][:10]
    for ix in np.nditer(sc_idxs):
        new_glove_concept_commonspace.append(glove_sorted_concept[ix.item()])

In [None]:
# Frequency table of the GloVe words found near the most scenic images in the common
# space, most frequent first.
glove_commonspace_dict = {
    'Concept': glove_sorted_concept,
    'Frequency': [new_glove_concept_commonspace.count(word) for word in glove_sorted_concept],
}

glove_commonspace_df = pd.DataFrame(glove_commonspace_dict).sort_values(by='Frequency', ascending=False)

In [None]:
glove_commonspace_df.head(10)

---

### Check the similar CAVs to the least scenic images in the CAV domain

In [None]:
# For every least-scenic image collect the names of its 10 most cosine-similar CAVs
# (CAV domain); a concept appears once per image it is close to.
unscenic_cav_concepts = []
for im in trange(least_scenic_matrix.shape[0]):
    image_vec = least_scenic_matrix[im].reshape(1,-1)
    cos_unsim_cav_idxs = cosine_similarity(cavs_sorted, image_vec)
    top10_cav = cos_unsim_cav_idxs.argsort(axis=0)[::-1][:10] # select the closest 10 neighbours to every image
    # add the concept to the list
    for ix in np.nditer(top10_cav):
        concept_name = cavs_sorted_concept[ix.item()]
        unscenic_cav_concepts.append(concept_name)

In [None]:
# Frequency table: how often each CAV concept occurs among the collected neighbours
# of the least scenic images, most frequent first.
unique_unscenic_cav_concepts = list(set(unscenic_cav_concepts))
uncav_concepts_dict = {
    'Concepts': cavs_sorted_concept,
    'Frequency': [unscenic_cav_concepts.count(name) for name in cavs_sorted_concept],
}

uncav_concepts_df = pd.DataFrame(uncav_concepts_dict).sort_values(by='Frequency', ascending=False)

In [None]:
uncav_concepts_df.head(10)

### Check for CAV concepts in the common domain for the least scenic images

In [None]:
# For every least-scenic image collect the names of the 10 nearest transformed CAVs in
# the common space. The CAV rows are selected with len(cavs_sorted_concept) instead of
# the hard-coded 649, so the slice follows the data.
# NOTE(review): XT1toF is not defined in this notebook.
unscenic_cavs_concept_common = []
for i in trange(unscenic_imgs.shape[1]):
    unsc_idxs = cosine_similarity(XT1toF.T[:len(cavs_sorted_concept)], unscenic_imgs.T[i].reshape(1,-1)).flatten()
    unsc_idxs = unsc_idxs.argsort()[::-1][:10]
    for ix in np.nditer(unsc_idxs):
        unscenic_cavs_concept_common.append(cavs_sorted_concept[ix.item()])

In [None]:
# Frequency table of the CAV concepts found near the least scenic images in the common
# space, most frequent first.
unscenic_cavs_common_dict = {
    'Concepts': cavs_sorted_concept,
    'Frequency': [unscenic_cavs_concept_common.count(name) for name in cavs_sorted_concept],
}

unscenic_cavs_common_df = pd.DataFrame(unscenic_cavs_common_dict).sort_values(by='Frequency', ascending=False)

In [None]:
unscenic_cavs_common_df.head(10)

### Check for GloVe concepts in the common domain for the least scenic images

In [None]:
# For every least-scenic image collect the words of the 10 nearest transformed GloVe
# vectors in the common space.
# NOTE(review): XT2toF is not defined in this notebook.
new_glove_unscenic_concept_commonspace = []
for i in trange(len(least_scenic_ixs)):
    image_vec = unscenic_imgs.T[i].reshape(1,-1)
    similarities = cosine_similarity(XT2toF.T, image_vec)
    unsc_idxs = similarities.argsort(axis=0)[::-1][:10]
    for ix in np.nditer(unsc_idxs):
        new_glove_unscenic_concept_commonspace.append(glove_sorted_concept[ix.item()])

In [None]:
# Frequency table restricted to the GloVe words that actually occur near the least
# scenic images in the common space, most frequent first.
unique_unscenic_glove_concepts = list(set(new_glove_unscenic_concept_commonspace))
unglove_commonspace_dict = {
    'Concept': unique_unscenic_glove_concepts,
    'Frequency': [new_glove_unscenic_concept_commonspace.count(word)
                  for word in unique_unscenic_glove_concepts],
}

unglove_commonspace_df = pd.DataFrame(unglove_commonspace_dict).sort_values(by='Frequency', ascending=False)

In [None]:
unglove_commonspace_df.head(10)

## Convert images to CAVs

Get most scenic image tensor

In [None]:
# Take the top-ranked (most scenic) image as the positive example.
# test_img_ix stays the DataFrame index label; the shifted key needed for the tensor
# dictionary is kept in a separate variable. Previously the shift was applied to
# test_img_ix itself, so later `.loc[test_img_ix, ...]` lookups could hit another row.
test_img_ix = sorted_son_info.iloc[0,].name
img_id = sorted_son_info.loc[test_img_ix, 'ID']

# tensor keys are shifted by one from 52642 on and by one more from 201047 on
# (presumably entries missing from son_info - TODO confirm)
test_tensor_key = test_img_ix
if test_tensor_key >= 52642:
    test_tensor_key += 1
if test_tensor_key >= 201047:
    test_tensor_key += 1

test_img_tensor = son_tensors[str(test_tensor_key)][2].numpy().reshape(1,-1)

# label of the single positive sample
y1 = np.asarray([1])

In [None]:
sorted_son_info.head(10)

Sample counter images

In [None]:
# Sample 100 counter (negative) images from every score bin (i, i+1] for i = 1..7 and
# store their tensor-dictionary keys. A dedicated, seeded generator makes the
# selection reproducible without touching the global `random` state.
COUNTER_SAMPLE_SEED = 42
counter_sample_rng = random.Random(COUNTER_SAMPLE_SEED)

counter_imgs_idxs = []
for i in range(1,8):
    imgs_idxs = list(sorted_son_info.query('Average > %s & Average <= %s' % (str(i), str(i+1))).index)
    # raises ValueError when a bin holds fewer than 100 images
    sample_imgs_idxs = counter_sample_rng.sample(imgs_idxs, 100)
    for ix in sample_imgs_idxs:
        # tensor keys are shifted by one from 52642 on and by one more from 201047 on
        if ix >= 52642:
            ix +=1
        if ix >= 201047:
            ix += 1
        counter_imgs_idxs.append(ix)
# labels of the negative samples
y2 = np.zeros(len(counter_imgs_idxs))

In [None]:
# One row per counter image, with as many columns as the test image vector
n_tensor_features = test_img_tensor.shape[1]
counter_imgs_matrix = np.zeros((len(counter_imgs_idxs), n_tensor_features))
for row, tensor_key in enumerate(counter_imgs_idxs):
    counter_imgs_matrix[row] = son_tensors[str(tensor_key)][2].numpy()

Merge data and labels

In [None]:
# Positive sample first, counter samples after it; labels in the same order
X = np.vstack((test_img_tensor, counter_imgs_matrix))
y = np.hstack((y1, y2))

In [None]:
# Fit a linear classifier separating the test image from the counter images; its
# weight vector (coef_) is used as the image's CAV.
# random_state is fixed because SGD shuffles the training data, which would otherwise
# give a different CAV on every run.
lm = linear_model.SGDClassifier(random_state=0)
lm.fit(X, y)
lm_cav = lm.coef_

Transform image CAV to common feature space

In [None]:
# Scale the image CAV to unit length and transpose it to a column, the layout
# multiplied with E1.T below.
img_cav = normalize(lm_cav)
img_cav_t = img_cav.T

In [None]:
lm_cav.shape

In [None]:
# Project the image CAV with E1.T and standardise it with the training mean (m1) and
# standard deviation (s1). Broadcasting replaces the explicit np.matlib.repmat tiling.
# NOTE(review): E1, m1 and s1 are not defined in this notebook; m1/s1 are assumed to
# hold one value per common-space dimension - confirm.
cav_projected = np.matmul(E1.T, img_cav_t)

cav_transform = ((cav_projected.T - m1) / s1).T


Check for aligned transformed CAVs

In [None]:
# Names of the 10 transformed CAVs closest to the transformed image CAV. The CAV rows
# are selected with len(cavs_sorted_concept) instead of the hard-coded 649.
# NOTE(review): XT1toF is not defined in this notebook.
aligned_cavs = []
cavs_idxs = cosine_similarity(XT1toF.T[:len(cavs_sorted_concept)], cav_transform.T.reshape(1,-1)).flatten()
cavs_idxs = cavs_idxs.argsort()[::-1][:10]
for ix in np.nditer(cavs_idxs):
    aligned_cavs.append(cavs_sorted_concept[ix.item()])

In [None]:
aligned_cavs

In [None]:
import glob
def getPath(img_name, root='/raid/data/datasets/SoN/images'):
    """Return the path of ``<img_name>.jpg`` somewhere below ``root``.

    The directory tree is walked top-down and the first directory containing the
    file wins, so the search stops as soon as a match is found.

    Parameters
    ----------
    img_name : str
        File name without the ``.jpg`` extension.
    root : str, optional
        Directory to search; defaults to the SoN image folder.

    Returns
    -------
    str
        Path of the first matching file.

    Raises
    ------
    FileNotFoundError
        If no matching file exists below ``root`` (previously an uninformative
        IndexError was raised).
    """
    # escape glob metacharacters so names/directories containing e.g. '[' still match
    file_name = glob.escape(img_name) + '.jpg'
    for directory, _ , _ in os.walk(root):
        matches = glob.glob(os.path.join(glob.escape(directory), file_name))
        if matches:
            return matches[0]

    raise FileNotFoundError("No image named '%s.jpg' found below '%s'" % (img_name, root))

In [None]:
# Use the ID that was looked up before the tensor-key offset was applied: re-reading
# it with `.loc[test_img_ix, 'ID']` can address a different row once test_img_ix has
# been shifted.
img_name = img_id
test_img_path = getPath(str(img_name))

In [None]:
# Display the test image without grid lines or axes
im = plt.imread(test_img_path)
plt.imshow(im)
# pass the visibility positionally: the `b` keyword is not accepted by recent
# Matplotlib versions (renamed to `visible`)
plt.grid(False)
plt.axis('off')
plt.show()

In [None]:
test_img_path