In [1]:
import os
import pickle
import random
from collections import Counter

import nbimporter
import numpy as np
import numpy.matlib
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import linalg, sparse, stats
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors, kneighbors_graph
from sklearn.preprocessing import normalize, scale
from tqdm.notebook import tqdm, trange

from KEMA import gen_eig, kernel_manifold_alignment, manifold_alignment_wang

Importing Jupyter notebook from KEMA.ipynb


Initialize an English dictionary to filter words from the GloVe dataset. The GloVe dataset used was parsed from Wikipedia and thus contains a lot of noise, which is filtered out using a spell check.

In [2]:
import hunspell
spellchecker = hunspell.HunSpell('../data/hunspell/en_US.dic',
                                 '../data/hunspell/en_US.aff')

In [3]:
GLOVE_PATH = '/mnt/guanabana/raid/data/datasets/GloVe/pretrained/glove.6B.300d.txt'
CAVS_PATH = '../data/filtered_broden_cavs.pickle'
EMBEDDING_PATH = '../data/word_embedding_6B_300D.pickle'
TEXTURE_PATH = "/raid/data/datasets/broden1_384/c_texture.csv"

In [4]:
textures = pd.read_csv(TEXTURE_PATH, index_col = 0)
texture_list = list(textures['name'])

Read the concept activation vectors from the Broden dataset as a dictionary

In [5]:
with open(CAVS_PATH, 'rb') as handle:
        cavs_broden = pickle.load(handle)

### Import GloVe

Extract the word embeddings from the GloVe dataset. The word embeddings are stored in a dictionary, with the word as key and the vector as value, and in a matrix (n_samples x n_features). The original dataset contains about 400K words. All the words are run through a spell checker; if a word is not present in the English dictionary which was used, the word is removed. In total about 280000 words are removed from the GloVe dataset.

In [6]:
# Load the spell-check-filtered GloVe embeddings. Both cache files are required for the
# cached branch; when either is missing the raw GloVe file is parsed and the cache rebuilt.
GLOVE_MATRIX_PATH = '../data/glove_embedding_matrix.npy'

if os.path.exists(EMBEDDING_PATH) and os.path.exists(GLOVE_MATRIX_PATH):
    # NOTE: unpickling can execute arbitrary code - only load a cache written by this notebook
    with open(EMBEDDING_PATH, 'rb') as handle:
        embedding_dict = pickle.load(handle)

    glove_embedding_matrix = np.load(GLOVE_MATRIX_PATH)
    glove_words = list(embedding_dict.keys())

else:
    embedding_dict = {}
    glove_words = []
    # rows are collected in a list so the number of words in the GloVe file need not be known upfront
    glove_rows = []

    # parse through the GloVe data
    with open(GLOVE_PATH, 'r', encoding="utf-8") as f:
        for line in tqdm(f):

            values = line.split()
            if not values:
                continue  # skip blank lines
            word = values[0]
            vector = np.asarray(values[1:], 'float64').reshape(1,-1)

            # apply a spell check; a word for which the check raises UnicodeEncodeError is
            # skipped, any other error is no longer silently swallowed
            try:
                is_english = spellchecker.spell(word)
            except UnicodeEncodeError:
                continue

            if is_english:
                embedding_dict[word] = vector
                glove_rows.append(vector)
                glove_words.append(word)

    with open(EMBEDDING_PATH, 'wb') as handle:
        pickle.dump(embedding_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # only the words accepted by the spell checker were collected, so the matrix holds
    # exactly one row per kept word
    glove_embedding_matrix = np.vstack(glove_rows) if glove_rows else np.zeros((0, 300))
    np.save(GLOVE_MATRIX_PATH, glove_embedding_matrix)

In [7]:
print('The amount of words left in the GloVe dataset:', glove_embedding_matrix.shape[0])

The amount of words left in the GloVe dataset: 118327


### Preprocess CAVs

Select the concepts from the Broden dataset, which are also available as word embedding. Several concepts in the Broden dataset end with _'-s'_. This is removed to get more concepts linked with an embedding. 

In [8]:
# list all Broden concepts
# every concept name stored in the Broden CAV dictionary
all_broden_concepts = list(cavs_broden.keys())

# Drop a concept when its '-s' variant is present as well (e.g. only mountain-s is kept when
# mountain and mountain-s are both in the dataset) and drop every texture concept
no_dups_concepts = []
for c in all_broden_concepts:
    has_s_variant = c + '-s' in all_broden_concepts
    if not has_s_variant and c not in texture_list:
        no_dups_concepts.append(c)

In [9]:
print('Amount of CAVs left:', len(no_dups_concepts))

Amount of CAVs left: 649


In [10]:
# stack the CAV of every remaining concept into one (n_concepts, n_features) matrix
n_cav_features = cavs_broden[no_dups_concepts[0]]['cav'].shape[1]
cav_matrix = np.zeros((len(no_dups_concepts), n_cav_features))
for row, concept in enumerate(no_dups_concepts):
    cav_matrix[row] = cavs_broden[concept]['cav']

# strip a trailing '-s' from the concept names
broden_concepts = [c[:-2] if c.endswith('-s') else c for c in no_dups_concepts]

In [11]:
cav_matrix.shape

(649, 2048)

Extract concepts which have a CAV and a correspondence in the GloVe data. The index is stored to remove the unavailable concepts from the CAV matrix

In [12]:
# pair every Broden concept that is also a GloVe word with its row index in cav_matrix
matched_concepts = [(ix, c) for ix, c in enumerate(broden_concepts) if c in embedding_dict]
embedding_idxs = [ix for ix, _ in matched_concepts]
embedding_concepts = [c for _, c in matched_concepts]

In [13]:
print("The amount of CAVs with a corresponding word embedding: ", len(embedding_idxs))

The amount of CAVs with a corresponding word embedding:  363


In [14]:
#extract the selected CAVs
cavs_with_embedding = cav_matrix[embedding_idxs]

In [15]:
cavs_with_embedding.shape

(363, 2048)

In [16]:
# Sanity check of the subsetting: every row of cavs_with_embedding must equal the CAV that
# is stored for its concept. When correct, this should NOT print anything
for ik, kk in enumerate(embedding_idxs):
    original_cav = cavs_broden[no_dups_concepts[kk]]['cav']
    subset_row = cavs_with_embedding[ik].reshape(1,-1)
    if not np.array_equal(original_cav, subset_row):
        print(kk)

Create matrix with CAVs which do not have a matching word embedding

In [17]:
# keep the rows of cav_matrix whose index is NOT listed in embedding_idxs
has_no_embedding = np.ones(cav_matrix.shape[0], dtype=bool)
has_no_embedding[embedding_idxs] = False
cav_no_glove = cav_matrix[has_no_embedding]

# concept names without a GloVe match, in their original order
cav_no_glove_concepts = []
for conc in broden_concepts:
    if conc not in embedding_concepts:
        cav_no_glove_concepts.append(conc)

In [18]:
# To check if the subsetting was done correctly, the cav_matrix is compared with the original cavs
# When correct, this should print "Matching vectors"; every mismatch is printed as (row, concept)
no_match = []
for iz, zz in enumerate(cav_no_glove_concepts):
    # The '-s' suffix was stripped from the concept names, so the CAV is looked up under the
    # suffixed key when that key exists and under the plain name otherwise. An explicit
    # membership test replaces the former bare `except:`, which hid every kind of error.
    original_key = zz + '-s' if zz + '-s' in cavs_broden else zz
    if not np.array_equal(cavs_broden[original_key]['cav'], cav_no_glove[iz].reshape(1,-1)):
        no_match.append([iz,zz])
        print(iz, zz)

if len(no_match) == 0:
    print('Matching vectors')

Matching vectors


Normalize the CAV matrices to norm-1

In [19]:
cavs_norm = normalize(cavs_with_embedding,axis=1)
cav_no_glove_norm = normalize(cav_no_glove, axis=1)

Add the matrices back together, the first 363 samples have a correspondence in the GloVe dataset, while the other part is used to better capture the structure of the CAV manifold

In [20]:
cavs_sorted = np.concatenate((cavs_norm, cav_no_glove_norm))
cavs_sorted_concept = embedding_concepts + cav_no_glove_concepts

In [21]:
print(cavs_sorted.shape[0] == len(cavs_sorted_concept))

True


### Include SoN image in the Manifold Alignment

In [22]:
# Read SoN info, where the ID equals the image name in the folder structure
son_info = pd.read_csv('../data/son_votes.csv', index_col = 0)

with open('../data/son_tensors.pickle', 'rb') as handle:
        son_tensors = pickle.load(handle)

From each score range (0-1, 1-2, ..., 9-10) 500 images are randomly sampled; a score range with fewer than 500 images is used in full

In [23]:
# Sample up to N_PER_SCORE_RANGE images from every score range (i, i+1]; a range with fewer
# images is used in full. A dedicated, seeded generator keeps the sample identical between
# runs without touching the global random state.
N_PER_SCORE_RANGE = 500
SON_SAMPLE_SEED = 42
son_rng = random.Random(SON_SAMPLE_SEED)

son_manifold_imgs = []
for i in range(10):
    son_subset_idxs = list(son_info.query('Average > %s & Average <= %s' % (str(i), str(i+1))).index)
    if len(son_subset_idxs) >= N_PER_SCORE_RANGE:
        son_random_idxs = son_rng.sample(son_subset_idxs, N_PER_SCORE_RANGE)
    else:
        son_random_idxs = son_subset_idxs
    son_manifold_imgs.extend(son_random_idxs)

In [24]:
len(son_manifold_imgs)

4798

Create a matrix of the tensors of the random selected images

In [25]:
# one row per sampled image, filled with the image tensor stored in son_tensors
son_manifold_matrix = np.zeros((len(son_manifold_imgs), 2048))
for row, df_ix in enumerate(son_manifold_imgs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # The key used in son_tensors is the dataframe index, raised by one from 52642 onwards
    # and by one more from 201047 onwards (the second threshold is tested on the raised value)
    # NOTE(review): presumably two entries are missing from son_tensors - confirm
    tensor_key = df_ix + 1 if df_ix >= 52642 else df_ix
    if tensor_key >= 201047:
        tensor_key += 1
    tensor_entry = son_tensors[str(tensor_key)]

    if expected_name == tensor_entry[0]:
        son_manifold_matrix[row] = tensor_entry[2].numpy()
    else:
        # the names disagree: report the row number, the row itself stays all zeros
        print(row)
    

Normalize the data to unit length

In [26]:
son_manifold_matrix = normalize(son_manifold_matrix, axis=1)

### Add SoN to CAVs

In [27]:
cavs_sorted_son = np.concatenate((cavs_sorted, son_manifold_matrix))

In [28]:
cavs_sorted_son.shape

(5447, 2048)

### Preprocess GloVe

Create a matrix of the word embeddings for the concepts which also have a CAV.

In [29]:
# one 300-dimensional GloVe vector per concept that also has a CAV, in the order of embedding_concepts
embeddings_with_cav = np.zeros((len(embedding_idxs), 300))
for row, concept in enumerate(embedding_concepts):
    embeddings_with_cav[row] = embedding_dict[concept].reshape(1,-1)

In [30]:
embeddings_with_cav.shape

(363, 300)

Create matrix with GloVe embeddings which do not have a matching CAV

In [31]:
# Collect the GloVe words (and their vectors) which do not have a matching CAV concept.
# A set gives constant-time membership tests; the list was scanned once per GloVe word before
cav_concept_set = set(embedding_concepts)

# Also store the GloVe concept names (which do not have a match in CAV)
glove_no_cav_concept = [concpt for concpt in embedding_dict.keys() if concpt not in cav_concept_set]

# The matrix is sized from the words actually kept, so it can neither overflow nor keep
# unused zero rows
glove_no_cav = np.zeros((len(glove_no_cav_concept), 300))
for idx, concpt in enumerate(glove_no_cav_concept):
    glove_no_cav[idx] = embedding_dict[concpt]

In [32]:
glove_no_cav.shape

(117964, 300)

Normalize all data to norm-1

In [33]:
glove_norm = normalize(embeddings_with_cav,axis=1)
glove_no_cav_norm = normalize(glove_no_cav, axis=1)

Find the nearest neighbors of the GloVe embeddings which do have correspondence with CAVs. These are used in the manifold alignment to preserve the structure of the GloVe data

Create a list of unique indices of the nearest neighbors of the GloVe embeddings which have a correspondence with a CAV

In [34]:
# takes about 3min to run
# For every GloVe vector with a CAV, take the 10 most cosine-similar vectors without a CAV.
# glove_neighbors keeps the unique row indices in order of first appearance; the set only
# speeds up the "already collected" test.
glove_neighbors = []
seen_neighbors = set()
for k in trange(embeddings_with_cav.shape[0]):
    glove_cosim = cosine_similarity(glove_no_cav_norm, glove_norm[k].reshape(1,-1))
    glove_cosim_ixs = glove_cosim.argsort(axis=0)[::-1][:10]
    for ixs in glove_cosim_ixs:
        neighbor_ix = ixs.item()
        if neighbor_ix not in seen_neighbors:
            seen_neighbors.add(neighbor_ix)
            glove_neighbors.append(neighbor_ix)


HBox(children=(IntProgress(value=0, max=363), HTML(value='')))




In [35]:
print('The number of unique neighours found:', len(glove_neighbors))

The number of unique neighours found: 2646


Extract the nearest neighbours from the data

In [36]:
glove_no_cav_nneigh = glove_no_cav_norm[glove_neighbors]
glove_no_cav_concept_nneigh = [glove_no_cav_concept[neigh] for neigh in glove_neighbors]

In [37]:
# Sanity check: the (unnormalised) row selected for every neighbour must equal the vector
# stored in embedding_dict under the neighbour's name. Nothing is printed when the data matches
for neighbor_concept, neighbor_row in zip(glove_no_cav_concept_nneigh, glove_neighbors):
    stored_vec = embedding_dict[neighbor_concept]
    selected_vec = glove_no_cav[neighbor_row].reshape(1,-1)
    if not np.array_equal(stored_vec, selected_vec):
        print('The data do not correspond for concept:', neighbor_concept)

Concatenate the GloVe data together; the first 363 rows have a correspondence in the CAV dataset. The other data is used to preserve the structure of the GloVe manifold during the alignment

In [38]:
glove_sorted = np.concatenate((glove_norm, glove_no_cav_nneigh))
glove_sorted_concept = embedding_concepts.copy()
glove_sorted_concept.extend(glove_no_cav_concept_nneigh)

## Validate the sorted datasets

In [39]:
# Validate glove_sorted: every row must equal the normalised row of the same word in the
# full GloVe matrix. Nothing is printed when the data matches
glove_embedding_matrix_norm = normalize(glove_embedding_matrix)

# word -> row index, built once instead of a linear glove_words.index() scan per concept;
# setdefault keeps the first occurrence of a word, exactly like list.index
glove_row_of_word = {}
for row_ix, word in enumerate(glove_words):
    glove_row_of_word.setdefault(word, row_ix)

for ip, p in enumerate(glove_sorted_concept):
    cix = glove_row_of_word[p]
    origin_vec= glove_embedding_matrix_norm[cix].reshape(1,-1)
    if not np.array_equal(origin_vec, glove_sorted[ip].reshape(1,-1)):
        print(p) 

---

In [40]:
cavs_sorted_t = cavs_sorted_son.T
glove_sorted_t = glove_sorted.T

In [41]:
%store cavs_sorted_t
%store glove_sorted_t

Stored 'cavs_sorted_t' (ndarray)
Stored 'glove_sorted_t' (ndarray)


In [42]:
print('Amount of CAVs used:', cavs_sorted_son.shape[0])
print('Amount of GloVes used:', glove_sorted.shape[0])

Amount of CAVs used: 5447
Amount of GloVes used: 3009


---

In [43]:
def FindNeighbours(A, n_neighbours, B=None, include_self=False):
    """
    Mark, for every query sample, its most cosine-similar samples in A.

    A: matrix (n_features, n_samples_A), the samples the neighbours are picked from
    n_neighbours: number of neighbours (int)
    B: matrix (n_features, n_samples_B) with the query samples; when None the samples
       of A are compared with each other
    include_self: when False (default) the best-ranked sample of every query is skipped,
       which for B=None is normally the query sample itself.
       NOTE(review): the best match is skipped for a separate B as well; pass
       include_self=True when that is not intended.

    Returns:
    neighbor_matrix (n_samples_A x n_samples_A) if B=None, otherwise (n_samples_A x n_samples_B).
    Entry [j, i] is 1 when sample j of A is a selected neighbour of query sample i, else 0.
    """

    # `is None` replaces the comparison of type(B) with numpy.ndarray: that comparison needed
    # the name `numpy` to be bound and silently ignored any B that was not a plain ndarray
    queries = A if B is None else np.asarray(B)
    cosine_sim = cosine_similarity(A.T, queries.T)

    neighbor_matrix = np.zeros(cosine_sim.shape)

    # position in the ranking at which the selection starts
    first = 0 if include_self else 1
    for i in range(cosine_sim.shape[1]):
        # sample indices of A, ordered from most to least similar to query sample i
        ranking = cosine_sim[:,i].argsort(axis=0)[::-1].flatten()
        neighbor_matrix[ranking[first:first + n_neighbours], i] = 1

    return neighbor_matrix
    

---

## Manifold alignment using Linear Kernel

To perform manifold alignment, the MATLAB code from Devis Tuia is followed: https://github.com/dtuia/KEMA/blob/master/general_routine/KMA.m

In [44]:
# x1, x2, eigenvectors, eigenvalues = kernel_manifold_alignment(cavs_sorted_t, glove_sorted_t, mu = 0.9,
#                                                               lanbda = 0.5, n_neighbors = 10, n_eigs = 2, 
#                                                               n_correspondence = 363)

---

**Determine the nearest neighbours of the CAVs in the CAV domain**

In [45]:
cav_nn = FindNeighbours(cavs_sorted_t, 10)

In [46]:
# for i, h in enumerate(cavs_sorted_concept):
#     print(h)
#     print('-----')
#     for j in np.nditer(np.where(cav_nn[:,i] ==1)[0]):
#         try:
#             print(cavs_sorted_concept[j])
#         except:
#             IndexError
#     print('\n')

**Reduce the dimensionality of the CAVs using PCA. After dimensionality reduction, compare the nearest neighbours of the reduced CAVs with the original CAVs**

In [47]:
pca = PCA(n_components=100)
X1_pca = pca.fit_transform(cavs_sorted_t.T)

In [48]:
print(np.sum(pca.explained_variance_ratio_))

0.8534660000778961


In [49]:
pca_nn = FindNeighbours(X1_pca.T, 10)

In [50]:
# Share of the nearest neighbours of every sample that is still a nearest neighbour after
# the PCA reduction, averaged over all samples. Both neighbour matrices are binary, so the
# overlap per sample is the column-wise count of the entries that are 1 in both; this
# replaces the nested Python loops that called np.where for every single neighbour.
n_neighbours = 10  # must equal the n_neighbours passed to FindNeighbours for cav_nn and pca_nn
shared_nn = np.logical_and(pca_nn == 1, cav_nn == 1).sum(axis=0)
nn_cav_accuracy = float(np.mean(shared_nn / n_neighbours))
print("Accuracy of PCA reduced neighbouring CAVs:", round(nn_cav_accuracy*100, 2),'%')

HBox(children=(IntProgress(value=0, max=5447), HTML(value='')))


Accuracy of PCA reduced neighbouring CAVs: 72.3 %


## Manifold Alignment Wang '11

In [51]:
%pdb

Automatic pdb calling has been turned ON


In [52]:
%%time
# The sample counts are derived from the data instead of being typed in by hand, so they
# stay correct when the concept selection changes.
z1, z2, eigenvectors, eigenvalues, mean1, std1 = manifold_alignment_wang(
    X1_pca.T, glove_sorted_t, mu = 0.9, lanbda = 0.5, n_neighbors = 10, n_eigs = 400,
    # the concepts with a correspondence form the first rows of both datasets
    n_correspondence = len(embedding_concepts),
    # the CAV rows come first in the CAV dataset, the SoN image rows follow them
    n_cavs = len(cavs_sorted_concept))

Shape of data1: (100, 5447)
Shape of data2: (300, 3009)
Computing neighbors dataset 1


HBox(children=(IntProgress(value=0, max=5447), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5447), HTML(value='')))


Computing neighbors dataset 2


HBox(children=(IntProgress(value=0, max=3009), HTML(value='')))


Building Laplacians
Solving generalized eigenvalue decomposition


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


Rotating axis if needed


HBox(children=(IntProgress(value=0, max=400), HTML(value='')))


CPU times: user 22min 40s, sys: 1h 1min 7s, total: 1h 23min 48s
Wall time: 3min 23s


---

## Accuracy of alignment

### CAV domain to CAV common space comparison

In [53]:
cav_cav = FindNeighbours(z1, 10)

In [54]:
# Share of the nearest neighbours in the original CAV domain that is still a nearest
# neighbour in the common space, averaged over all samples. Both neighbour matrices are
# binary, so the overlap per sample is the column-wise count of the entries that are 1 in
# both; this replaces the nested Python loops that called np.where for every neighbour.
n_neighbours = 10  # must equal the n_neighbours passed to FindNeighbours for cav_nn and cav_cav
shared_nn = np.logical_and(cav_cav == 1, cav_nn == 1).sum(axis=0)
cav_cav_accuracy = float(np.mean(shared_nn / n_neighbours))
print("Accuracy of common space neighbouring CAVs compared to original data:", round(cav_cav_accuracy*100, 2),'%')


# for b, n in enumerate(cavs_sorted_concept):
#     print(n)
#     print('-----')
#     for j in np.nditer(np.where(cav_cav[:,b] ==1)[0]):
#         try:
#             print(cavs_sorted_concept[j])
#         except:
#             IndexError
#     print('\n')

HBox(children=(IntProgress(value=0, max=5447), HTML(value='')))


Accuracy of common space neighbouring CAVs compared to original data: 50.83 %


### GloVe domain to GloVe common space comparison

In [55]:
glove_nn = FindNeighbours(glove_sorted_t, 10)
common_glove_nn = FindNeighbours(z2, 10)

In [56]:
# Share of the nearest neighbours in the original GloVe domain that is still a nearest
# neighbour in the common space, averaged over all samples. Both neighbour matrices are
# binary, so the overlap per sample is the column-wise count of the entries that are 1 in
# both; this replaces the nested Python loops that called np.where for every neighbour.
n_neighbours = 10  # must equal the n_neighbours passed to FindNeighbours for glove_nn and common_glove_nn
shared_nn = np.logical_and(common_glove_nn == 1, glove_nn == 1).sum(axis=0)
glove_glove_accuracy = float(np.mean(shared_nn / n_neighbours))
print("Accuracy of common space neighbouring GloVe's compared to original data:", round(glove_glove_accuracy*100, 2),'%')

Accuracy of common space neighbouring GloVe's compared to original data: 57.2 %


### View neighbouring GloVe concepts to the CAVs

In [57]:
cav_glove = FindNeighbours(z2, 10, z1)

In [58]:
# print every CAV concept followed by the GloVe concepts marked as its neighbours in cav_glove
for column, cav_concept in enumerate(cavs_sorted_concept):
    print(cav_concept)
    print('------')
    neighbour_rows = np.where(cav_glove[:,column] == 1)[0]
    for glove_row in np.nditer(neighbour_rows):
        print(glove_sorted_concept[glove_row])
    print('\n')

sky
------
badlands
skies
bright
starry
football
ravine
hillside
hockey
carriage
glacier


tree
------
bird
fruit
trees
eucalyptus
plants
flowering
birds
shade
mango
donor


building
------
embankment
buildings
construction
window
40-story
12-story
painting
crucifix
architectural
vaulting


person
------
torso
stretcher
man
woman
persons
buttocks
blond
rickshaw
all-around
athlete


head
------
hair
muzzle
coach
heads
assistant
ears
hound
coaches
football
goat


leg
------
arm
ear
paw
waterfall
legs
thigh
hips
wig
feet
dangling


torso
------
arm
neck
dog
fur
legs
thigh
limbs
hips
mouth
shoulders


table
------
napkin
bouquet
tables
dining
sat
casserole
ranking
chandeliers
gilded
candlesticks


arm
------
leg
torso
hair
leather
skin
hand
legs
thigh
wrist
hands


road
------
car
wheel
bicycle
highway
dashboard
roads
vehicle
steering
bicycles
widget


ear
------
neck
muzzle
dog
cat
cow
ears
mouth
hind
animal
pig


grass
------
ear
horse
pasture
garden
dirt
turf
beds
thoroughbred
terracing

counter
door
bells
taco
smith
cook
turret
ringing
caliber
taxiing


exhibitor
------
restaurant
toyshop
delicatessen
potable
bikes
restaurants
trackers
store
square
eatery


jersey
------
apparel
shirt
jacket
pants
clothing
blouse
dresses
shirts
jeans
sweater


forecourt
------
castle
plaza
campus
palace
courtyard
presidential
statue
station
apse
hotel


eiderdown
------
bed
lid
eye
beds
sleep
breeders
83-58
ridging
85-69
cream


bridge
------
river
viaduct
arch
dam
bridges
pontoon
span
dams
downstream
waterway


classroom
------
auditorium
kindergarten
elementary
classrooms
teachers
teaching
teacher
students
curriculum
gymnasium


river
------
water
creek
overlooking
flows
rivers
tributary
confluence
tributaries
hydroelectric
empties


bread
------
oven
patty
pantry
bake
dish
loaf
loaves
cake
crumbs
breads


cliff
------
rock
mountain
tower
mill
chimney
ledge
cliffs
limestone
sandstone
cavern


escalator
------
stairs
roof
apparel
embankment
staircase
railings
escalators
elevator
hand

plaything
sheep
stretcher
rubbish
room
upstairs
telephones
goats
bandaged
documentaries


shower stall
------
bottle
sink
bowl
bathroom
kitchen
lid
shower
tub
drain
restroom


swimming pool
------
bathroom
bathtub
handbag
pitch
ocean
restaurant
sail
call
drowned
floorboard


sales booth
------
canopy
fruit
nursery
nunnery
freezer
daycare
merchandising
grocery
pallets
supermarkets


dorm_room
------
bed
cockpit
organ
drawers
alligator
folder
cupboards
mad
closets
machinery


garage-indoor
------
mountain
reception
axle
cycling
laundry
tour
grimy
profits
filth
receptions


hot tub
------
court
mill
flood
drain
chef
rooftop
drowned
decks
eatery
grist


garage door
------
stairs
facility
rooftop
plastered
passenger
pilot
crate
poolside
outcrop
consulate


conveyer belt
------
stairs
canopy
platform
embankment
escalator
carport
windscreen
escalators
platforms
collection


forest-needleleaf
------
crosswalk
parlor
eiderdown
aqueduct
viaduct
fire
passageway
mantel
domes
spiffier


bakery-shop

stairs
bathroom
fountain
granite
statue
marble
flush
kiosk
dingy
bas-relief


library-outdoor
------
carpet
armchair
palm
windows
rubber
blue
front
upholstery
frond
burning


inflatable bounce game
------
arm
ear
cushion
playground
elbow
market
paned
quarter
breakwater
toys


bowling alley
------
microwave
case
bread
rifle
cooks
canopies
freezer
clothing
toaster
rifles


big top
------
tent
reception
hut
elephant
rooms
animal
tighter
wildlife
cottages
inn


fog bank
------
bookcase
dam
rented
mountains
gorge
canvases
storeroom
flatbed
slump
near


table cloth
------
plate
altar
bread
monument
stew
candles
wreaths
easels
tee
bee


hand cart
------
bedroom
bicycle
bus
stalls
flying
ride
taxi
passenger
bicyclist
carriage


liquor_store-indoor
------
bottle
shelf
barrel
delicatessen
injury
pop
manager
shop
store
drugstore


imaret
------
tower
monument
ruin
darkness
dish
cupola
erected
dishes
mausoleum
prayers


dirt_track
------
road
track
trench
driving
mud
shafts
dungeon
dig
gases
barns

---

## Explore new concepts related to scenicness

Select the most scenic images (average score above 7) and the least scenic images (average score of at most 2) in the ScenicOrNot dataset

In [109]:
# order the images from the highest to the lowest average score
sorted_son_info = son_info.sort_values(by='Average', ascending=False)

# indices of the most scenic (average above 7) and the least scenic (average of at most 2) images
is_most_scenic = sorted_son_info['Average'] > 7
is_least_scenic = sorted_son_info['Average'] <= 2
most_scenic_ixs = np.asarray(sorted_son_info.loc[is_most_scenic].index)
least_scenic_ixs = np.asarray(sorted_son_info.loc[is_least_scenic].index)

In [110]:
# one row per most-scenic image, filled with the image tensor stored in son_tensors
most_scenic_matrix = np.zeros((len(most_scenic_ixs), 2048))
for row, df_ix in enumerate(most_scenic_ixs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # The key used in son_tensors is the dataframe index, raised by one from 52642 onwards
    # and by one more from 201047 onwards (the second threshold is tested on the raised value)
    # NOTE(review): presumably two entries are missing from son_tensors - confirm
    tensor_key = df_ix + 1 if df_ix >= 52642 else df_ix
    if tensor_key >= 201047:
        tensor_key += 1
    tensor_entry = son_tensors[str(tensor_key)]

    if expected_name == tensor_entry[0]:
        most_scenic_matrix[row] = tensor_entry[2].numpy()
    else:
        # the names disagree: report the row number, the row itself stays all zeros
        print(row)

In [111]:
# one row per least-scenic image, filled with the image tensor stored in son_tensors
least_scenic_matrix = np.zeros((len(least_scenic_ixs), 2048))
for row, df_ix in enumerate(least_scenic_ixs):
    expected_name = str(son_info.loc[df_ix, 'ID'])

    # The key used in son_tensors is the dataframe index, raised by one from 52642 onwards
    # and by one more from 201047 onwards (the second threshold is tested on the raised value)
    # NOTE(review): presumably two entries are missing from son_tensors - confirm
    tensor_key = df_ix + 1 if df_ix >= 52642 else df_ix
    if tensor_key >= 201047:
        tensor_key += 1
    tensor_entry = son_tensors[str(tensor_key)]

    if expected_name == tensor_entry[0]:
        least_scenic_matrix[row] = tensor_entry[2].numpy()
    else:
        # the names disagree: report the row number, the row itself stays all zeros
        print(row)

In [112]:
most_scenic_matrix = normalize(most_scenic_matrix)
least_scenic_matrix = normalize(least_scenic_matrix)

### Check for CAV concepts in the CAV domain for most scenic images

In [64]:
# for every most-scenic image, record the 10 CAV concepts with the highest cosine similarity
scenic_cav_concepts = []
for im in trange(most_scenic_matrix.shape[0]):
    img_vec = most_scenic_matrix[im].reshape(1,-1)
    cav_similarity = cosine_similarity(cavs_sorted, img_vec).flatten()
    nearest_cavs = cav_similarity.argsort()[::-1][:10]  # rows of the 10 most similar CAVs
    scenic_cav_concepts.extend(cavs_sorted_concept[cav_ix] for cav_ix in nearest_cavs.tolist())

HBox(children=(IntProgress(value=0, max=12384), HTML(value='')))




In [65]:
# Frequency of every CAV concept among the neighbours of the most scenic images.
# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs.
scenic_cav_counts = Counter(scenic_cav_concepts)
unique_scenic_cav_concepts = list(set(scenic_cav_concepts))
cav_concepts_dict = {'Concepts' : cavs_sorted_concept,
                    'Frequency': [scenic_cav_counts[x] for x in cavs_sorted_concept]
                    }

cav_concepts_df = pd.DataFrame.from_dict(cav_concepts_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
cav_concepts_df = cav_concepts_df.sort_values(by='Frequency', ascending=False)

In [66]:
cav_concepts_df.head(10)

Unnamed: 0,Concepts,Frequency
312,canyon,6627
213,valley,6617
28,earth,6538
353,fjord,6503
81,bush,5247
82,field,4978
38,rock,4776
201,cliff,4355
0,sky,4232
309,moor,3910


### Transform the images to the common space

In [114]:
most_scenic_matrix_pca = pca.transform(most_scenic_matrix)
least_scenic_matrix_pca = pca.transform(least_scenic_matrix)

In [115]:
most_scenic_matrix_t = most_scenic_matrix_pca.T
least_scenic_matrix_t = least_scenic_matrix_pca.T

Transform the images to the common feature space

In [116]:
# Split the stacked eigenvectors by rows: the first n_features_cav rows belong to the
# (PCA reduced) first dataset, all remaining rows presumably to the GloVe dataset - verify
# against manifold_alignment_wang.
n_features_cav = X1_pca.T.shape[0]
E1 = eigenvectors[:n_features_cav, :]
# slice from n_features_cav onwards; without the ':' only a single row was selected
E2 = eigenvectors[n_features_cav:, :]

In [117]:
scenic_imgs_transform = np.matmul(E1.T, most_scenic_matrix_t)
unscenic_imgs_transform = np.matmul(E1.T, least_scenic_matrix_t)

In [118]:
# Standardise the projected images with the mean and std of the training data (mean1, std1)

# number of most / least scenic images
T3 = most_scenic_matrix_t.shape[1]
T4 = least_scenic_matrix_t.shape[1]

# per-dimension statistics of the projected images themselves (not used for the scaling below)
m3 = scenic_imgs_transform.T.mean(axis=0)
m4 = unscenic_imgs_transform.T.mean(axis=0)
s3 = scenic_imgs_transform.T.std(axis=0)
s4 = unscenic_imgs_transform.T.std(axis=0)

# mean1 and std1 are broadcast over the image rows, which matches repeating them once per image
scenic_imgs = ((scenic_imgs_transform.T - mean1) / std1).T

unscenic_imgs = ((unscenic_imgs_transform.T - mean1) / std1).T

### Check for CAV concepts in the common space for the most scenic images

In [72]:
# For every most-scenic image, record the 10 most cosine-similar CAV concepts in the common space.
# Only the rows of z1.T that belong to a named CAV concept are compared; their number is
# taken from cavs_sorted_concept instead of the hard-coded 649.
new_cav_concept_commonspace = []
cav_common = z1.T[:len(cavs_sorted_concept)]
for i in trange(scenic_imgs.shape[1]):
    sc_idxs = cosine_similarity(cav_common, scenic_imgs.T[i].reshape(1,-1)).flatten()
    sc_idxs = sc_idxs.argsort()[::-1][:10]
    for ix in np.nditer(sc_idxs):
        cav_commonspace_concept = cavs_sorted_concept[ix.item()]
        new_cav_concept_commonspace.append(cav_commonspace_concept)


HBox(children=(IntProgress(value=0, max=12384), HTML(value='')))




In [73]:
# Frequency of every CAV concept among the common-space neighbours of the most scenic images.
# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs.
new_cav_concept_counts = Counter(new_cav_concept_commonspace)
unique_new_cav_concepts = list(set(new_cav_concept_commonspace))
cav_commonspace_dict = {'Concept': cavs_sorted_concept,
                       'Frequency': [new_cav_concept_counts[o] for o in cavs_sorted_concept]}

cav_commonspace_df = pd.DataFrame.from_dict(cav_commonspace_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
cav_commonspace_df = cav_commonspace_df.sort_values(by='Frequency', ascending=False)

In [74]:
cav_commonspace_df.head(10)

Unnamed: 0,Concept,Frequency
312,canyon,8379
201,cliff,6109
213,valley,6106
247,ocean,5118
648,mountain pass,4874
222,hill,4350
70,sea,3986
292,snowfield,3676
295,islet,3654
158,coast,3309


### Check for new concepts in the Glove data in the common space for the most scenic images

In [75]:
# For every most-scenic image, record its 10 most cosine-similar GloVe concepts in the common
# space. concept_imgs collects the positions of the images that have the inspected concept
# among those neighbours.
concept_imgs = []
new_glove_concept_commonspace = []
# looked up once: list.index is a linear scan and was repeated for every single neighbour
inspected_concept_ix = glove_sorted_concept.index('goats')
for i in trange(len(most_scenic_ixs)):
    sc_idxs = cosine_similarity(z2.T, scenic_imgs.T[i].reshape(1,-1))
    sc_idxs = sc_idxs.argsort(axis=0)[::-1][:10]
    for ix in np.nditer(sc_idxs):
        glove_commonspace_concept = glove_sorted_concept[ix.item()]
        new_glove_concept_commonspace.append(glove_commonspace_concept)
        if ix.item() == inspected_concept_ix:
            concept_imgs.append(i)

HBox(children=(IntProgress(value=0, max=12384), HTML(value='')))




In [76]:
# Frequency of every GloVe concept among the common-space neighbours of the most scenic images.
# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs.
new_glove_concept_counts = Counter(new_glove_concept_commonspace)
glove_commonspace_dict = {'Concept': glove_sorted_concept,
                       'Frequency': [new_glove_concept_counts[u] for u in glove_sorted_concept]}

glove_commonspace_df = pd.DataFrame.from_dict(glove_commonspace_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
glove_commonspace_df = glove_commonspace_df.sort_values(by='Frequency', ascending=False)

In [77]:
glove_commonspace_df.head(10)

Unnamed: 0,Concept,Frequency
1615,mountains,3422
1618,foothills,2856
2526,snow-covered,2481
967,seas,2421
2235,rugged,2169
650,waters,2087
2912,prickly,2082
247,ocean,1663
2176,islands,1659
2950,cove,1565


### Plot images of certain concept

In [None]:
import glob
def getPath(img_name, root='/raid/data/datasets/SoN/images'):
    """
    Return the path of the first file named `<img_name>.jpg` found below `root`.

    img_name: file name of the image without the '.jpg' extension (str)
    root: directory whose tree is searched; defaults to the SoN image folder

    Returns:
    path of the first match, in os.walk order (str)

    Raises:
    FileNotFoundError when no directory below root holds the image (previously an
    IndexError without any message)
    """
    for directory, _ , _ in os.walk(root):
        matches = glob.glob(os.path.join(directory, img_name + '.jpg'))
        if matches:
            # stop at the first hit instead of walking the rest of the tree
            return matches[0]

    raise FileNotFoundError("No image '%s.jpg' found below %s" % (img_name, root))

In [None]:
concept_imgs_id = list(son_info.loc[most_scenic_ixs[concept_imgs],'ID'])

In [None]:
for i in tqdm(concept_imgs_id[:50]):
    path = getPath(str(i))
    img = plt.imread(path)
    plt.grid(False)
    plt.imshow(img)
    plt.show()

### Check for GloVe concepts which do not have a CAV

In [95]:
# For every most-scenic image, record its 10 most cosine-similar GloVe concepts in the common
# space, considering only the GloVe concepts WITHOUT a CAV. Those are the rows of z2.T after
# the first len(embedding_concepts) rows, so the offset is added back to obtain the concept name.
n_with_cav = len(embedding_concepts)
glove_without_cav = z2.T[n_with_cav:,:]  # hoisted: the slice is identical for every image

glove_c_common = []
for i in trange(len(most_scenic_ixs)):
    sc_idxs = cosine_similarity(glove_without_cav, scenic_imgs.T[i].reshape(1,-1))
    sc_idxs = sc_idxs.argsort(axis=0)[::-1][:10]
    for ix in np.nditer(sc_idxs):
        glove_commonspace_concept = glove_sorted_concept[ix.item() + n_with_cav]
        glove_c_common.append(glove_commonspace_concept)

# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs
glove_c_counts = Counter(glove_c_common)
glove_c_dict = {'Concept': glove_sorted_concept,
                    'Frequency': [glove_c_counts[u] for u in glove_sorted_concept]}

glove_c_df = pd.DataFrame.from_dict(glove_c_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
glove_c_df = glove_c_df.sort_values(by='Frequency', ascending=False)

HBox(children=(IntProgress(value=0, max=12384), HTML(value='')))




In [96]:
glove_c_df.head(10)

Unnamed: 0,Concept,Frequency
1615,mountains,3742
1618,foothills,3117
967,seas,2819
2526,snow-covered,2766
650,waters,2494
2235,rugged,2438
2912,prickly,2327
2176,islands,2022
2950,cove,1834
970,shores,1641


---

### Check the similar CAVs to the least scenic images in the CAV domain

In [119]:
# for every least-scenic image, record the 10 CAV concepts with the highest cosine similarity
unscenic_cav_concepts = []
for im in trange(least_scenic_matrix.shape[0]):
    img_vec = least_scenic_matrix[im].reshape(1,-1)
    cav_similarity = cosine_similarity(cavs_sorted, img_vec).ravel()
    nearest_cavs = cav_similarity.argsort()[::-1][:10]  # rows of the 10 most similar CAVs
    unscenic_cav_concepts.extend(cavs_sorted_concept[cav_ix] for cav_ix in nearest_cavs.tolist())

HBox(children=(IntProgress(value=0, max=15077), HTML(value='')))




In [120]:
# Frequency of every CAV concept among the neighbours of the least scenic images.
# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs.
unscenic_cav_counts = Counter(unscenic_cav_concepts)
unique_unscenic_cav_concepts = list(set(unscenic_cav_concepts))
uncav_concepts_dict = {'Concepts' : cavs_sorted_concept,
                    'Frequency': [unscenic_cav_counts[x] for x in cavs_sorted_concept]
                    }

uncav_concepts_df = pd.DataFrame.from_dict(uncav_concepts_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
uncav_concepts_df = uncav_concepts_df.sort_values(by='Frequency', ascending=False)

In [121]:
uncav_concepts_df.head(10)

Unnamed: 0,Concepts,Frequency
2,building,12635
0,sky,10015
1,tree,9679
9,road,7661
17,sidewalk,7255
23,street,5961
187,roundabout,5755
14,car,5612
81,bush,3966
60,roof,3906


### Check for CAV concepts in the common domain for the least scenic images

In [122]:
# For every least-scenic image, record the 10 most cosine-similar CAV concepts in the common space.
# Only the rows of z1.T that belong to a named CAV concept are compared; their number is
# taken from cavs_sorted_concept instead of the hard-coded 649.
unscenic_cavs_concept_common = []
cav_common = z1.T[:len(cavs_sorted_concept)]
for i in trange(unscenic_imgs.shape[1]):
    unsc_idxs = cosine_similarity(cav_common, unscenic_imgs.T[i].reshape(1,-1)).flatten()
    unsc_idxs = unsc_idxs.argsort()[::-1][:10]
    for ix in np.nditer(unsc_idxs):
        unscenic_cavs_concept_common.append(cavs_sorted_concept[ix.item()])

HBox(children=(IntProgress(value=0, max=15077), HTML(value='')))




In [123]:
# Frequency of every CAV concept among the common-space neighbours of the least scenic images.
# Counter tallies the list in a single pass (list.count rescanned it for every concept)
# and returns 0 for a concept that never occurs.
unscenic_cavs_common_counts = Counter(unscenic_cavs_concept_common)
unscenic_cavs_common_dict = {'Concepts' : cavs_sorted_concept,
                            'Frequency' : [unscenic_cavs_common_counts[c] for c in cavs_sorted_concept]}

unscenic_cavs_common_df = pd.DataFrame.from_dict(unscenic_cavs_common_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
unscenic_cavs_common_df = unscenic_cavs_common_df.sort_values(by='Frequency', ascending=False)

In [124]:
unscenic_cavs_common_df.head(10)

Unnamed: 0,Concepts,Frequency
23,street,8354
9,road,6743
460,access_road,6224
17,sidewalk,5086
109,highway,4150
381,building_facade,3712
117,crosswalk,3682
516,forest_road,3269
2,building,2608
273,mansion,2591


### Check for GloVe concepts in the common domain for the least scenic images

In [125]:
# for every least-scenic image, record its 10 most cosine-similar GloVe concepts in the common space
new_glove_unscenic_concept_commonspace = []
for img_no in trange(len(least_scenic_ixs)):
    img_vec = unscenic_imgs.T[img_no].reshape(1,-1)
    glove_similarity = cosine_similarity(z2.T, img_vec).ravel()
    nearest_gloves = glove_similarity.argsort()[::-1][:10]  # rows of the 10 most similar GloVe vectors
    new_glove_unscenic_concept_commonspace.extend(
        glove_sorted_concept[glove_ix] for glove_ix in nearest_gloves.tolist())

HBox(children=(IntProgress(value=0, max=15077), HTML(value='')))




In [126]:
# Frequency of every GloVe concept that occurs among the common-space neighbours of the
# least scenic images. Counter tallies the list in a single pass (list.count rescanned it
# for every unique concept).
unscenic_glove_counts = Counter(new_glove_unscenic_concept_commonspace)
unique_unscenic_glove_concepts = list(set(new_glove_unscenic_concept_commonspace))
unglove_commonspace_dict = {'Concept': unique_unscenic_glove_concepts,
                       'Frequency': [unscenic_glove_counts[u] for u in unique_unscenic_glove_concepts]}

unglove_commonspace_df = pd.DataFrame.from_dict(unglove_commonspace_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
unglove_commonspace_df = unglove_commonspace_df.sort_values(by='Frequency', ascending=False)

In [127]:
unglove_commonspace_df.head(10)

Unnamed: 0,Concept,Frequency
848,highways,3176
1243,intersections,3166
2255,street,2362
821,outbuildings,2179
1100,runways,2171
9,railroads,2031
652,trains,1878
1000,brick,1845
1189,roundabouts,1791
2363,houses,1616


### Check for GloVe concepts which do not have a CAV

In [128]:
# For every least-scenic image, record its 10 most cosine-similar GloVe concepts in the common
# space, considering only the rows of z2.T after the first len(embedding_concepts) rows
# (the GloVe concepts without a CAV); the offset is added back to obtain the concept name
new_glove_unscenic_concept_commonspace = []
for img_no in trange(len(least_scenic_ixs)):
    img_vec = unscenic_imgs.T[img_no].reshape(1,-1)
    glove_similarity = cosine_similarity(z2.T[len(embedding_concepts):,:], img_vec).ravel()
    nearest_gloves = glove_similarity.argsort()[::-1][:10]
    new_glove_unscenic_concept_commonspace.extend(
        glove_sorted_concept[glove_ix + len(embedding_concepts)] for glove_ix in nearest_gloves.tolist())

HBox(children=(IntProgress(value=0, max=15077), HTML(value='')))




In [129]:
# Frequency of every GloVe concept without a CAV that occurs among the common-space
# neighbours of the least scenic images. Counter tallies the list in a single pass
# (list.count rescanned it for every unique concept).
unscenic_glove_counts = Counter(new_glove_unscenic_concept_commonspace)
unique_unscenic_glove_concepts = list(set(new_glove_unscenic_concept_commonspace))
unglove_commonspace_dict = {'Concept': unique_unscenic_glove_concepts,
                       'Frequency': [unscenic_glove_counts[u] for u in unique_unscenic_glove_concepts]}

unglove_commonspace_df = pd.DataFrame.from_dict(unglove_commonspace_dict)
# reassign instead of inplace=True so re-running the cell is side-effect free
unglove_commonspace_df = unglove_commonspace_df.sort_values(by='Frequency', ascending=False)

In [132]:
unglove_commonspace_df.head(10)

Unnamed: 0,Concept,Frequency
774,highways,3375
1136,intersections,3359
1006,runways,2346
751,outbuildings,2321
6,railroads,2203
918,brick,2047
1087,roundabouts,2042
594,trains,2027
2149,houses,1803
1783,mansions,1731


## Extract scenic GloVe concepts which are not in the unscenic GloVe concepts

In [156]:
threshold_freq = 500
scenic_glove_concept = glove_c_df.loc[glove_c_df['Frequency'] > threshold_freq, :]
unscenic_glove_concept = unglove_commonspace_df.loc[unglove_commonspace_df['Frequency'] > threshold_freq, :]

In [157]:
unique_scenic_concept = scenic_glove_concept[~scenic_glove_concept.Concept.isin(unscenic_glove_concept.Concept)]

In [159]:
unique_scenic_concept.head(10)

Unnamed: 0,Concept,Frequency
1615,mountains,3742
1618,foothills,3117
967,seas,2819
2526,snow-covered,2766
650,waters,2494
2235,rugged,2438
2912,prickly,2327
2176,islands,2022
2950,cove,1834
970,shores,1641


In [160]:
print(len(unique_scenic_concept))

50
