In [1]:
%pwd

'D:\\soundofai2\\rg_text_to_sound\\playground\\beat_toedtli\\word2word_evaluation'

# Evaluation of word-to-words Matchers
We are matching qualities to a fixed set of 18 keywords. How should the matching be done, and which matching is best?
Here we will: 

- choose a word embedding technique and a metric
- assemble a few qualities
- match the qualities to the keywords

In [2]:
import spacy
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
nlp = spacy.load("en_core_web_sm")

In [3]:
# Quality descriptors that are matched against `keywords` below.
# The list mixes single words with multi-word phrases and contains misspelled
# entries (e.g. 'distored', 'agressive', 'reeverb delay').
qualities=np.array([ 'delay', 'stretched', 'humming', 'vintage verb', 'less sharp', 'depressing', 'reverse', 'marimba', 
           'warm', 'chilling', 'dark', 'creepy', 'mushy', 'high energy', 'melodic', 'dreary', 'reverbed', 
           'groovy', 'chorused', 'grungy', 'rounder', 'richer', 'clean', 'negative feelings', 'quick delay', 
           'monotone', 'twangy', 'dreamy', 'light reverb', 'nasal', 'purer', 'cleaner', 'distorted', 'less noisy',
           'alot of reverb', 'heavy bass', 'tape saturation', 'distortions', 'distored', 'evil', 'melancholic',
           'heavily distorted', 'reversed', 'dolce', 'soft', 'emotional', 'cantando', 'impressive bass', 
           'reverbs', 'brassy', 'too loud',  'juicy bass', 'sharp', 'chiming', 'nice', 'sustained', 
           'tension', 'softer', 'nice round', 'grungey distortion', 'distorted bass', 'bassy', 
           'overly aggressive', 'ambient', 'slow', 'other-worldly', 'clear', 'hard', 'bright', 
           'percussive', 'glitchy', 'melancholy', 'loud', 'fuller', 'futuristic', 'light distortion', 
           'pitch-shifting', 'metallic', 'agressive', 'pitch shift', 'dull', 'reeverb delay', 'slap-back',
           'wrongness', 'higher', 'heavy', 'calm', 'distortion', 'brighter', 'scary', 'tasto', 'mellow', 
           'increased sustain', 'delays', 'flute', 'slap back delay', 'light', 'less distortion', 'noise',
           'sad', 'bouncy', 'depressed', 'deep', 'dissonant', 'choppiness', 'reverb', 'pitch shifting', 'trippy',
           'fast', 'pont', 'tropical', 'bowed', 'weird', 'tremolo', 'dry', 'chorus effect', 'moody', 
           'less aggressive', 'compression', 'weedy', 'horror', 'trappy', 'tinny', 'aggressive', 'atmospheric', 
           'light chorus', 'tube saturation'
])
# The target keywords every quality is ranked against.
keywords = np.array(['bright', 'dark', 'full', 'hollow', 'smooth', 'rough', 'warm', 'metallic', 'clear', 'muddy', 'thin', 'thick', 'pure', 'noisy', 'rich', 'sparse', 'soft', 'hard'])
# Guard against accidentally adding or removing a keyword: exactly 18 are expected.
assert len(keywords)==18

In [4]:
np.unique([len(nlp(str(quality)).vector.tolist() ) for quality in qualities])

array([96])

In [5]:

# Embed every quality / keyword with spaCy (`Doc.vector`) and stack the vectors as
# columns, i.e. each array has shape (vector_dim, number_of_items) after the `.T`.
# Entries whose vector is empty are dropped; if that ever happens, column i no longer
# corresponds to qualities[i] / keywords[i], which the later index-based code relies on.
# NOTE(review): 'en_core_web_sm' ships without static word vectors, so `.vector` is
# presumably the 96-dim context tensor rather than a real word embedding -- confirm
# whether a md/lg model was intended.
WordVectorArray = np.array([x for x in [nlp(str(quality)).vector.tolist() for quality in qualities] if len(x)>0]).T
KeywordVectorArray = np.array([x for x in [nlp(str(keyword)).vector.tolist() for keyword in keywords] if len(x)>0]).T

# Display both shapes as a sanity check.
WordVectorArray.shape,KeywordVectorArray.shape

((96, 127), (96, 18))

In [6]:
qualities[9],keywords[0]
WordVec = nlp(str(qualities[9])).vector.tolist()
KeywordVec = nlp(str(keywords[0])).vector.tolist()
sim = np.dot(WordVec,KeywordVec)/np.sqrt(np.dot(WordVec,WordVec))/np.sqrt(np.dot(KeywordVec,KeywordVec))
sim

0.48935463161376935

In [7]:
#np.dot(WordVectorArray.T,KeywordVectorArray) #crashes?!??

In [8]:
NrQualities = WordVectorArray.shape[1]
NrKeywords = KeywordVectorArray.shape[1]
NrQualities,NrKeywords,KeywordVectorArray.shape

(127, 18, (96, 18))

In [9]:
def cos_similarity(quality_vec,keyword_vec):
    """Return the cosine similarity of two 1-D vectors.

    The dot product of the two vectors is divided by the Euclidean norm of
    ``quality_vec`` and then by the Euclidean norm of ``keyword_vec``.
    There is no guard against zero-length vectors.
    """
    dot_product = np.dot(quality_vec, keyword_vec)
    quality_norm = np.sqrt(np.dot(quality_vec, quality_vec))
    keyword_norm = np.sqrt(np.dot(keyword_vec, keyword_vec))
    # Divide in the same order as before (quality norm first) to keep results bit-identical.
    return dot_product / quality_norm / keyword_norm

def euclidean_distance(quality_vec,keyword_vec):
    """Return a Euclidean-based similarity score for two numpy vectors.

    Despite the name, the value returned is ``1 - ||quality_vec - keyword_vec||^2``
    (one minus the *squared* distance), so larger values mean "more similar" and the
    score is 1 for identical vectors.
    """
    difference = quality_vec - keyword_vec
    squared_distance = np.dot(difference, difference)
    return 1 - squared_distance

def compute_similarity_matrix(similarity_metric, quality_vectors=None, keyword_vectors=None):
    """Evaluate ``similarity_metric`` for every (quality, keyword) pair.

    Parameters
    ----------
    similarity_metric : callable
        Called as ``similarity_metric(quality_vec, keyword_vec)`` with two 1-D
        vectors; must return a scalar.
    quality_vectors : array-like of shape (dim, n_qualities), optional
        One quality vector per column. Defaults to the notebook-level
        ``WordVectorArray``.
    keyword_vectors : array-like of shape (dim, n_keywords), optional
        One keyword vector per column. Defaults to the notebook-level
        ``KeywordVectorArray``.

    Returns
    -------
    numpy.ndarray of shape (n_qualities, n_keywords)
        Entry ``[i, j]`` is the score of quality ``i`` against keyword ``j``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if quality_vectors is None:
        quality_vectors = WordVectorArray
    if keyword_vectors is None:
        keyword_vectors = KeywordVectorArray
    quality_vectors = np.asarray(quality_vectors)
    keyword_vectors = np.asarray(keyword_vectors)

    # Take the sizes from the arrays themselves instead of separately stored counters,
    # which can go stale when cells are re-run out of order.
    n_qualities = quality_vectors.shape[1]
    n_keywords = keyword_vectors.shape[1]

    simMatrix = np.zeros((n_qualities, n_keywords))
    for i in range(n_qualities):
        quality_vec = quality_vectors[:, i]
        for j in range(n_keywords):
            simMatrix[i, j] = similarity_metric(quality_vec, keyword_vectors[:, j])
    return simMatrix

simMatrix_cos = compute_similarity_matrix(cos_similarity)
simMatrix_eucl = compute_similarity_matrix(euclidean_distance)
simMatrix_cos.shape,len(qualities)

((127, 18), 127)

In [10]:
simMatrix_cos[9,0],simMatrix_eucl[9,0]

(0.48935463161376935, -59.95796312459706)

In [11]:
def compute_sorted_keywords_df(simMatrix, quality_labels=None, keyword_labels=None):
    """Rank the keywords for every quality, best match first.

    Parameters
    ----------
    simMatrix : array-like of shape (n_qualities, n_keywords)
        Similarity scores; a larger score means a better match.
    quality_labels : sequence of str, optional
        Row labels, one per row of ``simMatrix``. Defaults to the notebook-level
        ``qualities``.
    keyword_labels : sequence of str, optional
        Keyword names, one per column of ``simMatrix``. Defaults to the
        notebook-level ``keywords``.

    Returns
    -------
    df : pandas.DataFrame
        One row per quality (indexed by its label); column ``k`` (1-based) holds the
        keyword ranked ``k``-th for that quality.
    order : numpy.ndarray of int, shape (n_qualities, n_keywords)
        The keyword indices behind ``df``, i.e. ``order[i, 0]`` is the column of
        ``simMatrix`` with the highest score in row ``i``.

    Raises
    ------
    ValueError
        If the shape of ``simMatrix`` does not match the number of labels.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if quality_labels is None:
        quality_labels = qualities
    if keyword_labels is None:
        keyword_labels = keywords
    quality_labels = np.asarray(quality_labels)
    keyword_labels = np.asarray(keyword_labels)
    simMatrix = np.asarray(simMatrix)

    expected_shape = (len(quality_labels), len(keyword_labels))
    if simMatrix.shape != expected_shape:
        raise ValueError(
            f"simMatrix has shape {simMatrix.shape}, expected {expected_shape} "
            "(number of qualities x number of keywords)"
        )

    # argsort is ascending, so reverse every row to put the best match first.
    order = np.vstack([np.argsort(scores)[::-1] for scores in simMatrix])

    df = pd.DataFrame(keyword_labels[order], index=quality_labels)
    # Label the columns with the 1-based rank instead of the default 0-based position.
    df.columns = pd.RangeIndex(start=1, stop=df.shape[1]+1, step=1)

    return df, order
df_cos,lind = compute_sorted_keywords_df(simMatrix_cos)
df_eucl,lind2 = compute_sorted_keywords_df(simMatrix_eucl)

In [12]:
lind

array([[15,  1,  4, ...,  2, 17, 13],
       [ 2,  0, 16, ...,  9, 10, 15],
       [15, 10,  5, ..., 13, 17,  7],
       ...,
       [ 1,  0,  4, ...,  3, 13, 10],
       [15, 10,  9, ...,  0, 17, 13],
       [15,  1,  9, ..., 17, 13,  2]], dtype=int64)

In [13]:
lind[3,:3]

array([15,  1,  9], dtype=int64)

In [None]:
from scipy import stats
stats.spearmanr([1,2,3,4,5], [5,6,7,8,7])

In [35]:
# The rows hold keyword strings, which Series.corr cannot correlate. Re-express each row as
# "rank position of every keyword" (values 1..18, indexed by keyword); Series.corr then aligns
# the two series on the keyword and correlates the positions.
pos_cos = pd.Series(df_cos.columns, index=df_cos.loc['delay'].to_numpy())
pos_eucl = pd.Series(df_eucl.columns, index=df_eucl.loc['delay'].to_numpy())
pos_cos.corr(pos_eucl, method='spearman')

In [None]:
df_cos.iloc[0,:].corr

In [27]:
lind[0,:]

array([15,  1,  4,  3,  5,  8, 12,  9,  0, 10, 11,  7,  6, 16, 14,  2, 17,
       13], dtype=int64)

In [29]:
lind2[1,:]

array([ 2, 16,  0,  3,  6, 11, 12,  7, 14,  8,  1,  9,  4, 17,  5, 13, 10,
       15], dtype=int64)

In [23]:
# NOTE(review): df_cos stores keyword strings (dtype object), not numbers, so
# DataFrame.corr has no numeric columns to correlate here -- confirm what this
# cell was meant to measure.
x = df_cos.corr(method="kendall")
x

In [None]:
# Each row of lind/lind2 lists keyword indices ordered by rank. Correlating those rows
# directly compares arbitrary keyword ids per rank slot. argsort inverts the permutation
# into "rank position of keyword k", which is what a rank correlation has to compare.
spearmanr(np.argsort(lind[5,:]),np.argsort(lind2[5,:]))

In [12]:
# Export the ranked-keyword tables (one row per quality, one column per rank).
# NOTE(review): the file names say 'sim_matrix', but these frames hold keyword
# rankings, not the similarity scores themselves.
df_cos.to_excel('sim_matrix_cos.xlsx')
df_eucl.to_excel('sim_matrix_eucl.xlsx')

In [24]:
df_cos

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
delay,sparse,dark,smooth,hollow,rough,clear,pure,muddy,bright,thin,thick,metallic,warm,soft,rich,full,hard,noisy
stretched,full,bright,soft,thick,rich,pure,clear,warm,hollow,rough,metallic,smooth,hard,dark,noisy,muddy,thin,sparse
humming,sparse,thin,rough,smooth,pure,warm,full,soft,clear,rich,thick,hollow,dark,bright,muddy,noisy,hard,metallic
vintage verb,sparse,dark,muddy,smooth,rough,pure,warm,thin,soft,rich,hollow,bright,metallic,full,clear,thick,hard,noisy
less sharp,hard,warm,full,clear,pure,muddy,dark,thin,thick,soft,hollow,rough,metallic,bright,rich,smooth,sparse,noisy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tinny,dark,sparse,warm,rough,pure,clear,smooth,soft,bright,full,rich,muddy,thick,hollow,metallic,thin,hard,noisy
aggressive,warm,pure,full,rich,bright,thick,soft,clear,rough,dark,smooth,muddy,hard,hollow,metallic,thin,noisy,sparse
atmospheric,dark,bright,smooth,rough,pure,muddy,clear,soft,hard,warm,sparse,thick,rich,metallic,full,hollow,noisy,thin
light chorus,sparse,thin,muddy,dark,rough,smooth,pure,warm,hollow,metallic,clear,soft,thick,rich,full,bright,hard,noisy


In [26]:
df_cos.index[3],df_cos.iloc[3,:],df_eucl.iloc[3,:]

('vintage verb',
 1       sparse
 2         dark
 3        muddy
 4       smooth
 5        rough
 6         pure
 7         warm
 8         thin
 9         soft
 10        rich
 11      hollow
 12      bright
 13    metallic
 14        full
 15       clear
 16       thick
 17        hard
 18       noisy
 Name: vintage verb, dtype: object,
 1        muddy
 2         warm
 3       sparse
 4         dark
 5         pure
 6       hollow
 7         soft
 8       smooth
 9     metallic
 10        thin
 11       rough
 12        rich
 13      bright
 14       thick
 15       clear
 16        full
 17        hard
 18       noisy
 Name: vintage verb, dtype: object)

In [None]:
# The rows hold keyword strings, so correlating them directly would compare labels, not ranks.
# Re-express each row as "rank position of every keyword" and correlate those positions,
# with the second series reordered to the keyword order of the first.
pos_cos = pd.Series(df_cos.columns, index=df_cos.iloc[3,:].to_numpy())
pos_eucl = pd.Series(df_eucl.columns, index=df_eucl.iloc[3,:].to_numpy())
res = spearmanr(pos_cos, pos_eucl.reindex(pos_cos.index))

In [14]:
#for (word,row1),(word,row2) in zip(df_cos.iterrows(),df_eucl.iterrows()):
    #res = spearmanr(row1,b=row2)

In [15]:
def compute_spearmanr(df1,df2):
    """Spearman rank correlation between two keyword rankings, one value per row.

    Each row of ``df1`` / ``df2`` lists the same keywords ordered from best to worst
    match. The cells are strings, so they cannot be fed to ``spearmanr`` directly:
    for every keyword the function looks up its rank position in both rows and
    correlates those positions.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Ranking tables with identical index; row ``i`` of both frames must be a
        permutation of the same unique keywords.

    Returns
    -------
    pandas.Series
        Spearman correlation per row, indexed by the row label.

    Raises
    ------
    ValueError
        If the frames have different indices, or a pair of rows does not contain
        the same unique keywords.
    """
    if not df1.index.equals(df2.index):
        raise ValueError("df1 and df2 must have the same index")

    correlations = dict()
    for (word, ranking1), (_, ranking2) in zip(df1.iterrows(), df2.iterrows()):
        keywords1 = list(ranking1)
        keywords2 = list(ranking2)
        position_in_2 = {keyword: position for position, keyword in enumerate(keywords2)}
        is_permutation = (
            len(keywords1) == len(keywords2)
            and len(set(keywords1)) == len(keywords1)
            and set(keywords1) == set(position_in_2)
            and len(position_in_2) == len(keywords2)
        )
        if not is_permutation:
            raise ValueError(f"rows for {word!r} do not rank the same unique keywords")

        # Walk the keywords in the order of ranking 1: their positions there are simply
        # 0..n-1, and position_in_2 gives the matching position in ranking 2.
        positions1 = np.arange(len(keywords1))
        positions2 = np.array([position_in_2[keyword] for keyword in keywords1])
        correlations[word] = spearmanr(positions1, positions2).correlation
    return pd.Series(correlations)

In [16]:
#spearmanr(df_cos,df_eucl) #kernel restarting

In [1]:
compute_spearmanr(df_cos,df_eucl)

NameError: name 'compute_spearmanr' is not defined

In [None]:
spearmanr?

In [None]:
res