In [2]:
from correlation import calc_correlation, vectorize_concepts
from utils.data import load_data, load_gpt, load_cslb, load_sorting, load_cslb_count_vec, load_mcrae, generate_concepts_to_keep, match_behv_sim, load_behav
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

nlp = spacy.load("en_core_web_sm")

from os.path import join as pjoin



ImportError: attempted relative import with no known parent package

In [2]:
def sort(df, sorting=None):
    """Return a copy of `df` whose rows follow the order of the sorting table.

    Each index label of `df` is looked up in the `concept_id` column of the
    sorting table; the rows are then ordered by the position that
    `reset_index()` assigns to the matching sorting row (its `index` column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by concept id. It is not modified.
    sorting : pandas.DataFrame, optional
        Table with a `concept_id` column. When omitted, the notebook-level
        `sorting_df` is used, which is what the original implementation read.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as `df`, rows reordered.
    """
    if sorting is None:
        # Backward-compatible fallback to the global defined by the loading cells.
        sorting = sorting_df

    position_by_concept = sorting.reset_index().set_index('concept_id')['index']
    order_key = df.index.map(position_by_concept)

    # `assign` works on a copy, so the temporary key column never leaks into
    # the caller's frame (the previous version wrote it into `df` in place).
    return (
        df.assign(concept_num=order_key)
        .sort_values(by='concept_num')
        .drop('concept_num', axis=1)
    )

def vec(gpt_df, cslb_df, mc_df, behv_sim, bert_df, vec = 'binary'):
    """Vectorize the four feature frames and align them on their shared concepts.

    Every frame is passed through `vectorize_concepts` with a freshly loaded
    sorting table and the mode given by `vec`. In 'count' mode the CSLB vectors
    are replaced by the result of `load_cslb_count_vec()`. All vectors are then
    restricted to the concepts returned by `generate_concepts_to_keep(...,
    'intersection')`, the behavioural matrix is matched to the same concepts via
    `match_behv_sim`, and each vector frame is reordered with `sort`.

    Returns the tuple (gpt_vec, cslb_vec, mc_vec, behv_sim, bert_vec).
    """
    # Order matters: position 1 holds the CSLB vectors.
    frames = (gpt_df, cslb_df, mc_df, bert_df)
    vectors = [vectorize_concepts(frame, load_sorting(), 'bla', vec) for frame in frames]

    if vec == 'count':
        vectors[1] = load_cslb_count_vec()

    shared_concepts = generate_concepts_to_keep(gpt_df, mc_df, cslb_df, bert_df, 'intersection')
    vectors = [vectors_frame.loc[shared_concepts] for vectors_frame in vectors]
    matched_behaviour = match_behv_sim(behv_sim, shared_concepts, load_sorting())
    gpt_sorted, cslb_sorted, mc_sorted, bert_sorted = [sort(vectors_frame) for vectors_frame in vectors]

    return gpt_sorted, cslb_sorted, mc_sorted, matched_behaviour, bert_sorted

# Load data 

In [3]:
# Vectorization is done here with ALL features.
# load_data returns six objects; the third one is kept as `behv_sim_all` so that
# the concept-matched matrix returned by vec() can be stored separately as `behv_sim`.
# `sorting_df` bound here is also the global table that sort() reads.
gpt_df, mc_df, behv_sim_all, cslb_df, sorting_df, bert_df = load_data(True, True, 4, 1, None, True, 1, True)
# 'count' mode: inside vec() the CSLB vectors come from load_cslb_count_vec().
gpt_vec, cslb_vec, mc_vec, behv_sim, bert_vec = vec(gpt_df, cslb_df, mc_df, behv_sim_all, bert_df, 'count')


  warn(msg)


Amount of concepts to keep: 317


In [None]:
# Histogram of the `feature` column of the GPT frame.
# NOTE(review): presumably a quick look at how the features are distributed — confirm the column dtype suits .hist().
gpt_df['feature'].hist()

In [None]:
# Reload the frames with the sixth positional argument set to False (the first
# load passes True) for the feature-count plots below.
# The third return value is bound to `behv_sim_all`, as in the first load, so the
# concept-matched `behv_sim` produced by vec() is not overwritten before the
# calc_correlation cell uses it together with the matched vectors.
gpt_df, mc_df, behv_sim_all, cslb_df, sorting_df, bert_df = load_data(True, True, 4, 1, None, False, 1, True)

In [None]:
# Count the non-null `concept_id` entries per feature in the GPT frame,
# order the features by that count (ascending) and draw the counts as a line.
fig, ax = plt.subplots(1)

bla = (
    gpt_df
    .groupby('feature', as_index=False)
    .agg({'concept_id': 'count'})
    .sort_values(by='concept_id')
)
print(bla)
ax.plot(bla['feature'], bla['concept_id'])

In [None]:
# Count the non-null `concept_id` entries per feature in the CSLB frame,
# order the features by that count (ascending) and draw the counts as a line.
fig, ax = plt.subplots(1)

bla = (
    cslb_df
    .groupby('feature', as_index=False)
    .agg({'concept_id': 'count'})
    .sort_values(by='concept_id')
)
print(bla)
ax.plot(bla['feature'], bla['concept_id'])

In [None]:
# Count the non-null `concept_id` entries per feature in the McRae frame,
# order the features by that count (descending here) and draw the counts as a line.
fig, ax = plt.subplots(1)

bla = (
    mc_df
    .groupby('feature', as_index=False)
    .agg({'concept_id': 'count'})
    .sort_values(by='concept_id', ascending=False)
)
print(bla)
ax.plot(bla['feature'], bla['concept_id'])


# Predicting human similarity judgements

## THINGS

In [None]:
# Unpack the five correlation values that calc_correlation returns for the
# GPT, McRae, CSLB and BERT vectors against the behavioural similarity matrix.
# NOTE(review): relies on `gpt_vec`, `mc_vec`, `behv_sim`, `cslb_vec` and `bert_vec`
# from earlier cells; confirm `behv_sim` is the concept-matched matrix returned
# by vec() and not a raw load_data result.
r_gpt_behav, r_cslb_behav, r_mc_behav, r_gpt_mc, r_cslb_gpt = calc_correlation(gpt_vec, mc_vec, behv_sim, cslb_vec, bert_vec)

In [20]:
# GPT-only correlation with the behavioural data: no intersection with the other
# feature norms is taken, and None is passed in place of the McRae, CSLB and BERT vectors.
# `sorting_df` must be bound before sort() is called, because sort() reads it as a global.
sorting_df = load_sorting()
# Going by the variable names used for the same call in the "more runs" cell, the
# positional arguments are: min runs a feature occurred, group to one concept,
# min runs within concept, run number, duplicates — TODO confirm against load_gpt.
gpt_df = load_gpt(1,True,1,0,True)
gpt_vec = vectorize_concepts(gpt_df, sorting_df, 'bla', 'binary')
gpt_vec = sort(gpt_vec)
# Rebinds `behv_sim` to the output of load_behav().
behv_sim = load_behav()
r_gpt_behav, r_cslb_behav, r_mc_behav, r_gpt_mc, r_cslb_gpt = calc_correlation(gpt_vec, None, behv_sim, None, None)



Correlation GPT and THINGS: 0.5807




## Model performance with more runs

In [None]:
# Settings forwarded positionally to load_gpt below.
min_amount_runs_feature_occured = 1
group_to_one_concept = False
min_amount_runs_feature_occured_within_concept = 1
run_nr = None
duplicates = True

gpt_df = load_gpt(min_amount_runs_feature_occured, group_to_one_concept, min_amount_runs_feature_occured_within_concept, run_nr, duplicates)
mc_df = load_mcrae(True)
clsb_df = load_cslb(True)

# r[i-1] holds the GPT/behaviour correlation obtained from runs 1..i.
r = []
for i in range(1, 31):
    runs = list(range(1, i+1))
    print(runs)
    # Keep only the selected runs and merge each concept's features into one ';'-joined string.
    gpt_df_temp = gpt_df[gpt_df['run_nr'].isin(runs)]
    gpt_df_temp = gpt_df_temp.groupby('concept_id', as_index=False).agg({'feature': lambda x: ';'.join(x)})

    # vec() expects bert_df as its fifth positional argument and returns five
    # values; the previous call passed 'binary' in the bert_df slot and unpacked
    # only four results. `behv_sim` and `bert_df` come from earlier cells.
    gpt_vec, cslb_vec, mc_vec, behv_sim_matched, bert_vec = vec(gpt_df_temp, clsb_df, mc_df, behv_sim, bert_df, 'binary')
    # Same five-argument form as the other calc_correlation calls in this notebook.
    r_gpt_behav, r_cslb_behav, r_mc_behav, r_gpt_mc, r_cslb_gpt = calc_correlation(gpt_vec, mc_vec, behv_sim_matched, cslb_vec, bert_vec)
    r.append(r_gpt_behav)

In [None]:
# x: number of runs included (1..len(r)); y: the r_gpt_behav value collected for that many runs.
plt.plot(range(1, len(r) + 1), r)

## Word similarity and relatedness

In [None]:
# Reload everything with the same arguments as the first load cell and vectorize in 'count' mode.
# `behv_sim` is first bound to the load_data result and then rebound to the matched matrix from vec().
gpt_df, mc_df, behv_sim, cslb_df, sorting_df, bert_df = load_data(True, True, 4, 1, None, True, 1, True)
# NOTE(review): the fifth value returned by vec() is stored as `bert_sim` here but as `bert_vec`
# in the first vectorization cell — confirm which name later cells expect.
gpt_vec, cslb_vec, mc_vec, behv_sim, bert_sim = vec(gpt_df, cslb_df, mc_df, behv_sim, bert_df, 'count')

In [3]:
# Root folder of the human similarity/relatedness datasets. Every frame loaded
# below ends up with at least the columns `word1`, `word2` and `rating`
# (assuming the yp/mturk/rw CSVs already name their word columns that way — TODO confirm).
human_sim_judgements = './human_sim_judgements'

# The sub-path must be relative: os.path.join discards all previous components
# when a later one starts with '/', which made the old call read from the
# filesystem root instead of `human_sim_judgements`.
wordsim = pd.read_csv(pjoin(human_sim_judgements, 'wordsim353/combined.csv'))
wordsim = wordsim.rename(columns={'Word 1': 'word1', 'Word 2': 'word2', 'Human (mean)': 'rating'})

simlex = pd.read_csv(pjoin(human_sim_judgements, 'simlex999/SimLex-999.txt'), sep='\t', usecols=['word1', 'word2', 'SimLex999'])
simlex = simlex.rename(columns={'SimLex999': 'rating'})

# The MEN file has no header row; the column names are supplied here.
men = pd.read_csv(pjoin(human_sim_judgements, 'men/MEN_dataset_natural_form_full'), sep=' ', names=['word1', 'word2', 'rating'], header=None)

yp = pd.read_csv(pjoin(human_sim_judgements, 'yp/yp-130.csv'))
yp = yp.rename(columns={'similarity': 'rating'})

mturk771 = pd.read_csv(pjoin(human_sim_judgements, 'mturk_771/mturk-771.csv'))
mturk771 = mturk771.rename(columns={'similarity': 'rating'})

mturk287 = pd.read_csv(pjoin(human_sim_judgements, 'mturk_287/mturk-287.csv'))
mturk287 = mturk287.rename(columns={'similarity': 'rating'})

rw = pd.read_csv(pjoin(human_sim_judgements, 'rw/rw.csv'))
rw = rw.rename(columns={'similarity': 'rating'})
                 

NameError: name 'pd' is not defined

In [None]:
def calc_pair_sim(df_vec, concept1, concept2, out_of_category_concepts=()):
    """Cosine similarity of two concepts, corrected by an out-of-category baseline.

    Parameters
    ----------
    df_vec : pandas.DataFrame
        Feature vectors, one row per concept, indexed by concept label.
    concept1, concept2 : hashable
        Index labels of the two concepts to compare.
    out_of_category_concepts : iterable, optional
        Index labels of reference concepts. For each of them the similarity to
        `concept1` and to `concept2` is computed; the mean of all those values
        is subtracted from the pair similarity. Defaults to no reference
        concepts, so callers that pass only the two concepts keep working.

    Returns
    -------
    float
        `cosine(concept1, concept2)` minus the mean baseline similarity, or the
        plain cosine similarity when no reference concepts are given (the mean
        of an empty list would otherwise turn the result into NaN).
    """
    if out_of_category_concepts is None:
        out_of_category_concepts = ()

    # The two concept rows do not change inside the loop, so build them once.
    row1 = pd.DataFrame(df_vec.loc[concept1]).transpose()
    row2 = pd.DataFrame(df_vec.loc[concept2]).transpose()
    sim = cosine_similarity(row1, row2)[0][0]

    out_sims = []
    for out in out_of_category_concepts:
        out_row = pd.DataFrame(df_vec.loc[out]).transpose()
        out_sims.append(cosine_similarity(row1, out_row)[0][0])
        out_sims.append(cosine_similarity(row2, out_row)[0][0])

    if not out_sims:
        return sim
    return sim - np.mean(out_sims)

In [None]:
# For every benchmark, collect the word pairs whose two words are present in the
# GPT, CSLB and McRae vector indexes, compute the pair similarity in each vector
# space and report the Pearson correlation with the human ratings.
# NOTE(review): calc_pair_sim is called with the two concepts only — confirm its
# signature accepts a call without out-of-category concepts.
for dataset_name, wordsim_df in (('wordsim-353', wordsim),
                                 ('simlex-999', simlex),
                                 ('men', men),
                                 ('mturk-771', mturk771),
                                 ('mturk-287', mturk287),
                                 ('rw', rw),
                                 ('yp', yp)):
    ratings, gpt_similarities, cslb_similarities, mc_similarities = [], [], [], []

    for _, pair in wordsim_df.iterrows():
        word1 = pair['word1'].lower()
        word2 = pair['word2'].lower()
        rating = pair['rating']

        gpt_words = gpt_vec.index

        # Words missing from the GPT index are retried with a '1' suffix.
        if word1 not in gpt_words:
            word1 = f'{word1}1'
        if word2 not in gpt_words:
            word2 = f'{word2}1'

        indexes = (gpt_words, cslb_vec.index, mc_vec.index)
        word1_in_gpt_cslb = all(word1 in index for index in indexes)
        word2_in_gpt_cslb = all(word2 in index for index in indexes)

        # Skip pairs that are not covered by all three vector spaces.
        if not (word1_in_gpt_cslb and word2_in_gpt_cslb):
            continue

        ratings.append(rating)
        gpt_similarities.append(calc_pair_sim(gpt_vec, word1, word2))
        cslb_similarities.append(calc_pair_sim(cslb_vec, word1, word2))
        mc_similarities.append(calc_pair_sim(mc_vec, word1, word2))

    corr_cslb = np.corrcoef(cslb_similarities, ratings)[1][0]
    corr_gpt = np.corrcoef(gpt_similarities, ratings)[1][0]
    corr_mc = np.corrcoef(mc_similarities, ratings)[1][0]
    print(f'Dataset {dataset_name} has {len(ratings)} intersection word pairs and correlation of GPT-Dataset is {corr_gpt:.3} and CSLB-Dataset is {corr_cslb:.3}  Mc-Dataset is {corr_mc:.3}')
        
