# Top features per dimension

In [None]:
from evaluation.data import load_dimension_embeddings, load_gpt, load_sorting, load_data
from copy import deepcopy
from evaluation.correlation import calc_correlation, vectorize_concepts
from evaluation.data import load_data, load_gpt, load_cslb, load_sorting, load_cslb_count_vec, load_mcrae, generate_concepts_to_keep, match_behv_sim, load_behav
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

nlp = spacy.load("en_core_web_sm")

In [None]:
def sort(df, sorting=None):
    """Return ``df`` with its rows reordered to follow the sorting table.

    Every index label of ``df`` is looked up in the ``concept_id`` column of
    the sorting table; rows are then ordered by the sorting table's own index
    value for that concept. Labels that do not occur in the sorting table end
    up last.

    Args:
        df: DataFrame whose index holds concept ids. It is left untouched.
        sorting: Table with a ``concept_id`` column. Defaults to the
            notebook-level ``sorting_df``.

    Returns:
        A new DataFrame with the same columns as ``df`` and reordered rows.
    """
    if sorting is None:
        sorting = sorting_df

    # reset_index() exposes the table's index as an 'index' column, so this
    # Series maps concept id -> index value in the sorting table.
    rank_by_concept = sorting.reset_index().set_index('concept_id')['index']

    # Keep the ranks outside of `df`: writing a helper column into the frame
    # would mutate the caller's object and clobber a feature column that
    # happens to share the helper's name.
    ranks = pd.Series(np.asarray(df.index.map(rank_by_concept)))
    # sort_values puts NaN (unknown concepts) last; mergesort keeps ties stable.
    row_order = ranks.sort_values(kind='mergesort').index.to_numpy()
    return df.iloc[row_order]

def vec(gpt_df, cslb_df, mc_df, behv_sim, bert_df=None, vec='binary'):
    """Vectorize each feature source and restrict all of them to shared concepts.

    Args:
        gpt_df: Feature table passed to ``vectorize_concepts``.
        cslb_df: Feature table passed to ``vectorize_concepts``.
        mc_df: Feature table passed to ``vectorize_concepts``.
        behv_sim: Behavioural similarity data, passed to ``match_behv_sim``.
        bert_df: Optional fourth feature table. A string passed in this
            position is treated as the vectorization method, because the
            notebook also calls
            ``vec(gpt_df, cslb_df, mc_df, behv_sim, vec_method)``.
        vec: Vectorization method forwarded to ``vectorize_concepts``. With
            ``'count'`` the CSLB vectors come from ``load_cslb_count_vec()``
            instead.

    Returns:
        ``(gpt_vec, cslb_vec, mc_vec, behv_sim, bert_vec)`` when ``bert_df``
        is given, otherwise ``(gpt_vec, cslb_vec, mc_vec, behv_sim)``.
    """
    # Support the four-table call form: the method string arrives in the
    # bert_df slot and there is no BERT table to process.
    if isinstance(bert_df, str):
        bert_df, vec = None, bert_df
    with_bert = bert_df is not None

    # Load the sorting table once instead of once per consumer.
    # NOTE(review): assumes vectorize_concepts / match_behv_sim do not modify
    # the table in place - confirm.
    sorting = load_sorting()

    gpt_vec = vectorize_concepts(gpt_df, sorting, 'bla', vec)
    mc_vec = vectorize_concepts(mc_df, sorting, 'bla', vec)
    if vec == 'count':
        # The count variant uses the dedicated CSLB count vectors, so the
        # generic vectorization of cslb_df would be thrown away anyway.
        cslb_vec = load_cslb_count_vec()
    else:
        cslb_vec = vectorize_concepts(cslb_df, sorting, 'bla', vec)

    if with_bert:
        bert_vec = vectorize_concepts(bert_df, sorting, 'bla', vec)
        intersection_concepts = generate_concepts_to_keep(
            gpt_df, mc_df, cslb_df, bert_df, 'intersection')
    else:
        # Same three-table call the notebook itself makes when it computes
        # intersection_concepts at top level.
        intersection_concepts = generate_concepts_to_keep(
            gpt_df, mc_df, cslb_df, 'intersection')

    behv_sim = match_behv_sim(behv_sim, intersection_concepts, sorting)

    # Keep only the shared concepts, then bring the rows into sorting order.
    gpt_vec = sort(gpt_vec.loc[intersection_concepts])
    cslb_vec = sort(cslb_vec.loc[intersection_concepts])
    mc_vec = sort(mc_vec.loc[intersection_concepts])

    if not with_bert:
        return gpt_vec, cslb_vec, mc_vec, behv_sim

    bert_vec = sort(bert_vec.loc[intersection_concepts])
    return gpt_vec, cslb_vec, mc_vec, behv_sim, bert_vec

In [None]:
# --- Settings forwarded to load_data -----------------------------------
min_amount_runs_feature_occured = 5
min_amount_runs_feature_occured_within_concept = 1
group_to_one_concept = True
duplicates = True
strategy = None
run_nr = None

# --- Vectorization method forwarded to vec ------------------------------
vec_method = 'count'

# The two leading True flags are positional switches of load_data; their
# meaning is not visible in this notebook.
gpt_df, mc_df, behv_sim, cslb_df, sorting_df = load_data(
    True,
    True,
    min_amount_runs_feature_occured,
    min_amount_runs_feature_occured_within_concept,
    strategy,
    group_to_one_concept,
    run_nr,
    duplicates,
)

# NOTE(review): this call passes four inputs plus the method and unpacks four
# results, while the signature of `vec` lists a `bert_df` parameter before
# `vec` - confirm that `vec` accepts this call form.
gpt_vec, cslb_vec, mc_vec, behv_sim = vec(gpt_df, cslb_df, mc_df, behv_sim, vec_method)

dimension_embeddings = load_dimension_embeddings()

# Concepts shared by the GPT, McRae and CSLB tables.
intersection_concepts = generate_concepts_to_keep(gpt_df, mc_df, cslb_df, 'intersection')


In [None]:
# For each dimension, weight every concept's feature vector by the concept's
# score on that dimension, sum over concepts, and list the 20 features with
# the largest weighted sum.
dims = dimension_embeddings.columns
dims = dims[:1]  # only the first dimension is processed

# Concepts whose rows are weighted. 'man' is deliberately left unweighted
# (weight 1), exactly as before; the reason is not visible in this notebook.
weighted_concepts = [concept_id for concept_id in intersection_concepts
                     if concept_id != 'man']

for dim in dims:
    print(f'Dimension: {dim}')
    dim_scores = dimension_embeddings[dim]
    for model, vec_df in (('GPT', gpt_vec), ('CSLB', cslb_vec)):
        print(f'Model: {model}')

        # One weight per row: the dimension score for weighted concepts and
        # 1.0 for every other row, so those rows stay unchanged.
        is_weighted = vec_df.index.isin(weighted_concepts)
        row_weights = np.ones(len(vec_df))
        row_weights[is_weighted] = dim_scores.loc[vec_df.index[is_weighted]].to_numpy()

        # A single vectorized multiply replaces the deep copy plus per-row
        # .loc assignment, which was slow and wrote float products into the
        # existing (count) columns one row at a time.
        vectorized_concepts_copy = vec_df.mul(row_weights, axis=0)
        vectorized_concepts_sum = vectorized_concepts_copy.sum(axis=0)

        # Disabled normalisation by the number of concepts a feature occurs in
        # (kept for reference):
        #for feature in vectorized_concepts_sum.index:
        #    amount_concepts_where_feature_occured = vectorized_concepts[vectorized_concepts[feature] == 1].shape[0]
        #    vectorized_concepts_sum[feature] = vectorized_concepts_sum[feature] / amount_concepts_where_feature_occured
        dim_string = dim.replace('/', '-')  # '/' would otherwise create a sub-path
        top_features = vectorized_concepts_sum.sort_values(ascending=False)[:20]
        print(top_features)
    # NOTE(review): this runs once per dimension, after the model loop, so only
    # the last model's (CSLB) top features are written - confirm that this is
    # intended.
    top_features.to_csv(f'./evaluation/dimensions/{dim_string}.csv')