# Top features per dimension

In [1]:
import sys 
import os 

# Add the directory two levels up to the import path so the project-local
# `utils` package can be imported (presumably the repository root — confirm).
sys.path.append('../..')

# Publish the data location through the environment before `utils` is imported.
# NOTE(review): presumably utils.data reads DATA_DIR from os.environ — confirm.
DATA_DIR = '../../data'
os.environ['DATA_DIR'] = DATA_DIR

from copy import deepcopy
from utils.data import load_data, load_dimension_embeddings, load_sorting
from utils.correlation import vectorize_concepts
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

# NOTE(review): `nlp` is not referenced by any of the cells visible in this
# notebook; loading the spaCy model may be unnecessary here.
nlp = spacy.load("en_core_web_sm")

2022-09-08 13:02:37.295387: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-08 13:02:37.295420: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Options forwarded positionally to load_data; their exact semantics are
# defined in utils.data and are not visible from this notebook.
min_amount_runs_feature_occured = 5
group_to_one_concept = True
min_amount_runs_feature_occured_within_concept = 1
run_nr = None 
duplicates = True 
# NOTE(review): the two leading positional `True` flags are not self-explanatory;
# check load_data's signature and consider passing them by keyword.
# The sixth return value is intentionally discarded.
gpt_df, mc_df, behv_sim, cslb_df, sorting_df, _ = load_data(True, True, min_amount_runs_feature_occured, min_amount_runs_feature_occured_within_concept, group_to_one_concept, run_nr, duplicates)

../../data


  warn(msg)


# Feature-Concept Matrix with TF-IDF 

Normalized across concepts, so that features like *is small* are weighted less, as they are present in almost all concepts.

In [75]:
# Feature-concept matrix built with the 'tfidf' weighting mode.
# NOTE(review): the third argument 'bla' looks like a placeholder — confirm
# what vectorize_concepts expects in that position.
# NOTE(review): `gpt_vec_count` is assigned again (mode 'count') in the
# "Feature-Concept Matrix with Counts" cell below, so on a top-to-bottom run
# this TF-IDF matrix is overwritten before it is used. The execution counts
# (In [75] here vs. In [4] there) suggest the saved run executed this cell
# last; decide which weighting is intended and give the two matrices
# distinct names.
gpt_vec_count = vectorize_concepts(gpt_df, load_sorting(), 'bla', 'tfidf')


# Feature-Concept Matrix with Counts

Normalized by dividing by the number of concepts in which the feature occurs.

In [4]:
# Feature-concept matrix built with the 'binary' mode; its non-zero entries
# are counted per feature further down in this cell.
# NOTE(review): 'bla' looks like a placeholder argument — confirm.
gpt_vec_binary = vectorize_concepts(gpt_df, load_sorting(), 'bla', 'binary')

def count_concepts(values):
    """Return how many entries of ``values`` are different from zero.

    Used as a per-column aggregator: applied to one feature column it yields
    the number of concepts in which that feature is present.
    """
    return sum(1 for entry in values if entry != 0)

# Number of concepts in which each feature is present: run the helper over
# every feature column and keep the result as a one-column frame (column 0)
# indexed by feature.
counts = gpt_vec_binary.apply(count_concepts, axis=0).to_frame()

# Feature-concept matrix built with the 'count' mode.
# NOTE(review): this reuses the name `gpt_vec_count` that the TF-IDF cell
# above also assigns; whichever cell ran last determines the matrix used by
# the weighting step.
gpt_vec_count = vectorize_concepts(gpt_df, load_sorting(), 'bla', 'count')


Unnamed: 0,0
absorbs water,4
amplifies sound,4
attracts iron,1
attracts metal,1
attracts nails,1
...,...
winks clothes,4
winks dishes,1
writes,3
writes on paper,6


In [31]:


# Sanity check: the feature-concept matrix and the dimension embeddings must
# list the same concepts in the same order, because the weighting step
# combines them along the concept index.
#
# The embeddings are loaded right here: `dimension_embeddings` is only
# assigned in the "Load dimension weights" section further down, so
# referencing it at this point raises a NameError on a fresh kernel
# (Restart & Run All).

# when CSLB is used we can only use the overlap concepts
#dimension_embeddings = dimension_embeddings.loc[dimension_embeddings.index.isin(intersection_concepts)]

list(gpt_vec_binary.index) == list(load_dimension_embeddings(49).index)

True

In [6]:
# NOTE(review): disabled draft, kept for reference only. It performs the same
# mul(axis=0) / sum(axis=0) weighting as the loop in the "Compute dimension
# weights for all features" section, plus an optional division by `counts`.
# The draft has no return statement. Remove once it is no longer needed.
#def weight_feature_counts_with_dimension_values(df, dimension_values, normalized):
#    df = df.mul(dimension_values, axis=0)
#    df = df.sum(axis=0).to_frame()

#    if normalized:
#        df = df.div(counts)
        
#    df = df.reset_index() 
#    df = df.rename(columns={df.columns[0]: 'feature', df.columns[1]: 'weight'})  

In [76]:
def matrix_to_top_list(df, top_n=20):
    """Convert a feature x dimension weight matrix into a long top-features table.

    For every column (dimension) of ``df`` the rows are ranked by descending
    weight and the ``top_n`` highest ones are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Weight matrix with one row per feature (feature labels in the index)
        and one column per dimension.
    top_n : int, default 20
        Number of top-weighted features to keep per dimension.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``weight`` and ``dimension``; the per-dimension
        blocks are stacked in the column order of ``df``. An empty frame is
        returned when ``df`` has no columns.
    """
    top_per_dim = []
    for dim in df.columns:
        dim_values = df.loc[:, [dim]].reset_index()
        dim_values = dim_values.rename(columns={dim_values.columns[0]: 'feature', dim_values.columns[1]: 'weight'})
        # head() + assign() build a fresh frame, so the label column is never
        # written into a slice of the sorted frame (no SettingWithCopyWarning).
        top = dim_values.sort_values(by='weight', ascending=False).head(top_n).assign(dimension=dim)
        top_per_dim.append(top)

    # pd.concat raises on an empty list; keep returning an empty frame for a
    # matrix without columns.
    if not top_per_dim:
        return pd.DataFrame()

    # One concat at the end instead of re-copying a growing frame per dimension.
    return pd.concat(top_per_dim)

# Load dimension weights 

In [80]:
# Placeholder for the normalized feature weights produced in the
# "Normalize feature weights across dimensions" section.
normed_features = pd.DataFrame()

# Number of embedding dimensions to load.
n_dims = 49
dimension_embeddings = load_dimension_embeddings(n_dims)
# The column labels are the dimension names iterated over by later cells.
dims = dimension_embeddings.columns

# norm sum = 1 per dimension -> needed for normalization by dimensions
dim_sums = dimension_embeddings.sum(axis=0)
print(dim_sums.shape)
# Divide every column by its own sum (div() aligns the Series on the column labels).
dimension_embeddings = dimension_embeddings.div(dim_sums)
print(dimension_embeddings.shape)


(49,)
(1854, 49)


In [9]:
# Print a LaTeX table that numbers the dimension labels starting at 1.
df = pd.DataFrame({'Dimension': load_dimension_embeddings(49).columns})
df.insert(0, 'index', range(1, len(df) + 1))
print(df.to_latex(index=False))

\begin{tabular}{rl}
\toprule
 index &                                        Dimension \\
\midrule
     1 &                made of metal / artificial / hard \\
     2 &  food-related / eating-related / kitchen-related \\
     3 &                         animal-related / organic \\
     4 &             clothing-related / fabric / covering \\
     5 & furniture-related / household-related / artifact \\
     6 &                            plant-related / green \\
     7 &                                 outdoors-related \\
     8 &             transportation / motorized / dynamic \\
     9 &                          wood-related / brownish \\
    10 &                                body part-related \\
    11 &                                         colorful \\
    12 &              valuable / special occasion-related \\
    13 &                          electronic / technology \\
    14 &    sport-related / recreational activity-related \\
    15 &                              disc-shap

# Compute dimension weights for all features

In [81]:
# For every dimension, scale each concept row of the feature-concept matrix by
# that concept's value on the dimension and sum over concepts. The result is
# one weight per feature and dimension.
weighted_columns = []

for dim in dims:
    print(f'Dimension: {dim}')
    dimension_values = dimension_embeddings.loc[:, dim]
    # mul() returns a new frame, so gpt_vec_count is left untouched and no
    # defensive deepcopy of the whole matrix is needed per dimension.
    feature_weights = gpt_vec_count.mul(dimension_values, axis=0).sum(axis=0).sort_index()
    weighted_columns.append(feature_weights.rename(dim))

# Assemble the features x dimensions frame with a single concat instead of
# re-concatenating a growing frame on every iteration.
weighted_features_for_all_dims = pd.concat(weighted_columns, axis=1) if weighted_columns else pd.DataFrame()

Dimension: made of metal / artificial / hard
Dimension: food-related / eating-related / kitchen-related
Dimension: animal-related / organic
Dimension: clothing-related / fabric / covering
Dimension: furniture-related / household-related / artifact
Dimension: plant-related / green
Dimension: outdoors-related
Dimension: transportation / motorized / dynamic
Dimension: wood-related / brownish
Dimension: body part-related
Dimension: colorful
Dimension: valuable / special occasion-related
Dimension: electronic / technology
Dimension: sport-related / recreational activity-related
Dimension: disc-shaped / round
Dimension: tool-related
Dimension: many small things / course pattern
Dimension: paper-related / thin / flat / text-related
Dimension: fluid-related / drink-related
Dimension: long / thin
Dimension: water-related / blue
Dimension: powdery / fine-scale pattern
Dimension: red
Dimension: feminine (stereotypically) / decorative
Dimension: bathroom-related / sanitary
Dimension: black / noble

# Normalize feature weights across dimensions

## By subtracting the mean value of all other dimensions

In [82]:
# Contrast every dimension against the rest: from a feature's weight on the
# dimension, subtract its mean weight over all *other* dimensions.
normed_columns = []

for dim in dims:
    mean_per_feature = weighted_features_for_all_dims.drop(dim, axis=1).mean(axis=1)
    dim_values = weighted_features_for_all_dims.loc[:, dim]
    normed_columns.append(dim_values.subtract(mean_per_feature).rename(dim))

# Build the frame from scratch inside this cell so it can be re-run safely:
# appending to a `normed_features` created in another cell would add a second
# copy of every dimension column each time this cell is executed again.
normed_features = pd.concat(normed_columns, axis=1) if normed_columns else pd.DataFrame()


In [None]:
# Write the full feature x dimension matrix (feature labels kept as the index column).
normed_features.to_csv(f'./normed_features_per_{n_dims}_dimension_matrix.csv')

# Write the long-format ranking of the top features per dimension.
top_features_per_dimension = matrix_to_top_list(normed_features)
top_features_per_dimension.to_csv(f'./normed_features_per_{n_dims}_dimension_list.csv', index=False)