In [None]:
from utils.correlation import calc_correlation, vectorize_concepts
from utils.data import load_data, load_gpt, load_cslb, load_sorting, load_mcrae, generate_concepts_to_keep
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns


# Load spaCy's small English pipeline ("en_core_web_sm") once, in the setup cell.
# NOTE(review): `nlp` is not referenced by any cell visible here — confirm that a
# later cell (or an imported helper) actually needs it before keeping the load.
nlp = spacy.load("en_core_web_sm")

In [None]:
def stats(df, min_concepts_shared=3):
    """Print and return summary statistics of a concept/feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'concept_id'`` and ``'feature'``. Every row
        is counted as one feature listing, so repeated rows are counted
        repeatedly. The frame passed in is not modified.
    min_concepts_shared : int, default 3
        Minimum number of rows (with a non-null ``concept_id``) a feature must
        occur in to be treated as a "shared" feature. The default reproduces
        the threshold that used to be hard-coded.

    Returns
    -------
    tuple
        ``(n_features, n_concepts, n_unique_features,
        mean_amount_features_per_concept,
        mean_amount_of_concepts_per_feature,
        mean_amount_shared_features_per_concept)``

    Notes
    -----
    * For a frame without rows the share of unique features is reported as
      ``nan`` instead of raising ``ZeroDivisionError``; the means are ``nan``
      as well.
    * The mean number of shared features per concept is taken over the
      concepts that still have at least one row after the shared-feature
      filter; concepts without any shared feature do not enter the average.
    """
    n_features = len(df)
    n_concepts = len(df['concept_id'].unique())
    n_unique_features = len(df['feature'].unique())

    # Row counts per group; `count` ignores null values in the counted column.
    features_per_concept = df.groupby('concept_id')['feature'].count()
    concepts_per_feature = df.groupby('feature')['concept_id'].count()
    mean_amount_features_per_concept = features_per_concept.mean()
    mean_amount_of_concepts_per_feature = concepts_per_feature.mean()

    # Guard the ratio so an empty table yields nan rather than crashing.
    if n_features:
        share_unique_features = (n_unique_features / n_features) * 100
    else:
        share_unique_features = float('nan')

    print(f'Amount of features: {n_features}')
    print(f'Amount of concepts: {n_concepts}')
    print(f'Amount of unique features: {n_unique_features}')
    print(f'Mean amount of feature per concept: {mean_amount_features_per_concept}')
    print(f'Share of unique features to all features: {share_unique_features}')
    print(f'Mean amount of concepts per feature: {mean_amount_of_concepts_per_feature}')

    # TODO NoSF (kept from the original; presumably "number of shared features").
    # Reuse the per-feature counts instead of a second groupby + merge: keep the
    # rows whose feature reaches the sharing threshold.
    shared_features = concepts_per_feature[concepts_per_feature >= min_concepts_shared].index
    shared_rows = df[df['feature'].isin(shared_features)]
    mean_amount_shared_features_per_concept = (
        shared_rows.groupby('concept_id')['feature'].count().mean()
    )
    print(f'Mean amount of shared features per concept: {mean_amount_shared_features_per_concept}')

    return (
        n_features,
        n_concepts,
        n_unique_features,
        mean_amount_features_per_concept,
        mean_amount_of_concepts_per_feature,
        mean_amount_shared_features_per_concept,
    )


In [None]:
# Load the three feature tables that are reported first.
# NOTE(review): the loaders take positional flags whose meaning is not visible in
# this notebook. Judging only by the labels printed below, the last load_gpt
# argument presumably toggles keeping duplicates and the first one presumably is
# a filter threshold — confirm against utils.data before relying on this.
cslb_df = load_cslb(False)
gpt_df = load_gpt(1, False, 1,1, True)
mc_df = load_mcrae(False, False)

# Each section prints a label, lets stats() print its summary, then a spacer.
print('Stats for CSLB')
stats(cslb_df)
print('\n')

print('Stats for GPT with duplicates')
stats(gpt_df)
print('\n')

# The label is printed before the table is loaded, so anything the loader itself
# prints appears underneath this heading.
print('Stats for GPT without duplicates')
gpt_df_without_dup = load_gpt(4, False, 1, 1, False)
stats(gpt_df_without_dup)
print('\n')

# Same call as above except for the first argument (1 instead of 4).
print('Stats for GPT without duplicates and no filter')
gpt_df_without_dup_no = load_gpt(1, False, 1, 1, False)
stats(gpt_df_without_dup_no)
print('\n')

# Last expression of the cell: besides the printed lines, the tuple returned by
# stats() is shown as the cell's output when run in Jupyter.
print('Stats for McRae without duplicates')
stats(mc_df)
