In [1]:
%load_ext autoreload
%autoreload 2

import sys
if '..' not in sys.path:
    sys.path.append('..')
from notebooks.results import load_results, classification, clusterization, to_latex_table, GremDataFrame, include_pivot_index, groups_to_latex_table
import re
import pandas as pd

In [21]:
# Vectorizer names grouped by family; assign_vectorizer_type() maps each
# family to its (Polish) label used in the result tables.

# Labelled 'gramatyczny'.
GREMVECS = [
    'BigramMorphTagVectorizer100',
    'BigramMorphTagVectorizer370',
    'FullMorphTagVectorizer',
    'MorphTagVectorizer',
    'StyloMetrix',
]
# Labelled 'zliczający'.
BOWVECS = [
    'CountVectorizer1000',
    'CountVectorizer5000',
    'TfidfVectorizer1000',
    'TfidfVectorizer5000',
    'CountTfidf1000',
]
# Labelled 'semantyczny'.
SEMVECS = [
    'HerbertFT',
    'HerbertFrozen',
    'RoBERTaFT',
    'RoBERTaFrozen',
    'DPEBPVectorizer',
]
# Labelled 'gramatyczno-semantyczny'.
GREMSEMVECS = [
    'GremBERT',
    'PanGremBERT',
    'FrozenGremBERT',
    'FrozenPanGremBERT',
]

def extract_letters(s: str) -> str:
    """Return the first contiguous run of ASCII letters found in ``s``.

    For example ``'KMeans1.0'`` yields ``'KMeans'``.

    Raises:
        IndexError: if ``s`` contains no ASCII letter at all.
    """
    letter_runs = re.findall(r'[a-zA-Z]+', s)
    # Only the leading run is of interest; later runs are ignored.
    return letter_runs[0]

def fix_dataset_name(s: str) -> str:
    """Replace a known misspelled dataset name with its corrected form.

    Names that have no registered correction are returned unchanged.
    """
    corrections = {'TweeterCyberbullying': 'TwitterCyberbullying'}
    return corrections[s] if s in corrections else s

def fix_vectorizer_name(s: str) -> str:
    """Translate an alternative vectorizer name to the canonical one.

    Names that are not listed as aliases are returned unchanged.
    """
    aliases = {
        'DPEBPVectorizer100Avg': 'DPEBPVectorizer',
        'SpacyMorphTagVectorizer': 'MorphTagVectorizer',
    }
    try:
        return aliases[s]
    except KeyError:
        # Not an alias: the name is already in its final form.
        return s

def assign_vectorizer_type(name: str) -> str:
    """Return the vector-type label of the family that contains ``name``.

    The families are checked in the order BOWVECS, SEMVECS, GREMVECS,
    GREMSEMVECS; the first one containing ``name`` determines the label.
    ``'nieznany'`` is returned when no family contains the name.
    """
    labelled_families = (
        (BOWVECS, 'zliczający'),
        (SEMVECS, 'semantyczny'),
        (GREMVECS, 'gramatyczny'),
        (GREMSEMVECS, 'gramatyczno-semantyczny'),
    )
    for members, label in labelled_families:
        if name in members:
            return label
    return 'nieznany'

def vector_type_sorter(data: tuple[str, pd.DataFrame]) -> int:
    """Sort key that puts ``(vector_type, frame)`` pairs in presentation order.

    Only the first element of ``data`` is inspected. The known labels rank
    1 to 4 in the order listed below; any other label ranks 5, i.e. last.
    """
    presentation_order = (
        'zliczający',
        'gramatyczny',
        'semantyczny',
        'gramatyczno-semantyczny',
    )
    group_label = data[0]
    for rank, known_label in enumerate(presentation_order, start=1):
        if group_label == known_label:
            return rank
    return 5

In [22]:
# Load the stored results and wrap them in the project's GremDataFrame.
loaded_results = load_results('../results/')
results_df = GremDataFrame(loaded_results)

# Normalise dataset and vectorizer names.
results_df['dataset'] = results_df['dataset'].apply(fix_dataset_name)
results_df['vectorizer'] = results_df['vectorizer'].apply(fix_vectorizer_name)

# Derived columns: the head model name without its numeric suffix, and the
# vectorizer family label.
results_df['base_head_model'] = results_df['params_name'].apply(extract_letters)
results_df['vector_type'] = results_df['vectorizer'].apply(assign_vectorizer_type)

# Keep only rows whose vectorizer belongs to one of the known families.
has_known_type = results_df['vector_type'] != 'nieznany'
results_df = GremDataFrame(results_df[has_known_type])

print(len(results_df))
results_df.head()

1596


Unnamed: 0,dataset,datacleaner,vectorizer,params_name,accuracy,f1_score,recall,precision,silhouette,davies_bouldin,calinski_harabasz,bcubed_precission,bcubed_recall,bcubed_f1,base_head_model,vector_type
0,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,MLP1,0.915423,0.477922,0.5,0.457711,,,,,,,MLP,gramatyczny
1,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,RandomForest1,0.915423,0.477922,0.5,0.457711,,,,,,,RandomForest,gramatyczny
2,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,LogisticRegression1,0.915423,0.477922,0.5,0.457711,,,,,,,LogisticRegression,gramatyczny
3,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,KMeans1.0,,,,,0.086742,3.113713,889.408505,0.845114,0.501334,0.629336,KMeans,gramatyczny
4,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,DBSCAN1,,,,,0.213195,2.498378,56.53377,0.844912,0.928915,0.884924,DBSCAN,gramatyczny


In [23]:
# Write the current results_df to a parquet file; the path is relative to the
# notebook's working directory.
results_df.to_parquet('../witek/results.parquet')

# Klasyfikacja

In [None]:
# Best classification run per vectorizer on Classics5Authors35Books with the
# ProperNamesMasker cleaner, printed as a LaTeX table grouped by vector type.
df = (
    results_df
    .classification()
    .dataset('Classics5Authors35Books')
    .data_cleaner('ProperNamesMasker')
    # Sort descending so that .first() below picks the highest f1_score of each
    # vectorizer; the default ascending order would select the worst run.
    .sort_values('f1_score', ascending=False)
    .groupby('vectorizer')
    # NOTE: GroupBy.first() takes the first non-null value of every column.
    .first()
)
df = include_pivot_index(df, 'vectorizer')
gdf = df.groupby('vector_type')[['vectorizer', 'recall', 'precision', 'accuracy', 'f1_score']]

print(groups_to_latex_table(
    gdf,
    groups_sort_key=vector_type_sorter,
    bold_labels=['f1_score', 'accuracy'],
    column_names=['typ wektorów', 'wektoryzator', 'pełność (recall)', 'precyzja', 'dokładność', 'miara f1'],
))

In [None]:
# Best classification run per vectorizer on Classics5Authors35Books with the
# DummyDatacleaner cleaner, printed as a LaTeX table and shown with the
# column maxima highlighted.
df = (
    results_df
    .classification()
    .dataset('Classics5Authors35Books')
    .data_cleaner('DummyDatacleaner')
    # Sort descending so that .first() below picks the highest f1_score of each
    # vectorizer; the default ascending order would select the worst run.
    .sort_values('f1_score', ascending=False)
    .groupby('vectorizer')
    # NOTE: GroupBy.first() takes the first non-null value of every column.
    .first()
    [['accuracy', 'f1_score', 'recall', 'precision', 'base_head_model']]
)
df = include_pivot_index(df, 'wektoryzator')
print(to_latex_table(
    df,
    column_names=['wektoryzator', 'dokładność', 'miara f1', 'pełność (recall)', 'precyzja', 'model UM'],
    bold_labels=['f1_score', 'accuracy', 'recall', 'precision'],
))
df.style.highlight_max(color = '#666666', axis = 0)

# Grupowanie

In [None]:
# Best clustering run per vectorizer on Classics5Authors35Books with the
# DummyDatacleaner cleaner, printed as a LaTeX table and shown with the
# column maxima highlighted.
df = (
    results_df
    .clusterization()
    .dataset('Classics5Authors35Books')
    .data_cleaner('DummyDatacleaner')
    # Rank by bcubed_f1: clustering rows carry NaN in f1_score (see the KMeans
    # and DBSCAN rows in the results_df.head() output), so sorting by f1_score
    # does not order them. Descending order makes .first() pick the best run.
    .sort_values('bcubed_f1', ascending=False)
    .groupby('vectorizer')
    # NOTE: GroupBy.first() takes the first non-null value of every column, so
    # a NaN metric in the top row is filled from a lower-ranked run.
    .first()
    [['bcubed_f1', 'bcubed_recall', 'bcubed_precission', 'silhouette', 'base_head_model']]
)
df = include_pivot_index(df, 'wektoryzator')
print(to_latex_table(
    df,
    # 'indeks' corrects the misspelled header 'indesk' in the printed table.
    column_names=['wektoryzator', 'f1 bcubed', 'pełność bcubed (recall)', 'precyzja bcubed', 'indeks silhouette',  'model UM'],
    bold_labels=['bcubed_f1', 'bcubed_recall', 'bcubed_precission', 'silhouette'],
))
df.style.highlight_max(color = '#666666', axis = 0)

In [43]:
# Best classification f1_score of every vectorizer on every dataset, shown as
# a vectorizer x dataset table with the per-dataset maximum highlighted.
pd.options.display.float_format = '{:,.3f}'.format

# Exclude 'HerbertVectorizer' with a filter rather than list.remove():
# remove() raises ValueError when the name is absent from results_df.
vecs = [v for v in results_df['vectorizer'].unique() if v != 'HerbertVectorizer']

df = (
    results_df
    .classification()
    .vectorizer(vecs)
    # .data_cleaner('ProperNamesMasker')
    # Highest f1_score first, so drop_duplicates keeps the best run of every
    # (dataset, vectorizer) pair.
    .sort_values('f1_score', ascending=False)
    .drop_duplicates(subset=['dataset', 'vectorizer'])
    # pivot() only reads the three named columns, so no other columns need to
    # be dropped beforehand.
    .pivot(index='vectorizer', columns='dataset', values='f1_score')
    .style.highlight_max(color = '#666666', axis = 0)
)
# df = include_pivot_index(df, 'zbiór danych')
df

dataset,Classics5Authors35Books,EroticVsOthers,OldNewspapers,PrusVsSienkiewicz,StarWarsFanfic,StarWarsFanficMedium,StarWarsFanficShort,TwitterCyberbullying,WritingStyle
vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BigramMorphTagVectorizer100,0.879816,0.596767,0.089154,0.750679,0.963278,0.818753,0.686301,0.500348,0.536801
BigramMorphTagVectorizer370,0.891134,0.60058,0.116297,0.796445,0.900966,0.850944,0.709485,0.615311,0.537584
CountTfidf1000,0.924524,0.646749,0.131315,0.883495,0.987759,0.88322,0.738083,0.632196,0.663568
CountVectorizer1000,0.897451,0.646804,0.128188,0.883495,0.975478,0.876556,0.738083,0.626262,0.656544
CountVectorizer5000,0.914083,0.655272,0.16485,0.899799,0.987759,0.88246,0.760631,0.708266,0.671502
DPEBPVectorizer,0.593334,0.615427,0.151467,0.706392,0.938943,0.864065,0.747152,0.500348,0.561997
FrozenGremBERT,0.66397,0.615279,0.193199,0.763844,0.902206,0.870549,0.806167,0.609427,0.589378
FrozenPanGremBERT,0.812815,0.616076,0.184294,0.68736,0.902206,0.889939,0.789929,0.548467,0.597499
FullMorphTagVectorizer,0.748191,0.604915,0.119493,0.82544,0.926655,0.799182,0.720949,0.632452,0.573628
GremBERT,0.913038,0.667508,0.324053,0.630393,1.0,0.977331,0.863479,0.489798,0.675413
