In [3]:
%load_ext autoreload
%autoreload 2

import sys
if '..' not in sys.path:
    sys.path.append('..')
from notebooks.results import load_results, classification, clusterization, to_latex_table, GremDataFrame, include_pivot_index, groups_to_latex_table, VectorInfo, vector_data_iter, dataset_iter, DatasetInfo
import re
import pandas as pd
from qwlist import Lazy, QList

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
GREMVECS = ['BigramMorphTagVectorizer100', 'BigramMorphTagVectorizer370', 'FullMorphTagVectorizer', 'MorphTagVectorizer', 'StyloMetrix']
BOWVECS = ['CountVectorizer1000', 'CountVectorizer5000', 'TfidfVectorizer1000', 'TfidfVectorizer5000', 'CountTfidf1000']
SEMVECS = ['HerbertFT', 'HerbertFrozen', 'RoBERTaFT', 'RoBERTaFrozen', 'DPEBPVectorizer']
GREMSEMVECS = ['GremBERT', 'PanGremBERT', 'FrozenGremBERT', 'FrozenPanGremBERT']

def extract_letters(s: str) -> str:
    return re.findall(r'[a-zA-Z]+', s)[0]

def fix_dataset_name(s: str) -> str:
    return {
        'TweeterCyberbullying': 'TwitterCyberbullying'
    }.get(s, s)

def fix_vectorizer_name(s: str) -> str:
    return {
        'DPEBPVectorizer100Avg': 'DPEBPVectorizer',
        'SpacyMorphTagVectorizer': 'MorphTagVectorizer',
    }.get(s, s)

def assign_vectorizer_type(name: str) -> str:
    if name in BOWVECS:
        return 'semantyczny'
    if name in SEMVECS:
        return 'głęboki semantyczny'
    if name in GREMVECS:
        return 'gramatyczny'
    if name in GREMSEMVECS:
        return 'gramatyczno-semantyczny'
    return 'nieznany'

def vector_type_sorter(data: tuple[str, pd.DataFrame]) -> int:
    name = data[0]
    if name == 'semantyczny':
        return 1
    if name == 'gramatyczny':
        return 2
    if name == 'głęboki semantyczny':
        return 3
    if name == 'gramatyczno-semantyczny':
        return 4
    return 5

In [5]:
VECTORIZERS = [
    'MorphTagVectorizer',
    'DPEBPVectorizer',
    'CountVectorizer1000',
    'TfidfVectorizer1000',
    'FullMorphTagVectorizer',
    'BigramMorphTagVectorizer100',
    'BigramMorphTagVectorizer370',
    'StyloMetrix',
    'CountVectorizer5000',
    'TfidfVectorizer5000',
    'RoBERTaFT',
    'HerbertFrozen',
    'HerbertFT',
    'RoBERTaFrozen',
    'FrozenPanGremBERT',
    'GremBERT',
    'PanGremBERT',
    'FrozenGremBERT',
    'CountTfidf1000'
]

DATASETS = [
    'TwitterCyberbullying',
    'PrusVsSienkiewicz',
    'WritingStyle',
    'Classics5Authors35Books',
    'OldNewspapers',
    'StarWarsFanfic',
    'EroticVsOthers',
    'StarWarsFanficShort',
    'StarWarsFanficMedium'
]

VECTOR_SIZES = {
    'TfidfVectorizer1000': 1000,
    'TfidfVectorizer5000': 5000,
    'BigramMorphTagVectorizer100': 100,
    'BigramMorphTagVectorizer370': 370,
    'CountVectorizer1000': 1000,
    'CountVectorizer5000': 5000,
    'DPEBPVectorizer': 100,
    'FullMorphTagVectorizer': 1436,
    'HerbertFT': 768,
    'MorphTagVectorizer': 92,
    'StyloMetrix': 172,
    'HerbertFrozen': 768,
    'RoBERTaFT': 768,
    'RoBERTaFrozen': 768,
    'CountTfidf1000': 2000,
    'FrozenGremBERT': 860,
    'GremBERT': 860,
    'FrozenPanGremBERT': 1138,
    'PanGremBERT': 1138
}

In [6]:
results_df = GremDataFrame(load_results('../results/'))
results_df['dataset'] = results_df['dataset'].apply(fix_dataset_name)
results_df['vectorizer'] = results_df['vectorizer'].apply(fix_vectorizer_name)
results_df['base_head_model'] = results_df['params_name'].apply(extract_letters)
results_df['vector_type'] = results_df['vectorizer'].apply(assign_vectorizer_type)
results_df = GremDataFrame(results_df[results_df['vector_type'] != 'nieznany'])

DATA_CLEANERS = QList(results_df['datacleaner'].unique())

print(len(results_df))
results_df.head()

1596


Unnamed: 0,dataset,datacleaner,vectorizer,params_name,accuracy,f1_score,recall,precision,silhouette,davies_bouldin,calinski_harabasz,bcubed_precission,bcubed_recall,bcubed_f1,base_head_model,vector_type
0,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,MLP1,0.915423,0.477922,0.5,0.457711,,,,,,,MLP,gramatyczny
1,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,RandomForest1,0.915423,0.477922,0.5,0.457711,,,,,,,RandomForest,gramatyczny
2,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,LogisticRegression1,0.915423,0.477922,0.5,0.457711,,,,,,,LogisticRegression,gramatyczny
3,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,KMeans1.0,,,,,0.086742,3.113713,889.408505,0.845114,0.501334,0.629336,KMeans,gramatyczny
4,TwitterCyberbullying,DummyDatacleaner,MorphTagVectorizer,DBSCAN1,,,,,0.213195,2.498378,56.53377,0.844912,0.928915,0.884924,DBSCAN,gramatyczny


In [23]:
results_df.to_parquet('../witek/results.parquet')

In [8]:
ProperNamesMasker = 'ProperNamesMasker'
DummyDatacleaner = 'DummyDatacleaner'

# Klasyfikacja

`ascending = False` !!!!!

In [None]:
df = (
    results_df
    .classification()
    .dataset('Classics5Authors35Books')
    .data_cleaner(DummyDatacleaner)
    .sort_values('f1_score', ascending=False)
    .groupby('vectorizer')
    .first()
)
df

In [14]:
df = (
    results_df
    .classification()
    .dataset('OldNewspapers')
    .data_cleaner(DummyDatacleaner)
    .sort_values('f1_score', ascending=False)
    .groupby('vectorizer')
    .first()
)
df = include_pivot_index(df, 'vectorizer')
gdf = df.groupby('vector_type')[['vectorizer', 'recall', 'precision', 'accuracy', 'f1_score']]

print(groups_to_latex_table(
    gdf,
    groups_sort_key=vector_type_sorter,
    bold_labels=['f1_score', 'accuracy'],
    column_names=['typ wektorów', 'wektoryzator', 'pełność\n(recall)', 'precyzja', 'dokładność', 'miara f1'],
))

\begin{table}[H]
	\centering
	\caption{}
	\resizebox{
	\ifdim\width>\columnwidth
		\columnwidth
	\else
		\width
	\fi
	}{!}{
	\begin{tabular}{||c|c|c|c|c|c||}
		\hline
		\textbf{typ wektorów} & \textbf{wektoryzator} & \textbf{\makecell{pełność\\(recall)}} & \textbf{precyzja} & \textbf{dokładność} & \textbf{miara f1} \\
		\hline
		\multirow{5}{*}{semantyczny} & CountTfidf1000 & 0,132 & 0,145 & 0,259 & 0,131 \\
		 & CountVectorizer1000 & 0,129 & 0,142 & 0,258 & 0,128 \\
		 & CountVectorizer5000 & 0,159 & 0,194 & 0,287 & 0,165 \\
		 & TfidfVectorizer1000 & 0,128 & 0,178 & 0,267 & 0,123 \\
		 & TfidfVectorizer5000 & 0,164 & 0,209 & \textbf{0,297} & \textbf{0,169} \\
\hline		\multirow{5}{*}{gramatyczny} & BigramMorphTagVectorizer100 & 0,097 & 0,098 & 0,218 & 0,089 \\
		 & BigramMorphTagVectorizer370 & 0,120 & 0,162 & 0,244 & 0,116 \\
		 & FullMorphTagVectorizer & 0,119 & 0,190 & 0,258 & 0,119 \\
		 & MorphTagVectorizer & 0,125 & 0,145 & \textbf{0,264} & \textbf{0,121} \\
		 & StyloMetrix & 0

# Grupowanie

In [28]:
df = (
    results_df
    .clusterization()
    .dataset('OldNewspapers')
    .data_cleaner(DummyDatacleaner)
    .sort_values('bcubed_f1', ascending=False)
    .groupby('vectorizer')
    .first()
)
df = include_pivot_index(df, 'vectorizer')
gdf = df.groupby('vector_type')[['vectorizer', 'base_head_model', 'silhouette', 'bcubed_f1']]
print(groups_to_latex_table(
    gdf,
    groups_sort_key=vector_type_sorter,
    column_names=['typ wektorów', 'wektoryzator', 'sposób grupowania', 'indeks silhouette', 'bcubed f1'],
    bold_labels=['bcubed_f1', 'silhouette'],
))

df = (
    results_df
    .clusterization()
    .dataset('OldNewspapers')
    .data_cleaner(DummyDatacleaner)
    .sort_values('bcubed_f1', ascending=False)
    .groupby('vectorizer')
    .first()
)
df = include_pivot_index(df, 'vectorizer')
gdf = df[['vectorizer', 'silhouette', 'bcubed_recall', 'bcubed_precission', 'bcubed_f1', 'base_head_model']]
df = include_pivot_index(gdf, 'vector_type')
df.style.highlight_max(color = '#666666', axis = 0)

\begin{table}[H]
	\centering
	\caption{}
	\resizebox{\textwidth}{!}{
	\begin{tabular}{||c|c|c|c|c||}
		\hline
		\textbf{typ wektorów} & \textbf{wektoryzator} & \textbf{sposób grupowania} & \textbf{indeks silhouette} & \textbf{bcubed f1} \\
		\hline
		\multirow{5}{*}{zliczający} & CountTfidf1000 & DBSCAN & -0.273 & \textbf{0.160} \\
		 & CountVectorizer1000 & DBSCAN & -0.294 & \textbf{0.160} \\
		 & CountVectorizer5000 & DBSCAN & -0.324 & 0.158 \\
		 & TfidfVectorizer1000 & DBSCAN & \textbf{-0.213} & \textbf{0.160} \\
		 & TfidfVectorizer5000 & DBSCAN & -0.264 & 0.158 \\
\hline		\multirow{5}{*}{gramatyczny} & BigramMorphTagVectorizer100 & DBSCAN & -0.032 & 0.154 \\
		 & BigramMorphTagVectorizer370 & DBSCAN & -0.108 & 0.158 \\
		 & FullMorphTagVectorizer & DBSCAN & -0.006 & \textbf{0.160} \\
		 & MorphTagVectorizer & DBSCAN & \textbf{0.338} & 0.157 \\
		 & StyloMetrix & DBSCAN & 0.208 & 0.152 \\
\hline		\multirow{5}{*}{semantyczny} & DPEBPVectorizer & DBSCAN & -1.000 & 0.157 \\
		 & Herb

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column_name] = df.index


Unnamed: 0_level_0,vector_type,vectorizer,silhouette,bcubed_recall,bcubed_precission,bcubed_f1,base_head_model
vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BigramMorphTagVectorizer100,BigramMorphTagVectorizer100,BigramMorphTagVectorizer100,-0.032203,0.56803,0.08923,0.154233,DBSCAN
BigramMorphTagVectorizer370,BigramMorphTagVectorizer370,BigramMorphTagVectorizer370,-0.108391,0.921974,0.086394,0.157984,DBSCAN
CountTfidf1000,CountTfidf1000,CountTfidf1000,-0.27306,0.899255,0.087591,0.159633,DBSCAN
CountVectorizer1000,CountVectorizer1000,CountVectorizer1000,-0.293812,0.899255,0.087591,0.159633,DBSCAN
CountVectorizer5000,CountVectorizer5000,CountVectorizer5000,-0.323708,0.965596,0.085782,0.157566,DBSCAN
DPEBPVectorizer,DPEBPVectorizer,DPEBPVectorizer,-1.0,1.0,0.085301,0.157193,DBSCAN
FrozenGremBERT,FrozenGremBERT,FrozenGremBERT,-1.0,1.0,0.085301,0.157193,DBSCAN
FrozenPanGremBERT,FrozenPanGremBERT,FrozenPanGremBERT,-1.0,1.0,0.085301,0.157193,DBSCAN
FullMorphTagVectorizer,FullMorphTagVectorizer,FullMorphTagVectorizer,-0.005609,0.963403,0.087071,0.159708,DBSCAN
GremBERT,GremBERT,GremBERT,0.197332,0.241898,0.402852,0.302285,KMeans


In [43]:
pd.options.display.float_format = '{:,.3f}'.format
vecs = list(results_df['vectorizer'].unique())
vecs.remove('HerbertVectorizer')

df = (
    results_df
    .classification()
    .vectorizer(vecs)
    # .data_cleaner('ProperNamesMasker')
    .sort_values('f1_score', ascending=False)
    .drop_duplicates(subset=['dataset', 'vectorizer'])
    .drop(columns=list(results_df.columns[6:-1]) + ['datacleaner', 'params_name'])
    .pivot(index='vectorizer', columns='dataset', values='f1_score')
    .style.highlight_max(color = '#666666', axis = 0)
)
# df = include_pivot_index(df, 'zbiór danych')
df

dataset,Classics5Authors35Books,EroticVsOthers,OldNewspapers,PrusVsSienkiewicz,StarWarsFanfic,StarWarsFanficMedium,StarWarsFanficShort,TwitterCyberbullying,WritingStyle
vectorizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BigramMorphTagVectorizer100,0.879816,0.596767,0.089154,0.750679,0.963278,0.818753,0.686301,0.500348,0.536801
BigramMorphTagVectorizer370,0.891134,0.60058,0.116297,0.796445,0.900966,0.850944,0.709485,0.615311,0.537584
CountTfidf1000,0.924524,0.646749,0.131315,0.883495,0.987759,0.88322,0.738083,0.632196,0.663568
CountVectorizer1000,0.897451,0.646804,0.128188,0.883495,0.975478,0.876556,0.738083,0.626262,0.656544
CountVectorizer5000,0.914083,0.655272,0.16485,0.899799,0.987759,0.88246,0.760631,0.708266,0.671502
DPEBPVectorizer,0.593334,0.615427,0.151467,0.706392,0.938943,0.864065,0.747152,0.500348,0.561997
FrozenGremBERT,0.66397,0.615279,0.193199,0.763844,0.902206,0.870549,0.806167,0.609427,0.589378
FrozenPanGremBERT,0.812815,0.616076,0.184294,0.68736,0.902206,0.889939,0.789929,0.548467,0.597499
FullMorphTagVectorizer,0.748191,0.604915,0.119493,0.82544,0.926655,0.799182,0.720949,0.632452,0.573628
GremBERT,0.913038,0.667508,0.324053,0.630393,1.0,0.977331,0.863479,0.489798,0.675413


In [33]:
dataset = 'PrusVsSienkiewicz'
data_cleaner = 'DummyDatacleaner'


path = f'../data/{dataset}/{data_cleaner}/train.parquet'
df = pd.read_parquet(path)
df['len'] = df['clean_text'].apply(len)
df['word_count'] = df['clean_text'].apply(lambda s: len(s.split(' ')))
print(f'Classes: {len(df["label"].unique())}')
print(f'Mean text len: {df["len"].values.mean():.0f}')
print(f'Mean word count: {df["word_count"].values.mean():.0f}')
print(f'Examples: {len(df)}')
df.head()

Classes: 2
Mean text len: 996
Mean word count: 156
Examples: 5846


Unnamed: 0,clean_text,label,len,word_count
0,Wstęp\nW północno-wschodnim kącie Afryki leży...,0,996,141
1,równinie mającej kształt trójkąta. Trójkąt te...,0,1000,150
2,złoty. Wreszcie w początkach czerwca Nilowa p...,0,994,148
3,"gruntach osadza się muł żyzny, który zastępuj...",0,995,143
4,gdy słońce zbliża się do gwiazdozbioru Wagi. ...,0,998,140


In [None]:
def fix_dataset(info: VectorInfo | DatasetInfo) -> VectorInfo | DatasetInfo:
    name = {
        'TweeterCyberbullying': 'TwitterCyberbullying'
    }.get(info.dataset, info.dataset)
    info.dataset = name
    return info


def fix_vectorizer(info: VectorInfo) -> VectorInfo:
    name = {
        'DPEBPVectorizer100Avg': 'DPEBPVectorizer',
        'SpacyMorphTagVectorizer': 'MorphTagVectorizer'
    }.get(info.vectorizer, info.vectorizer)
    info.vectorizer = name
    return info


d = {
    info.vectorizer: info.vec_len
    for info
    in (
        Lazy(vector_data_iter("..\\data"))
        .map(fix_dataset)
        .map(fix_vectorizer)
        .filter(lambda x: x.dataset in DATASETS)
        .filter(lambda x: x.data_cleaner in DATA_CLEANERS)
        .filter(lambda x: x.vectorizer in VECTORIZERS)
    )
}

d