Máme mix balíkov, chceme sa ale zamerať na SLSP. Všetky non-SLSP balíky použijeme na trénovanie.
Vyskúšame postupne pridávať SLSP balíky a pozrieme sa, ako to ovplyvňuje accuracy na nepridaných SLSP balíkoch (použitých ako test set).

In [None]:
import pandas as pd
import numpy as np

slsp_root_path = '/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/'

# Which stored variant of the source parquet files to load:
#   ''                -> original texts
#   '_noplaceholders' -> texts with placeholders already removed
# Only the selected variant is read from disk (reading both and discarding the
# first one is wasted I/O).
text_variant_suffix = '_noplaceholders'

pb_slsp_df = pd.read_parquet(slsp_root_path + 'PB_SLSP_10k_0724' + text_variant_suffix + '.parquet')
mix_df = pd.read_parquet(slsp_root_path + 'SLSP_12k_0924' + text_variant_suffix + '.parquet')
small_df = pd.read_parquet(slsp_root_path + 'SLSP_1k_0724' + text_variant_suffix + '.parquet')
# The 1k file uses accented column headers; rename them to the names used by the other frames.
small_df = small_df.rename({'Balík': 'Balik', 'Číslo pohľadávky' : 'CisloPohladavky', 'Názov Súboru':'NazovSuboru' },axis=1)
big_df = pd.read_parquet(slsp_root_path + 'SLSP_29k_0924' + text_variant_suffix + '.parquet')
big_df['Balik'] = 'SLSP24DM'  # we assume big_df (29k) is SLSP24DM


# Stack all sources and keep only the columns they share.
all_df = pd.concat([pb_slsp_df, mix_df, small_df, big_df])[['CisloPohladavky','Balik','NazovSuboru','Text','label']]
# Rows without a package name get the literal 'NA' so that string tests on 'Balik' work later.
all_df.loc[all_df['Balik'].isna(),'Balik'] = 'NA'


# preprocess texts
import mlflow
mlflow.set_tracking_uri('http://localhost:5001')
# Both preprocessors are always loaded: other cells call them directly,
# independently of the mode selected below.
initial_preprocessor = mlflow.pyfunc.load_model('models:/InitialTextPreprocessor/1')
noplaceholders_preprocessor = mlflow.pyfunc.load_model('models:/InitialAndRemovePlaceholdersTextPreprocessor/1')

# Which preprocessing to apply to all_df.Text: 'none' | 'initial' | 'noplaceholders'
text_preprocessing = 'noplaceholders'

orig_texts = all_df.Text
if text_preprocessing == 'initial':
    all_df['Text'] = initial_preprocessor.predict(orig_texts)
elif text_preprocessing == 'noplaceholders':
    all_df['Text'] = noplaceholders_preprocessor.predict(orig_texts)
elif text_preprocessing != 'none':
    raise ValueError("text_preprocessing must be 'none', 'initial' or 'noplaceholders', got " + repr(text_preprocessing))





In [None]:
import mltools
from datetime import datetime

def _dataset_metadata(preprocessing, description):
    """Build the metadata record attached to a saved dataset.

    Author and label-system fields are fixed; the date is today's date
    formatted as 'DD - MM - YYYY'.
    """
    record = {}
    record['author'] = 'Ondrej Gutten'
    record['date'] = datetime.today().strftime('%d - %m - %Y')
    record['preprocessing'] = preprocessing
    record['labels'] = 'Zredukovany ciselnik 03 - 02 - 2025'
    record['description'] = description
    return record


# Variant for the 2024 data with initial preprocessing (overwritten right below).
metadata = _dataset_metadata(
    'Initial',
    'Data pre SLSP model. Baliky: SLSP21a, SLSP22M (len zopar), SLSP23a. Zpracovane 02/2025',
)

# Variant for the merged 2024 + 2025 data without preprocessing; this is the one left in effect.
metadata = _dataset_metadata(
    'None',
    'Data pre SLSP model. Merged slsp_obtained24 a slsp_obtained25. Baliky: SLSP21a, SLSP22M (len zopar), SLSP23a, SLSP24DM. Zpracovane 02/2025',
)


In [None]:
# Relabel according to the new label system.
# Old code -> new code. Two old codes (9998, 7337) have no counterpart and
# their rows are removed instead.
relabel_map = {
    '7423': '5797',
    '7428': '5766',
    '9997': '7499',
    '9995': '7496',
    '7427': '5789',
    '9996': '7497',
    '7420': '7499',
}
labels_to_drop = ['7337', '9998']

# Applied one code at a time, in the listed order; no new code is itself an
# old code, so the order does not affect the result.
for old_label, new_label in relabel_map.items():
    all_df.loc[all_df['label'] == old_label, 'label'] = new_label

all_df = all_df[~all_df['label'].isin(labels_to_drop)]


In [None]:
# Split the combined data by package ('Balik').
balik_col = all_df['Balik']

# SLSP22M has only 4 rows, so it is pooled with SLSP21a.
slsp21_df = all_df[balik_col.isin(['SLSP21a', 'SLSP22M']).to_numpy()]
slsp23_df = all_df[balik_col == 'SLSP23a']
slsp24_df = all_df[balik_col == 'SLSP24DM']
slsp21_to_24_df = pd.concat([slsp21_df, slsp23_df, slsp24_df])

# Everything whose package name does not contain 'SLSP' counts as non-SLSP.
non_slsp_df = all_df[['SLSP' not in package for package in balik_col]]

slsp21_to_24_df.to_parquet('/Users/ondrejgutten/Work/PISI.nosync/data/data_podla_vendora/SLSP_21_to_24_obtained24_labels25.parquet')

In [None]:
import mltools
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_rf(train_df, test_df, *, random_state=None):
    """Train a TF-IDF + random-forest baseline and return its accuracy on test_df.

    Args:
        train_df: frame with a 'Text' column (inputs) and a 'label' column (targets)
            used for fitting.
        test_df: frame with the same two columns, used only for evaluation.
        random_state: forwarded to RandomForestClassifier. The default (None)
            keeps the previous unseeded behaviour; pass an int to make the
            forest, and therefore the reported accuracy, reproducible.
            NOTE(review): any randomness inside TF_IDF_Classifier itself is
            not controlled here - confirm if exact reproducibility is needed.

    Returns:
        Accuracy (float) of the fitted model's predictions on test_df.
    """
    rf = RandomForestClassifier(random_state=random_state)
    model = mltools.architecture.TF_IDF_Classifier(baseline_classifier=rf, model_name='rf')
    model.fit(train_df['Text'], train_df['label'])
    y_pred = model.predict(test_df['Text'])
    return accuracy_score(test_df['label'], y_pred)

In [None]:
# Drop labels that occur only once in the combined data.
# NOTE(review): in the cell order shown, the per-package frames used below
# (non_slsp_df, slsp21_df, ...) were derived from all_df before this filter,
# so it only affects the final 'all' split - confirm this is intended.
all_df = mltools.data.drop_classes_with_1_member(all_df,'label')

# Training sets that grow by one SLSP package at a time.
train_non_21 = pd.concat([non_slsp_df, slsp21_df])
train_non_21_23 = pd.concat([non_slsp_df, slsp21_df, slsp23_df])

# (result key, training frame, test frame); evaluated top to bottom.
experiments = [
    ('non_VS_21', non_slsp_df, slsp21_df),
    ('non_VS_23', non_slsp_df, slsp23_df),
    ('non_VS_24', non_slsp_df, slsp24_df),
    ('non_VS_SLSP', non_slsp_df, pd.concat([slsp21_df, slsp23_df, slsp24_df])),
    ('non21_VS_23', train_non_21, slsp23_df),
    ('non21_VS_24', train_non_21, slsp24_df),
    ('non21_VS_SLSP', train_non_21, pd.concat([slsp23_df, slsp24_df])),
    ('non2123_VS_24', train_non_21_23, slsp24_df),
]
overview = {key: train_rf(fit_df, eval_df) for key, fit_df, eval_df in experiments}

# Reference point: stratified random split over everything.
train_df, test_df = mltools.data.pandas_split_categorical_data(all_df, stratified=True, target_column='label')
overview['all'] = train_rf(train_df, test_df)

print(overview)

# non_VS_23 0.05118565644881434
# non_VS_24 0.6438081695966908
# non_VS_SLSP 0.4553230793944018
# non21_VS_23 0.8720358588779641
# non21_VS_24 0.6790977249224406
# non21_VS_SLSP 0.7207655887082366
# non2123_VS_24 0.6761892450879007 # many 'errors' are marked as 5766 in SLSP24DM but are actually relevant types (5796/7426/etc.)
# all 0.9215908019979266 # errors are inherent to different labelings

# non vs SLSP does not work at all
# at least SLSP21 for somewhat decent results
# 'all' is the best

In [None]:
# merge with new data
vendor_data_path = '/Users/ondrejgutten/Work/PISI.nosync/data/data_podla_vendora/'

# --- 1) Data obtained in 2025: load, derive 'label', store three text variants ---
slsp_obtained25_labels25_df = pd.read_excel(vendor_data_path + 'SLSP_zredukovany_ciselnik_02_2025.xlsx')
# NOTE(review): astype(str) may turn missing values into the string 'nan',
# which the NaN-based fallback further below would not treat as missing - confirm.
slsp_obtained25_labels25_df['label'] = slsp_obtained25_labels25_df['zredukovaný číselník'].astype(str)

slsp_obtained25_labels25_df.to_parquet(vendor_data_path + 'SLSP_21_to_23_obtained25_labels25_no_preprocessing.parquet')
# Both preprocessors are applied to the raw texts, not to each other's output.
orig_texts = slsp_obtained25_labels25_df.Text
slsp_obtained25_labels25_df['Text'] = initial_preprocessor.predict(orig_texts)
slsp_obtained25_labels25_df.to_parquet(vendor_data_path + 'SLSP_21_to_23_obtained25_labels25_initial_preprocessing.parquet')
slsp_obtained25_labels25_df['Text'] = noplaceholders_preprocessor.predict(orig_texts)
slsp_obtained25_labels25_df.to_parquet(vendor_data_path + 'SLSP_21_to_23_obtained25_labels25_noplaceholders_preprocessing.parquet')


# --- 2) Merge the 2024 and 2025 datasets (no-placeholders variants) ---
slsp_obtained25_labels25_df = pd.read_parquet(vendor_data_path + 'SLSP_21_to_23_obtained25_labels25_noplaceholders_preprocessing.parquet')
# NOTE(review): no visible cell writes a file under exactly this name - verify it exists.
slsp_obtained24_labels25_df = pd.read_parquet(vendor_data_path + 'SLSP_21_to_24_obtained24_labels25_noplaceholders_preprocessing.parquet')

merge_keys = ['CisloPohladavky', 'Balik', 'NazovSuboru']

# Fill missing key values in the 2024 frame with the literal 'NA'.
# Done with a single .loc[mask, column] assignment: the chained form
# (df.column.loc[mask] = ...) assigns through an intermediate object and is
# not guaranteed to modify the frame.
for key_column in ['CisloPohladavky', 'NazovSuboru']:
    missing_key = slsp_obtained24_labels25_df[key_column].isna()
    slsp_obtained24_labels25_df.loc[missing_key, key_column] = 'NA'

slsp_merged_df = pd.merge(left=slsp_obtained24_labels25_df, right=slsp_obtained25_labels25_df, how='outer', on=merge_keys)

# Prefer the 2025 values (suffix _y); fall back to 2024 (suffix _x) where the
# 2025 value is missing. Again a single .loc assignment per column.
slsp_merged_df['label'] = slsp_merged_df['label_y']
missing_label = slsp_merged_df['label'].isna()
slsp_merged_df.loc[missing_label, 'label'] = slsp_merged_df.loc[missing_label, 'label_x']

slsp_merged_df['Text'] = slsp_merged_df['Text_y']
missing_text = slsp_merged_df['Text'].isna()
slsp_merged_df.loc[missing_text, 'Text'] = slsp_merged_df.loc[missing_text, 'Text_x']

# Keep only the key columns plus the resolved label/text, then de-duplicate.
slsp_merged_df = slsp_merged_df[merge_keys + ['label', 'Text']]
slsp_merged_df = slsp_merged_df.drop_duplicates()

mydf = mltools.utils.attach_metadata_to_pandas_dataframe(slsp_merged_df, metadata)
# NOTE(review): the inputs read above are the '_noplaceholders_preprocessing'
# files, while the output name says '_no_preprocessing' - confirm the file
# name matches the data actually written.
mydf.to_parquet(vendor_data_path + 'SLSP_21_to_24_obtained24_and_25_and_merged_labels25_no_preprocessing.parquet')


In [None]:

# Load the merged dataset. A copy without SLSP24DM is also derived, but the
# full dataset is the one used from here on.
all_data_df = pd.read_parquet('/Users/ondrejgutten/Work/PISI.nosync/data/data_podla_vendora/SLSP_21_to_24_obtained24_and_25_and_merged_labels25_noplaceholders_preprocessing.parquet')
without_slsp24dm_df = all_data_df[all_data_df['Balik'] != 'SLSP24DM']
mydf = all_data_df

# prep train_df and test_df
# NOTE(review): the target column is picked by position (second column),
# whereas the evaluation-grid cell passes 'label' by name - confirm that
# columns[1] is really the intended target here.
target_col = mydf.columns[1]
mydf = mltools.data.drop_classes_with_1_member(mydf, target_column=target_col)
train_df, test_df = mltools.data.pandas_split_categorical_data(mydf)

train_X, train_y = train_df.Text, train_df.label
test_X, test_y = test_df.Text, test_df.label

In [None]:
import mltools

from sklearn.ensemble import RandomForestClassifier
# Fit the TF-IDF + random-forest baseline on the training split.
rf = RandomForestClassifier()
model = mltools.architecture.TF_IDF_Classifier(
    model_name='rf',
    baseline_classifier=rf,
)
model.fit(train_X, train_y)


In [None]:
from sklearn.metrics import accuracy_score
# Predictions as a plain array so that boolean masks index them by position.
y_pred = np.asarray(model.predict(test_X))

# Accuracy on the whole test split. Stored and printed explicitly: a bare
# expression in the middle of a cell is evaluated and then thrown away.
acc_all = accuracy_score(test_y, y_pred)

# Accuracy restricted to test rows that are not from the SLSP24DM package.
non_dm_mask = (test_df.Balik != 'SLSP24DM').to_numpy()
test_y_nonDM = test_y[non_dm_mask]
y_pred_nonDM = y_pred[non_dm_mask]
acc_nonDM = accuracy_score(test_y_nonDM, y_pred_nonDM)

print({'all_test_data': acc_all, 'nonDM_test_data': acc_nonDM})

# including SLSP24DM on all test data - 0.965312881296845
# including SLSP24DM on nonDM test_data - 0.996272630457934
# without SLSP24DM - 0.9989434759640782

In [None]:
# Production data: texts that still have to be classified.
target_df = pd.read_parquet(
    '/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/SLSP24abc_to_classify.parquet'
)
# Raw texts as an array, plus the same texts after no-placeholders preprocessing.
orig_texts = target_df['Text'].to_numpy()
prep_texts = np.array(noplaceholders_preprocessor.predict(target_df['Text']))

In [None]:
# cluster target data
from sklearn.cluster import DBSCAN
# Vectorizer step taken from the fitted classifier (presumably its TF-IDF
# stage - verify against mltools), so the target texts are embedded with the
# vocabulary the model was fitted on.
vectorizer = model.clf.estimator['vect']

# First pass: cluster the preprocessed target texts by cosine distance.
# Only the preprocessed texts are vectorized; transforming the raw texts
# first and overwriting the result was wasted work.
dbscan = DBSCAN(eps=0.12, min_samples=3, metric='cosine')
vects = vectorizer.transform(prep_texts)
clusters = dbscan.fit_predict(vects)

# Second pass: re-cluster only the points the first pass marked as noise (-1).
# Uses its own estimator (dbscan1) so the fitted state of `dbscan` keeps
# describing `clusters`; previously `dbscan` itself was refitted here.
# NOTE(review): with the same eps/min_samples as the first pass, a noise point
# cannot gain neighbours inside the noise-only subset, so this pass is expected
# to label everything -1 again - consider a larger eps or smaller min_samples.
dbscan1 = DBSCAN(eps=0.12, min_samples=3, metric='cosine')
vects1 = vects[clusters == -1]
if vects1.shape[0] > 0:
    clusters1 = dbscan1.fit_predict(vects1)
else:
    # No noise points: nothing to re-cluster (fitting on zero samples would raise).
    clusters1 = np.array([], dtype=int)



In [None]:
# Additional data (1301 samples)
add_df = pd.read_excel('/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/SLSP24abc_nezatriedene.xlsx')
# Keep only rows flagged for processing. .copy() makes the result an
# independent frame, so adding a column below does not assign into a filtered
# slice of the original (which triggers pandas' SettingWithCopyWarning).
add_df = add_df[add_df['spracovať'] == 1].copy()

# Preprocessed text is stored next to the raw text rather than replacing it.
add_df['PreppedText'] = noplaceholders_preprocessor.predict(add_df['Text'])