In [None]:
import numpy as np
import pandas as pd
import mlflow
import mltools

# Point the MLflow client at the tracking server that hosts the registered model loaded below.
mlflow.set_tracking_uri('http://localhost:5001')

# production data
# NOTE(review): absolute, user-specific paths; the notebook will not run elsewhere without editing them.
target_df = pd.read_parquet('/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/SLSP24abc_to_classify.parquet')
target_noplaceholders_df = pd.read_parquet('/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/SLSP24abc_to_classify_noplaceholders_preprocessing.parquet')

# `orig_texts` comes from the first parquet file, `texts` from the "noplaceholders_preprocessing" one.
# NOTE(review): later cells index both with the same cluster masks, so they are presumably row-aligned — confirm.
orig_texts = np.array(target_df.Text)
texts = np.array(target_noplaceholders_df.Text)

# Load version 1 of the registered model and unwrap the underlying model object.
# NOTE(review): `_PyFuncModel__model_impl` is a name-mangled private attribute; this access may break across MLflow versions.
pyfunc_model = mlflow.pyfunc.load_model('models:/SLSP_Spisy_naked_model/1')
model = pyfunc_model._PyFuncModel__model_impl.python_model.model
# Vectorise the original texts with the 'vect' step of the model's estimator.
vects = model.clf.estimator['vect'].transform(orig_texts)


# Load previously saved clustering results instead of re-running the clustering.
# NOTE(review): the file name suggests DBSCAN with eps=0.12 / min_samples=3; that cannot be verified from this cell.
cluster_analysis_df = pd.read_csv('data/SLSP/DBSCAN_clustering_eps0.12_minsamples3_noplaceholders_texts_SLSP24abc.csv')
# `preds` and `clusters` are pandas Series here (columns of the CSV), not NumPy arrays.
preds = cluster_analysis_df['predictions']
clusters = cluster_analysis_df['clusters']

In [None]:
# cluster target data
# Recomputes `vects`, `clusters`, `orig_texts`, `texts` and `preds` from scratch,
# overwriting any values of the same names already present in the kernel.
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.12, min_samples=3, metric = 'cosine')
vects = model.clf.estimator['vect'].transform(target_df.Text)
# `fit_predict` output is assigned as-is; later cells treat the label -1 as "unclustered".
clusters = dbscan.fit_predict(vects)

# NOTE(review): `orig_texts` is a pandas Series here (no np.array / to_numpy conversion),
# so integer indexing on it is label-based rather than positional.
orig_texts = target_df.Text
# NOTE(review): `noplaceholders_preprocessor` is not defined in any visible cell; this line
# raises NameError on a fresh kernel unless it is created elsewhere.
texts = noplaceholders_preprocessor.predict(orig_texts)
preds = model.predict(texts)

In [None]:
# expected global variables:
#   preds
#   clusters
#   model
#   texts
#   orig_texts

from collections import Counter
from scipy.stats import entropy

def entropy1(labels, base=None):
    """Entropy of the empirical distribution of the values in `labels`.

    The occurrences of each distinct label are counted and the counts are
    handed to `scipy.stats.entropy`; `base` is forwarded unchanged.
    """
    label_counts = np.unique(labels, return_counts=True)[1]
    return entropy(label_counts, base=base)

def cluster_analysis(cluster_number : int):
    """Print and return summary statistics for one cluster.

    Reads the notebook globals `preds`, `clusters`, `model` and `texts`.

    Prints the label counts of the cluster's predictions, their entropy and
    the 1 / 0.9 / 0.1 / 0 quantiles of the per-sample maximum predicted
    probability.

    Returns:
        tuple: (cluster size, Counter of predicted labels, entropy of the
        predicted labels, largest max-probability, smallest max-probability).
    """
    in_cluster = clusters == cluster_number
    # Per-sample confidence: the highest class probability the model assigns.
    confidence = np.max(model.predict_proba(texts[in_cluster]), axis = 1)

    cluster_preds = preds[in_cluster]
    label_counts = Counter(cluster_preds)
    print(label_counts)
    label_entropy = entropy1(cluster_preds)
    print(label_entropy)

    print("Max proba quantiles (1 - 0.9 - 0.1 - 0) :")
    q_max, q_90, q_10, q_min = (np.quantile(confidence, q) for q in (1, 0.9, 0.1, 0))
    print(q_max, q_90, q_10, q_min)

    return (sum(in_cluster), label_counts, label_entropy, q_max, q_min)

def cluster_inspection(cluster_number : int, show_texts : bool = True):
    """Print the two most probable labels for every sample of one cluster.

    Reads the notebook globals `texts`, `orig_texts`, `clusters` and `model`.

    Args:
        cluster_number: Cluster label whose members are inspected.
        show_texts: When True, each sample's original text is printed before
            its probabilities. When False, only the first original text of
            the cluster is printed, after all probabilities.

    For each sample the top-2 probabilities and the matching entries of
    `model.known_labels` are printed. Nothing is returned.
    """
    in_cluster = clusters == cluster_number
    probas = model.predict_proba(texts[in_cluster])

    # Column indices of the two largest probabilities per row, best first.
    top2_indices = np.argsort(-probas)[:, :2]
    top2_probas = probas[np.arange(probas.shape[0])[:, None], top2_indices]
    top2_labels = model.known_labels[top2_indices]

    # Convert to a plain array so the indexing below is positional. When
    # `orig_texts` is a pandas Series, `series[0]` / `series[idx]` would be
    # label lookups and fail (or pick the wrong row) for a filtered Series.
    cluster_orig_texts = np.asarray(orig_texts[in_cluster])

    for text, sample_probas, sample_labels in zip(cluster_orig_texts, top2_probas, top2_labels):
        if show_texts:
            print(text)
        print(sample_probas)
        print(sample_labels)

    # Show one representative text; skipped for an empty cluster.
    if not show_texts and len(cluster_orig_texts) > 0:
        print(cluster_orig_texts[0])

def maximum_within_cluster_distance(cluster_number : int):
    """Largest pairwise cosine distance between members of one cluster.

    Reads the notebook globals `vects` (one row per sample) and `clusters`.

    Args:
        cluster_number: Cluster label whose members are compared.

    Returns:
        The maximum of ``1 - cosine_similarity`` over all distinct pairs of
        members, floored at 0 (a single-member cluster yields 0).

    Raises:
        ValueError: If no sample carries `cluster_number`.
    """
    # Imported here because no module-level import of cosine_similarity is
    # visible in the notebook; without it the call raises NameError.
    from sklearn.metrics.pairwise import cosine_similarity

    subset = vects[clusters == cluster_number]
    if subset.shape[0] == 0:
        raise ValueError(f"cluster {cluster_number} has no members")

    # One vectorised call instead of a Python double loop that assigned a
    # 1x1 result array into each scalar slot.
    pairwise_distances = 1 - cosine_similarity(subset)

    # Keep only the strictly lower triangle (distinct pairs); everything else
    # becomes 0, which also floors the result at 0 as the loop version did.
    return np.max(np.tril(pairwise_distances, k=-1))
                



In [None]:
# analysis of heterogeneous clusters
# cluster 22
# Re-cluster the members of a single cluster to look for sub-structure.
from sklearn.cluster import DBSCAN

i = 22

# Keyed by parent cluster number, so results for several clusters could be kept side by side.
dbscan_models = {}
subset_clusters = {}

# NOTE(review): no `metric` is passed here, so scikit-learn's default is used, whereas the
# top-level clustering cell passes metric='cosine' — confirm this difference is intended.
dbscan_models[i] = DBSCAN(eps = 0.2, min_samples=3)
subset_texts = orig_texts[clusters == i]
subset_vects = vects[clusters == i]
subset_clusters[i] = dbscan_models[i].fit_predict(subset_vects)

# Display the original texts that fall into sub-cluster 0 of cluster `i`.
orig_texts[clusters == i][subset_clusters[i] == 0]
# cluster16
# cluster133

In [None]:
def cluster_number_to_idxs(cluster_number : int):
    """Positions in the global `clusters` whose label equals `cluster_number`."""
    in_cluster = clusters == cluster_number
    positions = np.where(in_cluster)
    return positions[0]

def cluster_subset_to_idxs(cluster_number : int, subset_index : list[bool]):
    """Map a selection made inside one cluster back to positions in the full data.

    Reads the notebook global `clusters`.

    Args:
        cluster_number: Cluster label the selection refers to.
        subset_index: Index applied to the cluster's member positions
            (typically a boolean mask with one entry per member).

    Returns:
        numpy.ndarray: Positions in `clusters` of the selected members.
    """
    subset_index = np.asarray(subset_index)
    # np.asarray accepts both a pandas Series (clusters loaded from CSV) and
    # the ndarray returned by DBSCAN.fit_predict; `.to_numpy()` only exists
    # on the former.
    member_positions = np.where(np.asarray(clusters) == cluster_number)[0]
    return member_positions[subset_index]

In [None]:
def parallel_map(func, data, num_workers=8, **kwargs):
    """Runs a function in parallel using multiprocessing.Pool with fork.

    Args:
        func: Callable applied to each item; it receives the item as its
            positional argument plus `kwargs`.
        data: Iterable of items to process.
        num_workers: Number of worker processes in the pool.
        **kwargs: Extra keyword arguments bound to `func` for every call.

    Returns:
        list: Results of `func`, in the order of `data`.

    Note:
        The "fork" start method is set with ``force=True``, which changes the
        start method for the whole process, not just for this call.
    """
    import functools
    import multiprocessing

    try:
        multiprocessing.set_start_method("fork", force = True)
    except RuntimeError:
        # Best effort: continue with whatever start method is already active.
        pass

    func_with_kwargs = functools.partial(func, **kwargs)

    with multiprocessing.Pool(processes=num_workers) as pool:
        results = pool.map(func_with_kwargs, data)
    return results

In [None]:
# find the closest sample from one of the clustered samples
# NOTE(review): `idx` and `cosine_similarity` are not defined at notebook level in any visible
# cell (cosine_similarity is only imported inside functions), so this cell depends on kernel
# state left over from elsewhere.
#def find_closest_clustered_sample(vects: np.ndarray, idx: int, clusters: list[int]):
# Similarity of sample `idx` to every sample whose cluster label is not -1.
similarities = cosine_similarity(vects[idx], vects[clusters != -1])[0]
# Position of the best match *within the clustered subset*, not within the full data.
max_idx = similarities.argmax()
print(similarities[max_idx])
print(clusters[clusters != -1][max_idx])
print(orig_texts[idx])
print(orig_texts[clusters != -1][max_idx])


def sims(vects, clusters, idx):
    """Highest cosine similarity between sample `idx` and any clustered sample.

    "Clustered" means every row of `vects` whose entry in `clusters` is not -1.
    """
    clustered_rows = vects[clusters != -1]
    scores = cosine_similarity(vects[idx], clustered_rows)[0]
    return scores[scores.argmax()]


In [None]:
# inspect similarity of unclustered samples to clustered samples using cosine_similarity + fuzz similarity

def max_cosine_similarity(idx, cluster_number):
    """Best cosine match for sample `idx` among the members of one cluster.

    Reads the notebook globals `vects` and `clusters`.

    Returns:
        tuple: (highest cosine similarity, position in the full data of the
        cluster member that achieves it).
    """
    from sklearn.metrics.pairwise import cosine_similarity

    scores = cosine_similarity(vects[idx], vects[clusters == cluster_number])
    best_score = np.max(scores)
    # Translate the position inside the cluster back to a position in the full data.
    member_positions = np.where(clusters == cluster_number)[0]
    return best_score, member_positions[np.argmax(scores)]

def max_fuzz_similarity(idx, cluster_number, texts_to_use = None):
    """Best `fuzz.token_sort_ratio` match for sample `idx` within one cluster.

    Reads the notebook globals `clusters` and, when `texts_to_use` is None,
    `orig_texts`.

    Returns:
        tuple: (highest token_sort_ratio, position in the full data of the
        cluster member that achieves it).
    """
    from rapidfuzz import fuzz

    corpus = texts_to_use
    if corpus is None:
        corpus = orig_texts

    scores = [
        fuzz.token_sort_ratio(corpus[idx], candidate)
        for candidate in corpus[clusters == cluster_number]
    ]
    best_score = np.max(scores)
    # Translate the position inside the cluster back to a position in the full data.
    member_positions = np.where(clusters == cluster_number)[0]
    return best_score, member_positions[np.argmax(scores)]

def best_cluster_match(idx, metric = 'cosine', texts_to_use = None):
    """Find the cluster most similar to sample `idx`.

    Reads the notebook global `clusters`.

    Args:
        idx: Position of the sample.
        metric: 'cosine' scores with `max_cosine_similarity`; any other value
            scores with `max_fuzz_similarity`.
        texts_to_use: Forwarded to `max_fuzz_similarity` (unused for 'cosine').

    Returns:
        If the sample already has a cluster label other than -1, that label
        alone. Otherwise a tuple (best score, cluster number with the best
        score, array of the best score per cluster 0..max label).
        Note the two return shapes differ.
    """
    if clusters[idx] != -1:
        return clusters[idx]

    highest_cluster = max(np.unique(clusters))

    if metric == 'cosine':
        scorer = max_cosine_similarity
    else:
        def scorer(sample_idx, cluster_number):
            return max_fuzz_similarity(sample_idx, cluster_number, texts_to_use)

    per_cluster = np.array([scorer(idx, c)[0] for c in range(highest_cluster + 1)])
    return np.max(per_cluster), np.argmax(per_cluster), per_cluster

def new_max_cosine_similarity(text, cluster_number):
    """Best cosine match for a raw `text` among the members of one cluster.

    The text is vectorised with the 'vect' step of the global `model`, then
    compared with the rows of the global `vects` that belong to the cluster
    according to the global `clusters`.

    Returns:
        tuple: (highest cosine similarity, position in the full data of the
        cluster member that achieves it).
    """
    from sklearn.metrics.pairwise import cosine_similarity

    text_vector = model.clf.estimator['vect'].transform([text])
    scores = cosine_similarity(text_vector, vects[clusters == cluster_number])
    best_score = np.max(scores)
    # Translate the position inside the cluster back to a position in the full data.
    member_positions = np.where(clusters == cluster_number)[0]
    return best_score, member_positions[np.argmax(scores)]

def new_max_fuzz_similarity(text, cluster_number, texts_to_use = None):
    """Best `fuzz.token_sort_ratio` match for a raw `text` within one cluster.

    Reads the notebook globals `clusters` and, when `texts_to_use` is None,
    `orig_texts`.

    Returns:
        tuple: (highest token_sort_ratio, position in the full data of the
        cluster member that achieves it).
    """
    from rapidfuzz import fuzz

    corpus = texts_to_use
    if corpus is None:
        corpus = orig_texts

    scores = [
        fuzz.token_sort_ratio(text, candidate)
        for candidate in corpus[clusters == cluster_number]
    ]
    best_score = np.max(scores)
    # Translate the position inside the cluster back to a position in the full data.
    member_positions = np.where(clusters == cluster_number)[0]
    return best_score, member_positions[np.argmax(scores)]

def new_best_cluster_match(text, metric = 'cosine', texts_to_use = None):
    """Find the cluster most similar to a raw `text`.

    Reads the notebook global `clusters`.

    Args:
        text: Text to match against the clusters.
        metric: 'cosine' scores with `new_max_cosine_similarity`; any other
            value scores with `new_max_fuzz_similarity`.
        texts_to_use: Forwarded to `new_max_fuzz_similarity` (unused for 'cosine').

    Returns:
        tuple: (best score, cluster number with the best score, array of the
        best score per cluster 0..max label).
    """
    highest_cluster = max(np.unique(clusters))

    if metric == 'cosine':
        scorer = new_max_cosine_similarity
    else:
        def scorer(candidate_text, cluster_number):
            return new_max_fuzz_similarity(candidate_text, cluster_number, texts_to_use)

    per_cluster = np.array([scorer(text, c)[0] for c in range(highest_cluster + 1)])
    return np.max(per_cluster), np.argmax(per_cluster), per_cluster

def new_parallel_best_cluster_match(text, metric = 'cosine', texts_to_use = None):
    """Find the cluster most similar to a raw `text`.

    This function performs no parallel work itself; its body was a verbatim
    copy of `new_best_cluster_match`, so it now delegates to it to keep a
    single implementation. The name is kept for existing callers.

    Args:
        text: Text to match against the clusters.
        metric: 'cosine' or any other value for the fuzz-based score.
        texts_to_use: Texts used by the fuzz-based score.

    Returns:
        tuple: (best score, cluster number with the best score, array of the
        best score per cluster).
    """
    return new_best_cluster_match(text, metric=metric, texts_to_use=texts_to_use)


In [None]:
# TL;DR - all samples predicted as 7499 on orig_texts, or predicted as 7499 on prep_texts, are correctly labelled 7499, with the following exceptions:

# EXCEPTIONS to 7499. All other 7499 samples in cluster -1 are correctly labelled 7499
# among the predictions on orig_texts there are these 2 exceptions:
# idx 37324
'POVOLENIE NÁ POBYŤ O RNS255036 Kód zdravotnej poisťovne: 25 " MEHMET GURÚ PISKÍN ROd"šČŠIŠ ž&äoängéžlĺ\\i\\ňkačné číslo poistenca: Dam"ä\'šuj%ežŕ,t 983 :, . Preukazplatnýcc — 01,10,2014 J DÁTUM A MIESTO NARODENIA, D3.12.1983 "TUR, ERUH: POHLAVIE ŠTÁTNA PRÍSLUŠNOSŤ M TUREGCKO POZNÁMKY IRSVKRN525503628312039241<<<<< 8312033M1909225TUR<<<<<<<<<<<5 PISKIN<<MEHMET<GU_ERUE<<<<<<<<< k š > ZÁZNAMY POISŤOVNE: Preukaz číslo: 25-030100963 Plati do: Kôd poisťovne - pobočky: 25 - 03 Podpis a odtlačok pečiatky',

#idx 36599
"Dodatok k PRACOVNEJ ZMLUVE Dohoda o zmene pracovnej zmluvy Zamestnávateľ : [Firma] Potravinárska č.6,[PSC]Rimavská Sobota Zastúpený: Ing. [Meno Priezvisko], predseda predstavenstva a generálny riaditeľ, Ing. [Meno Priezvisko], PhD., finančný riaditeľ a Pán (ďalej zamestnanec: [Meno Priezvisko] narodený [DatumNarodenia] [RodneCislo] číslo OP: EU463988 trvalé bydlisko[UlicaCislo] [Mesto]uzatvorili túto dohodu o zmene pracovnej zmluvy: 1. Písomná pracovná zmluvá uzavretá medzi zamestnancom a zamestnávateľom dňa: 01.01.2021 sa mení dňom 091.07.2021 takto: 2. Pracovný pomer na dobu určitú do 30.06.2021 sa mení na dobu určitú do 31.12.2021. Ostatné podmienky dohodnuté v pracovnej zmluve ostávajú nezmenné. U Rimavská Sobota, dňa 10.06.2021 TAURIS, as ' Potravinárska 6 á :[PSC]Rimäská Sobola — d / [/ —— pečiatka a podpis |(zamieštnávateľa Ť - podpis zamestnanca . Strana 1z1",

# among the predictions on prep_texts that are not predicted as 7499 on orig_texts there is this 1 exception:
# idx 37362
'ÚRAD PRÁCE, [Meno Priezvisko] A RODINY [Meno Priezvisko] Farbiarska 57, [Mesto] Potvrdenie o poberaní rodičovského príspevku za rok 2022 Meno a priezvisko: [Meno Priezvisko] Dátum narodenia: [DatumNarodenia] Adresa:[UlicaCislo] [PSC] [Meno Priezvisko] Dávky za jednotlivé mesiace Mesiac Platba v eurách Január 383,80 Február — 383,80 Marec — - j W Ap P 383,80 Máj 383,80 Jún 383,80 Júl 383,80 [Meno Priezvisko])(í | September - o W Október 0,00 | November ob December | 0,00 Suma za celé obdobie: 2 686,60 SOCIÁLNYCH MEÍ A áž SOSVČTARÁ UUBOVŇA Farbiarska ŠT [PSC] [Meno Priezvisko] V Starej Ľubovni 11.07.2022 | O / ot S Podpis zodpovedn%o pracovníka Vybavuje: [Meno Priezvisko], Mgr. PhD.',

In [None]:
# inspect similarity of unclustered samples to clustered samples using cosine_similarity + fuzz similarity
idxs1 = np.where(clusters == -1)[0]
mydf = pd.DataFrame(columns = ['max_cosine_similarity','similar_cosine_cluster','model_cosine_prediction','max_fuzz_similarity','similar_fuzz_cluster','model_fuzz_prediction'], index = idxs1)



In [None]:
from rapidfuzz import fuzz
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

# NOTE(review): absolute, user-specific path; this reloads the same parquet file as the first cell.
target_df = pd.read_parquet('/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/SLSP24abc_to_classify.parquet')
orig_texts = target_df.Text.to_numpy()
# Keep only the first 1000 characters of every text.
chopped_texts = np.array([text[:1000] for text in orig_texts])
def fuzz_similarity(sample_1, sample_2):
    """Return rapidfuzz's `fuzz.token_sort_ratio` for the two samples."""
    return fuzz.token_sort_ratio(sample_1, sample_2)

import numpy as np
from joblib import Parallel, delayed

def generate_indices(n):
    """Lazily yield every index pair (i, j) with 0 <= i < j < n, row by row."""
    for first in range(n):
        yield from ((first, second) for second in range(first + 1, n))

def compute_distance(x, y):
    """Score a pair of samples by delegating to `fuzz_similarity`.

    Despite the name, this is not a Euclidean distance: the value is whatever
    `fuzz_similarity(x, y)` returns, i.e. a similarity score.
    """
    return fuzz_similarity(x, y)

def parallel_distance_matrix(data, num_workers=4, batch_size=5000):
    """
    Fill a symmetric n x n matrix with `compute_distance` for every pair of items.

    Index pairs come from `generate_indices` and are evaluated in batches with
    joblib, so the full list of pairs is never held in memory at once.

    - `data`: Indexable collection of samples.
    - `num_workers`: Number of parallel jobs.
    - `batch_size`: Number of (i, j) pairs processed in one batch.

    Returns the matrix; the diagonal is left at 0.
    """
    n = len(data)
    dist_matrix = np.zeros((n, n))

    def _score_and_store(pairs):
        # Evaluate one batch of pairs in parallel and mirror each score
        # into both halves of the matrix.
        scores = Parallel(n_jobs=num_workers, backend="loky")(
            delayed(compute_distance)(data[a], data[b]) for a, b in pairs
        )
        for (a, b), score in zip(pairs, scores):
            dist_matrix[a, b] = score
            dist_matrix[b, a] = score

    print("Number of pairs: ", n * (n - 1) // 2)

    pending = []
    for pair in generate_indices(n):
        pending.append(pair)
        if len(pending) < batch_size:
            continue
        # A full batch has accumulated: process it and start a new one.
        print('Processing batch starting with', pending[0])
        _score_and_store(pending)
        pending = []

    # Leftover pairs that did not fill a complete batch.
    if pending:
        _score_and_store(pending)

    return dist_matrix

# Compute the pairwise matrix over the truncated texts.
# NOTE(review): the result is a dense len(data) x len(data) float matrix, and every pair is
# scored; expect this cell to be slow and memory-hungry on the full data set.
data = chopped_texts  # texts truncated to their first 1000 characters
dist_matrix = parallel_distance_matrix(data, num_workers=12, batch_size=100000)
print(dist_matrix)
print("Distance matrix computation complete.")

In [None]:
# `joblib.load` is used below, but the visible cells only run
# `from joblib import Parallel, delayed`, which does not bind the name `joblib`.
import joblib

# group_clusters_to_labels - read from excel sheet - group cluster analysis
# NOTE(review): clipboard input is not reproducible. This cell also reads the clipboard twice
# (here and for excel_labels), so running it top to bottom gives both reads the same content;
# the two statements presumably have to be executed separately with different clipboards.
group_clusters_to_labels = pd.read_clipboard(sep = ';')

# group_clusters_to_idxs
# NOTE(review): joblib.load unpickles the file; only load files from a trusted source.
group_clusters_to_idxs = joblib.load('/Users/ondrejgutten/Work/PISI.nosync/data/SLSP/group_clusters_to_idxs.pickle')

# excel_labels - semi-manual assignment of labels in excel sheet
excel_labels = pd.read_clipboard()
excel_labels.columns = ['similar_cosine_cluster','prelabel']
excel_labels['label'] = excel_labels['prelabel']
# Rows marked 'max_cosine_sim_cluster' get the label 'cluster_<similar_cosine_cluster>' instead.
excel_labels.loc[excel_labels['prelabel'] == 'max_cosine_sim_cluster', 'label'] = ['cluster_' + str(x) for x in excel_labels.loc[excel_labels['prelabel'] == 'max_cosine_sim_cluster', 'similar_cosine_cluster']]

def cluster_number_to_cluster_group_idx(cluster_number):
    """Index of the entry of the global `cluster_groups` that contains `cluster_number`.

    Returns:
        The index of the single containing group, or None if no group
        contains the cluster number.

    Raises:
        ValueError: If more than one group contains the cluster number.
    """
    membership = [cluster_number in group for group in cluster_groups]
    hits = np.where(membership)[0]
    n_hits = hits.shape[0]
    if n_hits > 1:
        raise ValueError('Multiple clusters found')
    return hits[0] if n_hits == 1 else None

def idx_to_cluster_group_idx(idx):
    """Index of the entry of the global `group_clusters_to_idxs` that contains `idx`.

    Returns:
        The index of the single containing group, or None if no group
        contains the sample index.

    Raises:
        ValueError: If more than one group contains the sample index.
    """
    membership = [idx in members for members in group_clusters_to_idxs]
    hits = np.where(membership)[0]
    n_hits = hits.shape[0]
    if n_hits > 1:
        raise ValueError('Multiple clusters found')
    return hits[0] if n_hits == 1 else None

def idx_to_final_label(idx):
    """Resolve the final label of sample `idx`.

    Lookup order:
      1. If `idx` belongs to a cluster group (`idx_to_cluster_group_idx`),
         return that group's entry of `group_clusters_to_labels`.
      2. Otherwise read `excel_labels.loc[idx, 'label']`. If it contains
         'cluster' (form 'cluster_X'), return the `group_clusters_to_labels`
         entry of the group that contains cluster X.
      3. Otherwise return the excel label itself.

    NOTE(review): in this notebook `group_clusters_to_labels` is read as a
    DataFrame, so `[...]` selects a column by label — confirm that is the
    intended lookup. A cluster X that belongs to no group makes the lookup
    key None.
    """
    group_idx = idx_to_cluster_group_idx(idx)
    if group_idx is not None:
        return group_clusters_to_labels[group_idx]

    label = excel_labels.loc[idx, 'label']
    if 'cluster' not in label:
        return label

    # 'cluster_X' -> X
    referenced_cluster = int(label.split('_')[1])
    referenced_group = cluster_number_to_cluster_group_idx(referenced_cluster)
    return group_clusters_to_labels[referenced_group]