# Generating Similiarity Matrixes

In [15]:
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from collections import Counter
from statistics import mean
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
import sys
sys.path.append('../../src')
from DataHandler import DataHandler

In [17]:
dataHandler = DataHandler("../config.yaml")

In [18]:
df_ddc_labels = dataHandler.load_data("data_files.processed_data.keywords.df_ddc")
df_ddc_labels.head(3)

📂 Lade: ddc_keywords.p
   ✅ Geladen: 4,548 Zeilen × 7 Spalten


Unnamed: 0,pipe:ID,ddc_d2_label,ddc_d3_label,ddc_combined_label,ddc_d2_count,ddc_d3_count,ddc_total_count
0,8I6sM5zapD60,"[511, 6]","[526, 912]","[511, 6, 526, 912]",2,2,4
1,8ZICOHBmAHyQ,"[620, 624, 620, 658]","[624, 618, 531, 531, 531, 621]","[620, 624, 620, 658, 624, 618, 531, 531, 531, ...",4,6,10
2,8Lfz8SAKa6k0,"[133, 338, 6, 6, 658, 6, 153]","[338, 153, 205, 6, 519, 6, 6]","[133, 338, 6, 6, 658, 6, 153, 338, 153, 205, 6...",7,7,14


## Vorbereitung der Ähnlichkeitsberechnung

### Berechne eine Ähnlichkeit anhand der DDC Nummern

In [19]:
# Funktion: Vektor aus Ganzzahl-DDCs
def make_ddc_vector(codes, length=1000):
    vec = np.zeros(length)
    for code in codes:
        idx = int(code)  # Ganzzahlanteil
        if 0 <= idx < length:
            vec[idx] += 1
            #vec[idx] = 1  # Setze den Wert auf 1, ohne die Häufigkeit zu zählen
    return vec

# Vektoren generieren
df_ddc_labels["ddc_vector"] = df_ddc_labels["ddc_combined_label"].apply(make_ddc_vector)

In [20]:
# Vektoren zusammenführen in Matrix für spätere Cosinus-Ähnlichkeitsberechnung
X = np.vstack(df_ddc_labels["ddc_vector"].values)

# Finden, an welchen Spaltenindexstellen überall 0 steht
all_zero_indices = np.all(X == 0, axis=0)  
X_cleaned = X[:, ~all_zero_indices]
print(f"DDC-Vektormatrix vorbereitet: {X_cleaned.shape}")

def cosine_ddc_similarity():
    """
    Berechnet Cosinus-Ähnlichkeit basierend auf DDC-Vektoren
    Verwendet die bereits vorbereitete DDC-Vektormatrix
    """
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Verwende die bereits erstellte DDC-Vektormatrix
    cos_similarity_matrix = cosine_similarity(X)
    return cos_similarity_matrix

DDC-Vektormatrix vorbereitet: (4548, 494)


In [21]:
def jaccard_similarity(list1, list2):
    """
    Standard Jaccard-Ähnlichkeit zwischen zwei Listen
    """
    # Umwandlung der Listen in Mengen
    set1 = set(list1)
    set2 = set(list2)
    
    # Berechnung der Schnittmenge und der Vereinigung
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    # Berechnung der Jaccard-Ähnlichkeit
    if union == 0:
        return 0.0  # Vermeidung von Division durch Null
    else:
        return intersection / union


In [22]:

def weighted_jaccard_similarity(list1, list2):
    """
    Gewichtete Jaccard-Ähnlichkeit basierend auf Häufigkeiten der DDC-Nummern
    """
    from collections import Counter
    
    if not list1 and not list2:
        return 1.0
    if not list1 or not list2:
        return 0.0
    
    counter1 = Counter(list1)
    counter2 = Counter(list2)
    
    # Alle einzigartigen DDC-Nummern
    all_ddcs = set(list1 + list2)
    
    intersection_weight = 0
    union_weight = 0
    
    for ddc in all_ddcs:
        count1 = counter1.get(ddc, 0)
        count2 = counter2.get(ddc, 0)
        
        # Intersection: Minimum der Häufigkeiten
        intersection_weight += min(count1, count2)
        # Union: Maximum der Häufigkeiten  
        union_weight += max(count1, count2)
    
    return intersection_weight / union_weight if union_weight > 0 else 0.0

def tfidf_cosine_similarity(list1, list2, doc_frequencies, total_docs):
    """
    TF-IDF gewichtete Cosinus-Ähnlichkeit
    Seltene DDC-Nummern bekommen höheres Gewicht
    """
    from collections import Counter
    import math
    
    if not list1 and not list2:
        return 1.0
    if not list1 or not list2:
        return 0.0
    
    counter1 = Counter(list1)
    counter2 = Counter(list2)
    
    # Alle DDC-Nummern in beiden Listen
    all_ddcs = set(list1 + list2)
    
    # TF-IDF Vektoren berechnen
    vector1 = {}
    vector2 = {}
    
    for ddc in all_ddcs:
        # Term Frequency
        tf1 = counter1.get(ddc, 0) / len(list1) if list1 else 0
        tf2 = counter2.get(ddc, 0) / len(list2) if list2 else 0
        
        # Inverse Document Frequency
        df = doc_frequencies.get(ddc, 1)  # Dokumente mit dieser DDC-Nummer
        idf = math.log(total_docs / df) if df > 0 else 0
        
        # TF-IDF Gewichtung
        vector1[ddc] = tf1 * idf
        vector2[ddc] = tf2 * idf
    
    # Cosinus-Ähnlichkeit berechnen
    dot_product = sum(vector1[ddc] * vector2[ddc] for ddc in all_ddcs)
    norm1 = math.sqrt(sum(v**2 for v in vector1.values()))
    norm2 = math.sqrt(sum(v**2 for v in vector2.values()))
    
    if norm1 == 0 or norm2 == 0:
        return 0.0
    
    return dot_product / (norm1 * norm2)

def overlap_coefficient(list1, list2):
    """
    Overlap-Koeffizient (Szymkiewicz-Simpson Index)
    Gut für unterschiedlich große Listen
    """
    set1 = set(list1)
    set2 = set(list2)
    
    if not set1 and not set2:
        return 1.0
    if not set1 or not set2:
        return 0.0
    
    intersection = len(set1.intersection(set2))
    min_size = min(len(set1), len(set2))
    
    return intersection / min_size

def dice_coefficient(list1, list2):
    """
    Dice-Koeffizient (Sørensen-Dice Index)
    Gibt doppeltes Gewicht auf Überschneidungen
    """
    set1 = set(list1)
    set2 = set(list2)
    
    if not set1 and not set2:
        return 1.0
    if not set1 or not set2:
        return 0.0
    
    intersection = len(set1.intersection(set2))
    return (2 * intersection) / (len(set1) + len(set2))



In [23]:
# BERECHNUNG DER DOKUMENTFREQUENZEN FÜR TF-IDF
print("Berechne Dokumentfrequenzen für TF-IDF...")

doc_frequencies = {}
total_docs = len(df_ddc_labels)

for _, row in df_ddc_labels.iterrows():
    unique_ddcs = set(row['ddc_combined_label'])
    for ddc in unique_ddcs:
        doc_frequencies[ddc] = doc_frequencies.get(ddc, 0) + 1

print(f"Dokumentfrequenzen für {len(doc_frequencies)} einzigartige DDC-Nummern berechnet")

# VERGLEICH VERSCHIEDENER ÄHNLICHKEITSMASSE
print("\nTeste verschiedene Ähnlichkeitsmaße mit Beispieldaten...")

# Wähle einige Beispiel-Materialien für Vergleich
sample_indices = [0, 1, 10, 50, 100]
sample_materials = df_ddc_labels.iloc[sample_indices]

print("\nVergleich der Ähnlichkeitsmaße für erste 5 Materialien:")
print("Material 1:", sample_materials.iloc[0]['ddc_combined_label'][:5])
print("Material 2:", sample_materials.iloc[1]['ddc_combined_label'][:5])

list1 = sample_materials.iloc[0]['ddc_combined_label']
list2 = sample_materials.iloc[1]['ddc_combined_label']

print(f"\nÄhnlichkeitsvergleich zwischen Material {sample_materials.iloc[0]['pipe:ID']} und {sample_materials.iloc[1]['pipe:ID']}:")
print(f"Standard Jaccard: {jaccard_similarity(list1, list2):.4f}")
print(f"Weighted Jaccard: {weighted_jaccard_similarity(list1, list2):.4f}")
print(f"TF-IDF Cosine: {tfidf_cosine_similarity(list1, list2, doc_frequencies, total_docs):.4f}")
print(f"Overlap Coefficient: {overlap_coefficient(list1, list2):.4f}")
print(f"Dice Coefficient: {dice_coefficient(list1, list2):.4f}")

# ANALYSE DER DATENVERTEILUNG
print(f"\nDatenanalyse:")
print(f"Durchschnittliche Anzahl DDC-Nummern pro Dokument: {np.mean([len(labels) for labels in df_ddc_labels['ddc_combined_label']]):.2f}")
print(f"Median Anzahl DDC-Nummern pro Dokument: {np.median([len(labels) for labels in df_ddc_labels['ddc_combined_label']]):.2f}")
print(f"Max Anzahl DDC-Nummern pro Dokument: {max([len(labels) for labels in df_ddc_labels['ddc_combined_label']])}")

# Dokumente ohne DDC-Nummern
empty_docs = sum(1 for labels in df_ddc_labels['ddc_combined_label'] if len(labels) == 0)
print(f"Dokumente ohne DDC-Nummern: {empty_docs} ({empty_docs/total_docs*100:.1f}%)")

# Häufigste DDC-Nummern
from collections import Counter
all_ddcs = []
for labels in df_ddc_labels['ddc_combined_label']:
    all_ddcs.extend(labels)

ddc_counter = Counter(all_ddcs)
print(f"\nHäufigste DDC-Nummern:")
for ddc, count in ddc_counter.most_common(10):
    print(f"  {ddc}: {count} Dokumente ({count/total_docs*100:.1f}%)")

Berechne Dokumentfrequenzen für TF-IDF...
Dokumentfrequenzen für 494 einzigartige DDC-Nummern berechnet

Teste verschiedene Ähnlichkeitsmaße mit Beispieldaten...

Vergleich der Ähnlichkeitsmaße für erste 5 Materialien:
Material 1: [511, 6, 526, 912]
Material 2: [620, 624, 620, 658, 624]

Ähnlichkeitsvergleich zwischen Material 8I6sM5zapD60 und 8ZICOHBmAHyQ:
Standard Jaccard: 0.0000
Weighted Jaccard: 0.0000
TF-IDF Cosine: 0.0000
Overlap Coefficient: 0.0000
Dice Coefficient: 0.0000

Datenanalyse:
Durchschnittliche Anzahl DDC-Nummern pro Dokument: 12.95
Median Anzahl DDC-Nummern pro Dokument: 13.00
Max Anzahl DDC-Nummern pro Dokument: 32
Dokumente ohne DDC-Nummern: 0 (0.0%)

Häufigste DDC-Nummern:
  621: 3310 Dokumente (72.8%)
  5: 3064 Dokumente (67.4%)
  371: 2814 Dokumente (61.9%)
  4: 2541 Dokumente (55.9%)
  658: 2181 Dokumente (48.0%)
  519: 1535 Dokumente (33.8%)
  6: 1427 Dokumente (31.4%)
  511: 1373 Dokumente (30.2%)
  25: 1309 Dokumente (28.8%)
  302: 1207 Dokumente (26.5%)


In [24]:
# BERECHNUNG VERBESSERTER ÄHNLICHKEITSMATRIZEN

def calculate_similarity_matrix(similarity_function, name, **kwargs):
    """Helper function to calculate similarity matrix with progress bar"""
    print(f"\nBerechne {name} Ähnlichkeitsmatrix...")
    
    matrix = np.zeros((len(df_ddc_labels), len(df_ddc_labels)))
    np.fill_diagonal(matrix, 1.0)  # Diagonale = 1.0
    
    for i in tqdm(range(len(df_ddc_labels)), desc=f"Berechnung {name}"):
        for j in range(i + 1, len(df_ddc_labels)):
            list1 = df_ddc_labels.iloc[i]['ddc_combined_label']
            list2 = df_ddc_labels.iloc[j]['ddc_combined_label']
            
            if similarity_function.__name__ == 'tfidf_cosine_similarity':
                similarity = similarity_function(list1, list2, doc_frequencies, total_docs)
            else:
                similarity = similarity_function(list1, list2)
            
            matrix[i, j] = similarity
            matrix[j, i] = similarity  # Symmetrisch
    
    return matrix

# Berechne alle verbesserten Ähnlichkeitsmatrizen
print("Starte Berechnung aller Ähnlichkeitsmatrizen...")

# 1. Cosinus-Ähnlichkeit (DDC-Vektoren)
print("\nBerechne Cosinus DDC-Ähnlichkeitsmatrix...")
cosine_ddc_matrix = cosine_ddc_similarity()

# 2. Standard Jaccard
jaccard_matrix = calculate_similarity_matrix(jaccard_similarity, "Standard Jaccard")

# 3. Weighted Jaccard
weighted_jaccard_matrix = calculate_similarity_matrix(weighted_jaccard_similarity, "Weighted Jaccard")

# 4. TF-IDF Cosine
tfidf_cosine_matrix = calculate_similarity_matrix(tfidf_cosine_similarity, "TF-IDF Cosine")

# 5. Overlap Coefficient
overlap_matrix = calculate_similarity_matrix(overlap_coefficient, "Overlap Coefficient")

# 6. Dice Coefficient
dice_matrix = calculate_similarity_matrix(dice_coefficient, "Dice Coefficient")

# Erstelle DataFrames
material_ids = df_ddc_labels["pipe:ID"]

cosine_ddc_df = pd.DataFrame(cosine_ddc_matrix, index=material_ids, columns=material_ids)
jaccard_similarity_df = pd.DataFrame(jaccard_matrix, index=material_ids, columns=material_ids)
weighted_jaccard_df = pd.DataFrame(weighted_jaccard_matrix, index=material_ids, columns=material_ids)
tfidf_cosine_df = pd.DataFrame(tfidf_cosine_matrix, index=material_ids, columns=material_ids)
overlap_df = pd.DataFrame(overlap_matrix, index=material_ids, columns=material_ids)
dice_df = pd.DataFrame(dice_matrix, index=material_ids, columns=material_ids)

print("\nAlle Ähnlichkeitsmatrizen berechnet!")

# VERGLEICH DER ERGEBNISSE
print("\nVergleich der Ähnlichkeitsverteilungen:")

# Extrahiere obere Dreiecksmatrix (ohne Diagonale) für Statistiken
def get_upper_triangle(matrix):
    return matrix[np.triu_indices_from(matrix, k=1)]

cosine_ddc_upper = get_upper_triangle(cosine_ddc_matrix)
jaccard_upper = get_upper_triangle(jaccard_matrix)
weighted_jaccard_upper = get_upper_triangle(weighted_jaccard_matrix) 
tfidf_cosine_upper = get_upper_triangle(tfidf_cosine_matrix)
overlap_upper = get_upper_triangle(overlap_matrix)
dice_upper = get_upper_triangle(dice_matrix)

methods = {
    "Cosinus DDC": cosine_ddc_upper,
    "Standard Jaccard": jaccard_upper,
    "Weighted Jaccard": weighted_jaccard_upper,
    "TF-IDF Cosine": tfidf_cosine_upper,
    "Overlap Coefficient": overlap_upper,
    "Dice Coefficient": dice_upper
}

print(f"{'Methode':<20} {'Mean':<8} {'Std':<8} {'Min':<8} {'Max':<8} {'Non-Zero%':<10}")
print("-" * 70)

for name, values in methods.items():
    non_zero_pct = (np.count_nonzero(values) / len(values)) * 100
    print(f"{name:<20} {np.mean(values):<8.4f} {np.std(values):<8.4f} {np.min(values):<8.4f} {np.max(values):<8.4f} {non_zero_pct:<10.1f}")

print(f"\nInterpretation:")
print(f"- Höhere Standardabweichung = bessere Differenzierung")
print(f"- Niedrigerer Non-Zero% = sparsere, selektivere Ähnlichkeiten")
print(f"- TF-IDF gewichtet seltene DDC-Nummern höher")
print(f"- Overlap gut für unterschiedlich große Dokumentgrößen")

Starte Berechnung aller Ähnlichkeitsmatrizen...

Berechne Cosinus DDC-Ähnlichkeitsmatrix...

Berechne Standard Jaccard Ähnlichkeitsmatrix...


Berechnung Standard Jaccard:   0%|          | 8/4548 [00:01<16:57,  4.46it/s]


KeyboardInterrupt: 

In [None]:
def hierarchical_ddc_distance(ddc1, ddc2):
    """
    Berechnet die hierarchische Distanz zwischen zwei DDC-Nummern basierend auf der Baumstruktur.
    
    DDC-Hierarchie:
    - Hauptklasse (z.B. 100): Oberste Ebene
    - Unterklasse (z.B. 150): Zweite Ebene  
    - Spezialklasse (z.B. 152): Dritte Ebene
    
    Distanz basiert auf der Anzahl der Ebenen, die übereinstimmen.
    """
    if ddc1 == ddc2:
        return 0.0  # Identische DDC-Nummern
    
    # Konvertiere zu Strings für Vergleich
    str1, str2 = str(ddc1), str(ddc2)
    
    # Bestimme Hierarchieebenen basierend auf DDC-Struktur
    # DDC-Hauptklassen: 000-999 (alle 3-stelligen Zahlen)
    main_class1 = int(str1[0]) * 100  # Erste Stelle * 100 (z.B. 1xx = 100er Bereich)
    main_class2 = int(str2[0]) * 100
    
    sub_class1 = int(str1[:2]) * 10   # Erste zwei Stellen * 10 (z.B. 15x = 150er Bereich)
    sub_class2 = int(str2[:2]) * 10
    
    # Vollständige Klasse
    full_class1 = ddc1
    full_class2 = ddc2
    
    # Hierarchische Ähnlichkeit berechnen
    if main_class1 != main_class2:
        # Verschiedene Hauptklassen: maximale Distanz
        return 1.0
    elif sub_class1 != sub_class2:
        # Gleiche Hauptklasse, verschiedene Unterklassen: mittlere Distanz
        return 0.6
    elif full_class1 != full_class2:
        # Gleiche Unter-, verschiedene Spezialklassen: geringe Distanz
        return 0.3
    else:
        # Identisch (sollte nicht erreicht werden)
        return 0.0

def hierarchical_similarity_sets(set1, set2):
    """
    Berechnet die hierarchische Ähnlichkeit zwischen zwei Sets von DDC-Nummern.
    Verwendet den minimalen hierarchischen Abstand zwischen allen Kombinationen.
    """
    if len(set1) == 0 and len(set2) == 0:
        return 1.0  # Beide leer = vollständig ähnlich
    if len(set1) == 0 or len(set2) == 0:
        return 0.0  # Einer leer = keine Ähnlichkeit
    
    # Berechne alle paarweisen Distanzen
    distances = []
    for ddc1 in set1:
        for ddc2 in set2:
            distances.append(hierarchical_ddc_distance(ddc1, ddc2))
    
    # Verwende den minimalen Abstand (beste Übereinstimmung)
    min_distance = min(distances)
    
    # Konvertiere Distanz zu Ähnlichkeit (1 - Distanz)
    return 1.0 - min_distance

def hierarchical_similarity_average(list1, list2):
    """
    Berechnet die durchschnittliche hierarchische Ähnlichkeit zwischen zwei Listen von DDC-Nummern.
    Berücksichtigt alle Elemente und deren Häufigkeiten.
    """
    if len(list1) == 0 and len(list2) == 0:
        return 1.0
    if len(list1) == 0 or len(list2) == 0:
        return 0.0
    
    # Berechne gewichtete Ähnlichkeit basierend auf Häufigkeiten
    from collections import Counter
    counter1 = Counter(list1)
    counter2 = Counter(list2)
    
    # Alle einzigartigen DDC-Nummern
    all_ddcs = set(list1 + list2)
    
    similarity_sum = 0.0
    weight_sum = 0.0
    
    for ddc1 in counter1:
        for ddc2 in counter2:
            # Gewicht basierend auf Häufigkeiten
            weight = counter1[ddc1] * counter2[ddc2]
            # Hierarchische Ähnlichkeit
            sim = 1.0 - hierarchical_ddc_distance(ddc1, ddc2)
            
            similarity_sum += weight * sim
            weight_sum += weight
    
    if weight_sum == 0:
        return 0.0
    
    return similarity_sum / weight_sum

In [None]:
# Berechnung der hierarchischen Ähnlichkeitsmatrix
print("Berechne hierarchische Ähnlichkeitsmatrix...")

# Initialisiere Matrix
hierarchical_matrix = np.zeros((len(df_ddc_labels), len(df_ddc_labels)))

# Fülle Diagonale mit 1.0 (Material mit sich selbst)
np.fill_diagonal(hierarchical_matrix, 1.0)

# Berechne Ähnlichkeiten für alle Paare
for i in tqdm(range(len(df_ddc_labels)), desc="Berechnung der hierarchischen Ähnlichkeit"):
    for j in range(i + 1, len(df_ddc_labels)):
        # Verwende die durchschnittliche hierarchische Ähnlichkeit
        similarity = hierarchical_similarity_average(
            df_ddc_labels.iloc[i]['ddc_combined_label'], 
            df_ddc_labels.iloc[j]['ddc_combined_label']
        )
        hierarchical_matrix[i, j] = similarity
        hierarchical_matrix[j, i] = similarity  # Symmetrisch

# Erstelle DataFrame
hierarchical_df = pd.DataFrame(
    hierarchical_matrix, 
    index=df_ddc_labels["pipe:ID"], 
    columns=df_ddc_labels["pipe:ID"]
)

hierarchical_df.head(3)

Berechne hierarchische Ähnlichkeitsmatrix...


Berechnung der hierarchischen Ähnlichkeit: 100%|██████████| 4548/4548 [19:49<00:00,  3.82it/s] 


pipe:ID,8I6sM5zapD60,8ZICOHBmAHyQ,8Lfz8SAKa6k0,3ztCv-WpxJ4U,6mOhjfscZK2A,1eteONeHL82Y,4ko3QE49jYdg,1mjbqKfwSW7U,1BruMQFjEIRY,1Qhnsa15Gixs,...,12QgNIYLxydAY,12IVVtroo82Uk,127HM_Da6VkYA,12jbMf7CPG0ck,11ONBt_S6XLVM,8Qx2WQd_ANdI,7rwytM7w-8HE,8MBQRpA2XSQU,10MPMUvFFT_H0,11zdGfDSphJpg
pipe:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8I6sM5zapD60,1.0,0.13,0.133929,0.0125,0.122917,0.095833,0.157143,0.005882,0.067308,0.030556,...,0.259375,0.2075,0.1875,0.24,0.053846,0.044643,0.034615,0.2125,0.2375,0.066667
8ZICOHBmAHyQ,0.13,1.0,0.152857,0.035,0.255833,0.096667,0.068571,0.025294,0.055385,0.090556,...,0.105,0.115,0.131667,0.179,0.196923,0.017143,0.069231,0.172,0.17625,0.213333
8Lfz8SAKa6k0,0.133929,0.152857,1.0,0.064286,0.11369,0.059524,0.063265,0.045378,0.094505,0.107937,...,0.045536,0.049286,0.060714,0.097143,0.162637,0.019388,0.157692,0.052143,0.039286,0.183333


In [None]:
hierarchical_df.head(3)

pipe:ID,8I6sM5zapD60,8ZICOHBmAHyQ,8Lfz8SAKa6k0,3ztCv-WpxJ4U,6mOhjfscZK2A,1eteONeHL82Y,4ko3QE49jYdg,1mjbqKfwSW7U,1BruMQFjEIRY,1Qhnsa15Gixs,...,12QgNIYLxydAY,12IVVtroo82Uk,127HM_Da6VkYA,12jbMf7CPG0ck,11ONBt_S6XLVM,8Qx2WQd_ANdI,7rwytM7w-8HE,8MBQRpA2XSQU,10MPMUvFFT_H0,11zdGfDSphJpg
pipe:ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8I6sM5zapD60,1.0,0.13,0.133929,0.0125,0.122917,0.095833,0.157143,0.005882,0.067308,0.030556,...,0.259375,0.2075,0.1875,0.24,0.053846,0.044643,0.034615,0.2125,0.2375,0.066667
8ZICOHBmAHyQ,0.13,1.0,0.152857,0.035,0.255833,0.096667,0.068571,0.025294,0.055385,0.090556,...,0.105,0.115,0.131667,0.179,0.196923,0.017143,0.069231,0.172,0.17625,0.213333
8Lfz8SAKa6k0,0.133929,0.152857,1.0,0.064286,0.11369,0.059524,0.063265,0.045378,0.094505,0.107937,...,0.045536,0.049286,0.060714,0.097143,0.162637,0.019388,0.157692,0.052143,0.039286,0.183333


In [None]:
# SPEICHERN ALLER DDC-BASIERTEN ÄHNLICHKEITSMATRIZEN
print("Speichere alle DDC-basierten Ähnlichkeitsmatrizen über DataHandler...")

# Speichere alle Matrizen über die DataHandler save_data API
dataHandler.save_data(cosine_ddc_df, "data_files.processed_data.similarity_keyword_based.df_keyword_cos_similarity")
dataHandler.save_data(jaccard_similarity_df, "data_files.processed_data.similarity_keyword_based.df_keyword_jaccard_similarity")
dataHandler.save_data(weighted_jaccard_df, "data_files.processed_data.similarity_keyword_based.df_keyword_weighted_jaccard_similarity")
dataHandler.save_data(tfidf_cosine_df, "data_files.processed_data.similarity_keyword_based.df_keyword_tfidf_cosine_similarity")
dataHandler.save_data(overlap_df, "data_files.processed_data.similarity_keyword_based.df_keyword_overlap_similarity")
dataHandler.save_data(dice_df, "data_files.processed_data.similarity_keyword_based.df_keyword_dice_similarity")
dataHandler.save_data(hierarchical_df, "data_files.processed_data.similarity_keyword_based.df_keyword_hierarchical_similarity")

print("✅ Alle DDC-basierten Ähnlichkeitsmatrizen gespeichert!")
print("📁 Neue strukturierte Pfade in den Unterordnern verwendet")

Speichere alle DDC-basierten Ähnlichkeitsmatrizen über DataHandler...
💾 Datei gespeichert: keyword_cosine_similarity.p
   📁 Pfad: /media/sz/Data/Connected_Lecturers/Opal/processed/similarity/keyword_cosine_similarity.p
   📊 DataFrame: 4,548 Zeilen × 4548 Spalten
   📏 Dateigröße: 157.9 MB
   🕐 Zeitstempel: 2025-07-29 12:54:43
   ⏱️  Speicherdauer: 0.10 Sekunden
💾 Datei gespeichert: keyword_jaccard_similarity.p
   📁 Pfad: /media/sz/Data/Connected_Lecturers/Opal/processed/similarity/keyword_jaccard_similarity.p
   📊 DataFrame: 4,548 Zeilen × 4548 Spalten
   📏 Dateigröße: 157.9 MB
   🕐 Zeitstempel: 2025-07-29 12:54:43
   ⏱️  Speicherdauer: 0.08 Sekunden
💾 Datei gespeichert: keyword_weighted_jaccard_similarity.p
   📁 Pfad: /media/sz/Data/Connected_Lecturers/Opal/processed/similarity/keyword_weighted_jaccard_similarity.p
   📊 DataFrame: 4,548 Zeilen × 4548 Spalten
   📏 Dateigröße: 157.9 MB
   🕐 Zeitstempel: 2025-07-29 12:54:43
   ⏱️  Speicherdauer: 0.08 Sekunden
💾 Datei gespeichert: keyword_