In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
from tqdm import tqdm
import json
from psycopg2 import sql
import os

# Connection settings for the PostgreSQL instance used throughout the notebook.
# Each value can be overridden through the conventional PG* environment
# variables so credentials do not have to live in the notebook; the fallbacks
# are the values that were previously hard-coded here.
database = {
    "database": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "password"),
    "host": os.environ.get("PGHOST", "192.168.1.16"),
    "port": os.environ.get("PGPORT", "5432"),
}

def connect_to_db():
    """Open a new psycopg2 connection using the module-level ``database`` settings.

    :return: whatever ``psycopg2.connect`` returns; the caller is responsible
        for closing it.
    """
    # psycopg2 calls the database name ``dbname``; every other key maps 1:1.
    connection_kwargs = {
        "dbname": database["database"],
        "user": database["user"],
        "password": database["password"],
        "host": database["host"],
        "port": database["port"],
    }
    return psycopg2.connect(**connection_kwargs)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Open a database connection and close it again straight away; no query is run
# (presumably a quick connectivity check — confirm intent).
conn = connect_to_db()
conn.close()

In [4]:
from dataclasses import dataclass

@dataclass
class Subject:
    """Plain record describing one subject.

    The (currently commented-out) loader in this notebook builds these records
    from a GND subjects JSON file and passes every field except ``code``
    through ``urllib.parse.quote_plus`` — TODO confirm that loader is still the
    reference, since it is disabled.
    """
    # Subject identifier (the loader reads it from the "Code" key).
    code: str
    # Subject name (loader: "Name").
    name: str
    # Name of the subject's classification (loader: "Classification Name").
    classification_name: str
    # Alternative names for the subject (loader: "Alternate Name").
    alternative_names: list[str]
    # Related subjects (loader: "Related Subjects").
    related: list[str]

In [26]:
# import json, urllib
# from tqdm import tqdm
# from psycopg2 import sql
# with open("../llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as gnd_subjects_file:
#     gnd_subjects = json.load(gnd_subjects_file)

# subjects = [
#     Subject(
#         code=subject["Code"],
#         name=urllib.parse.quote_plus(subject["Name"]),
#         classification_name=urllib.parse.quote_plus(subject["Classification Name"]),
#         alternative_names=[
#             urllib.parse.quote_plus(alternative_name)
#             for alternative_name in subject["Alternate Name"]
#         ],
#         related=[
#             urllib.parse.quote_plus(related)
#             for related in subject["Related Subjects"]
#         ]
#     )
#     for subject in gnd_subjects
# ]

# conn = connect_to_db()
# cursor = conn.cursor()
# cursor.execute("SET search_path TO ag_catalog;")
# for subject in tqdm(subjects):
#     query = sql.SQL("""
#         SELECT * FROM cypher('gnd', $$
#             MATCH (s:Subject {{code: {code}}})
#             SET s.name = {name}
#             RETURN s.name
#         $$) AS (s text);
#     """).format(
#         code=sql.Literal(subject.code),
#         name=sql.Literal(subject.name)
#     )
#     cursor.execute(query)
# conn.commit()
# conn.close()

In [5]:
def _parse_agtype_vertex(raw_vertex):
    """Decode the text form of an AGE vertex (``{...}::vertex``) into a dict.

    Only a trailing ``::vertex`` annotation is removed before JSON-decoding, so
    the same characters occurring inside a property value are left untouched.
    """
    suffix = "::vertex"
    if raw_vertex.endswith(suffix):
        raw_vertex = raw_vertex[:-len(suffix)]
    return json.loads(raw_vertex)


def random_sample_documents(amount):
    """Return a random sample of documents together with their linked subjects.

    Only documents that have at least one ``DOC_SUBJECT`` edge to a ``Subject``
    are considered. ``RETURN DISTINCT`` is used so that a document linked to
    several subjects is a single candidate row: it cannot be drawn twice and is
    not favoured over documents with fewer subjects.

    :param amount: maximum number of documents to sample (coerced with ``int``).
    :return: list of ``(document, related_subjects)`` tuples, where ``document``
        is the decoded vertex dict and ``related_subjects`` is the list of
        decoded subject vertex dicts linked to it.
    :raises ValueError: if ``amount`` or a returned vertex id cannot be
        converted to ``int``.
    """
    amount = int(amount)
    connection = connect_to_db()
    try:
        cursor = connection.cursor()
        cursor.execute("SET search_path TO ag_catalog;")
        cursor.execute(
            """
            SELECT * FROM (
                SELECT * FROM cypher('gnd', $$
                    MATCH (d:Document)-[:DOC_SUBJECT]->(s:Subject)
                    RETURN DISTINCT d
                $$) AS (d agtype)
            ) AS subquery
            ORDER BY random()
            LIMIT %s;
            """,
            (amount,),
        )
        documents = [_parse_agtype_vertex(row[0]) for row in cursor.fetchall()]

        to_ret = []
        for document in documents:
            # The id is embedded in the dollar-quoted Cypher text, so force it
            # to an int before formatting it into the query.
            doc_id = int(document["id"])
            cursor.execute(
                f"""
                SELECT * FROM cypher('gnd', $$
                    MATCH (d:Document)-[:DOC_SUBJECT]->(s:Subject)
                    WHERE id(d) = {doc_id}
                    RETURN s
                $$) AS (s agtype);
                """
            )
            related_subjects = [_parse_agtype_vertex(row[0]) for row in cursor.fetchall()]
            to_ret.append((document, related_subjects))

        return to_ret
    finally:
        # Release the connection even when one of the queries fails.
        connection.close()
    
# Draw 4 random documents, each paired with its linked subjects, for the cells below.
docs = random_sample_documents(4)

In [6]:
from typing import List
SUBJ_SAMPLE_NR = 100

def sample_random_subjects(sample_size: int):
    """Return up to ``sample_size`` randomly chosen ``Subject`` vertices.

    :param sample_size: number of subjects to draw (coerced with ``int``). A
        value of zero or less yields an empty list without touching the
        database, instead of sending an invalid ``LIMIT`` to PostgreSQL.
    :return: list of decoded subject vertex dicts.
    """
    sample_size = int(sample_size)
    if sample_size <= 0:
        return []

    connection = connect_to_db()
    try:
        cursor = connection.cursor()
        cursor.execute("SET search_path TO ag_catalog;")
        cursor.execute(
            """
            SELECT * FROM
            (
                SELECT * FROM cypher('gnd', $$
                    MATCH (s:Subject)
                    RETURN s
                $$) AS (s agtype)
            ) AS subquery
            ORDER BY random()
            LIMIT %s
            """,
            (sample_size,),
        )
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        connection.close()

    # Each row holds the vertex as text ending in "::vertex"; strip only that
    # trailing annotation so property values containing the same characters
    # survive JSON decoding unchanged.
    suffix = "::vertex"
    return [
        json.loads(row[0][:-len(suffix)] if row[0].endswith(suffix) else row[0])
        for row in rows
    ]

# Build one candidate pool per sampled document: random subjects plus the
# document's own subjects, SUBJ_SAMPLE_NR entries in total.
# Each test_data entry is ((document, true_subjects), candidate_subjects).
test_data = []
for doc in tqdm(docs):
    # Leave room for the document's true subjects, which are appended below.
    subj_to_sample = SUBJ_SAMPLE_NR - len(doc[1])
    sample = sample_random_subjects(subj_to_sample)
    # NOTE(review): the random draw is not filtered against doc[1], so a true
    # subject can appear twice in the pool.
    sample.extend(doc[1])
    test_data.append((doc, sample))
    

100%|██████████| 4/4 [00:00<00:00,  7.30it/s]


In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import base64


def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    :param model_output: indexable model output whose first element holds the
        token embeddings (assumed ``(batch, seq, dim)`` — TODO confirm).
    :param attention_mask: mask that is broadcast over the last embedding axis;
        positions with value 0 do not contribute to the average.
    :return: tensor with the sequence axis (dim 1) reduced away.
    """
    hidden_states = model_output[0]
    # Give the mask a trailing axis and expand it to the embedding shape.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the per-sequence token count so an all-zero mask cannot divide by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_counts


# Load the tokenizer and the encoder from the same pretrained checkpoint; both
# are module-level objects shared by every embedding cell below.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')


In [30]:
# similarity scores based on title
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import torch
from datetime import datetime
import urllib
def plot_similarity(base, target_sentences, actual_targets, similarities, num_cols=5,
                    filename_prefix="title_only_sim_plot", dpi=600, max_title_chars=None):
    """Draw the similarity scores as a heatmap grid and save it as a PNG.

    Sentences are sorted by similarity (highest first) and laid out row by row
    in a grid with ``num_cols`` columns. Each cell is coloured by its score on
    a fixed -1..1 scale and labelled with the sentence truncated to 10
    characters; sentences that occur in ``actual_targets`` are written in bold
    green. The figure is saved to ``<filename_prefix><YYYYmmdd_HHMMSS>.png`` in
    the current working directory.

    A later cell in this notebook defines another ``plot_similarity`` that
    differs only in file prefix, dpi and title truncation; the three optional
    parameters cover those variations from a single definition.

    :param base: sentence every target was compared against (used in the title).
    :param target_sentences: list of strings that were compared with ``base``.
    :param actual_targets: collection of strings to highlight.
    :param similarities: one score per target; each item must provide ``.item()``.
    :param num_cols: number of grid columns (must be at least 1).
    :param filename_prefix: prefix of the saved file name.
    :param dpi: resolution passed to ``savefig``.
    :param max_title_chars: when given, only the first ``max_title_chars``
        characters of ``base`` followed by ``...`` are shown in the title.
    :raises ValueError: if ``num_cols`` is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError("num_cols must be a positive integer")

    similarity_floats = [sim.item() for sim in similarities]

    # Combine original/truncated text and similarity, then rank by similarity.
    df = pd.DataFrame({
        'Original Sentence': target_sentences,
        'Truncated Sentence': [
            (ts[:10] + '...') if len(ts) > 10 else ts
            for ts in target_sentences
        ],
        'Similarity': similarity_floats
    })
    df = df.sort_values('Similarity', ascending=False).reset_index(drop=True)

    num_items = len(df)
    num_rows = math.ceil(num_items / num_cols)

    # Three parallel grids: numeric score, cell label, and the full sentence
    # (needed for the highlight check). Unused trailing cells stay NaN / "".
    score_grid = [[float('nan')] * num_cols for _ in range(num_rows)]
    text_grid = [[""] * num_cols for _ in range(num_rows)]
    original_grid = [[""] * num_cols for _ in range(num_rows)]

    ranked = zip(
        df['Similarity'].tolist(),
        df['Truncated Sentence'].tolist(),
        df['Original Sentence'].tolist(),
    )
    for idx, (score, truncated, original) in enumerate(ranked):
        r, c = divmod(idx, num_cols)
        score_grid[r][c] = score
        text_grid[r][c] = truncated
        original_grid[r][c] = original

    score_df = pd.DataFrame(score_grid)

    sns.set_style("whitegrid")
    sns.set_context("paper", font_scale=0.8)

    # Explicit figure/axes instead of relying on pyplot's "current figure".
    fig, ax = plt.subplots(figsize=(max(6, num_cols * 1.2), max(3, num_rows * 1.2)))

    sns.heatmap(
        score_df,
        annot=False,            # cells are annotated manually below
        cmap="vlag",
        cbar=True,
        vmin=-1,
        vmax=1,
        linewidths=0.5,
        linecolor="white",
        ax=ax
    )

    # Label every filled cell; highlight sentences listed in actual_targets.
    for r in range(num_rows):
        for c in range(num_cols):
            if math.isnan(score_grid[r][c]):
                continue
            is_actual = original_grid[r][c] in actual_targets
            ax.text(
                c + 0.5,
                r + 0.5,
                text_grid[r][c],
                ha="center",
                va="center",
                color="green" if is_actual else "black",
                fontweight="bold" if is_actual else "normal",
                fontsize=8
            )

    # Hide ticks and axis labels; the cell labels carry the information.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel("")
    ax.set_ylabel("")

    title_text = base if max_title_chars is None else f"{base[:max_title_chars]}..."
    ax.set_title(f'Similarity Scores for "{title_text}"', pad=10)

    fig.tight_layout()
    # The timestamp has one-second resolution, so two plots saved within the
    # same second with the same prefix end up with the same file name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    fig.savefig(f"{filename_prefix}{timestamp}.png", dpi=dpi, bbox_inches='tight')

def compute_similarity(base, targets):
    """Return the cosine similarity between ``base`` and each item of ``targets``.

    :param base: tensor compared against every target.
    :param targets: iterable of tensors; each is compared with ``base`` along dim 0.
    :return: list with one similarity tensor per target, in input order.
    """
    return [F.cosine_similarity(base, candidate, dim=0) for candidate in targets]

# Title-only experiment: embed each document title together with its candidate
# subject names and score every candidate against the title.
for sample in test_data:
    # sample is ((document, true_subjects), candidate_subjects).
    document = sample[0][0]
    # NOTE(review): decode_base64 is not defined in any visible cell of this
    # notebook — confirm where it comes from, otherwise this raises NameError
    # on a fresh kernel.
    document_title = decode_base64(document['properties']['title'])
    
    all_sentences = [urllib.parse.unquote(a['properties']['name']) for a in sample[1]]
    actual_related = [urllib.parse.unquote(a['properties']['name']) for a in sample[0][1]]
    # Put the title first so that embeddings[0] is the document.
    all_sentences.insert(0, document_title)
    tokenized_sentences = tokenizer(all_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**tokenized_sentences)
    
    embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
    
    # Score every candidate (rows 1..n) against the document (row 0).
    similarity_scores = compute_similarity(embeddings[0], embeddings[1:])
    # plot_similarity(all_sentences[0], all_sentences[1:], actual_related, similarity_scores)

    

In [31]:
def plot_similarity(base, target_sentences, actual_targets, similarities, num_cols=5):
    """
    Render the similarity scores as a multi-column heatmap and save it as a PNG.

    Cells are ordered by descending similarity, filled row by row, coloured on
    a fixed -1..+1 scale and labelled with the sentence cut to 10 characters.
    Sentences contained in 'actual_targets' are printed in bold green. The
    title shows the first 10 characters of 'base'.

    :param base: The sentence (string) all targets were compared against.
    :param target_sentences: List of strings (the compared sentences).
    :param actual_targets: Collection of strings to highlight in green/bold.
    :param similarities: One score per target; each item must provide .item().
    :param num_cols: Number of columns in the grid.
    """
    scores = [tensor.item() for tensor in similarities]
    short_labels = [
        sentence if len(sentence) <= 10 else sentence[:10] + '...'
        for sentence in target_sentences
    ]

    # Rank the sentences: highest similarity first.
    ranked = pd.DataFrame({
        'Original Sentence': target_sentences,
        'Truncated Sentence': short_labels,
        'Similarity': scores
    })
    ranked = ranked.sort_values('Similarity', ascending=False).reset_index(drop=True)

    total = len(ranked)
    n_rows = math.ceil(total / num_cols)

    # Three parallel grids: score, short label and full sentence per cell.
    score_grid = [[float('nan')] * num_cols for _ in range(n_rows)]
    text_grid = [[""] * num_cols for _ in range(n_rows)]
    original_grid = [[""] * num_cols for _ in range(n_rows)]

    # Cell coordinates in row-major order; the same order is used for filling
    # and for annotating.
    cells = [(r, c) for r in range(n_rows) for c in range(num_cols)]

    # zip stops at the shorter sequence, so surplus cells keep NaN / "".
    for (r, c), position in zip(cells, range(total)):
        record = ranked.iloc[position]
        score_grid[r][c] = record['Similarity']
        text_grid[r][c] = record['Truncated Sentence']
        original_grid[r][c] = record['Original Sentence']

    score_df = pd.DataFrame(score_grid)

    sns.set_style("whitegrid")
    sns.set_context("paper", font_scale=0.8)

    fig_width = max(6, num_cols * 1.2)
    fig_height = max(3, n_rows * 1.2)
    plt.figure(figsize=(fig_width, fig_height))

    ax = sns.heatmap(
        score_df,
        annot=False,            # labels are drawn manually below
        cmap="vlag",
        cbar=True,
        vmin=-1,
        vmax=1,
        linewidths=0.5,
        linecolor="white"
    )

    # Write the short label into every filled cell.
    for r, c in cells:
        if math.isnan(score_grid[r][c]):
            continue
        highlighted = original_grid[r][c] in actual_targets
        ax.text(
            c + 0.5,
            r + 0.5,
            text_grid[r][c],
            ha="center",
            va="center",
            color="green" if highlighted else "black",
            fontweight="bold" if highlighted else "normal",
            fontsize=8
        )

    # No ticks or axis labels: the cell labels carry the information.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel("")
    ax.set_ylabel("")

    plt.title(f'Similarity Scores for "{base[:10]}..."', pad=10)

    plt.tight_layout()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    plt.savefig(f"title_and_abstract_only_sim_plot{timestamp}.png", dpi=300, bbox_inches='tight')


# Title + content experiment: same as the title-only loop, but the document
# text is the title and the content joined by a space.
for sample in test_data:
    # sample is ((document, true_subjects), candidate_subjects).
    document = sample[0][0]
    # NOTE(review): decode_base64 is not defined in any visible cell of this
    # notebook — confirm where it comes from, otherwise this raises NameError
    # on a fresh kernel.
    document_title = decode_base64(document['properties']['title']) + " " + decode_base64(document['properties']['content'])
    all_sentences = [urllib.parse.unquote(a['properties']['name']) for a in sample[1]]
    actual_related = [urllib.parse.unquote(a['properties']['name']) for a in sample[0][1]]
    # Put the document text first so that embeddings[0] is the document.
    all_sentences.insert(0, document_title)
    tokenized_sentences = tokenizer(all_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**tokenized_sentences)
    
    embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])
    
    # Score every candidate (rows 1..n) against the document (row 0).
    similarity_scores = compute_similarity(embeddings[0], embeddings[1:])
    # plot_similarity(all_sentences[0], all_sentences[1:], actual_related, similarity_scores)

In [17]:
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import torch
from datetime import datetime
import urllib

def compute_similarity(base, targets):
    """Score every target against ``base`` with cosine similarity.

    :param base: reference tensor.
    :param targets: iterable of tensors, each compared with ``base`` along dim 0.
    :return: list of similarity tensors in the order of ``targets``.
    """
    def _score(candidate):
        return F.cosine_similarity(base, candidate, dim=0)

    return list(map(_score, targets))

# Larger evaluation set: 50 random documents, each with a pool of 500 candidates.
new_docs = random_sample_documents(50)

# Overrides the pool size set in an earlier cell, and rebuilds test_data.
SUBJ_SAMPLE_NR = 500
test_data = []
for doc in tqdm(new_docs):
    # Leave room for the document's true subjects, which are appended below.
    subj_to_sample = SUBJ_SAMPLE_NR - len(doc[1])
    sample = sample_random_subjects(subj_to_sample)
    # NOTE(review): the random draw is not filtered against doc[1], so a true
    # subject can appear twice in the pool.
    sample.extend(doc[1])
    test_data.append((doc, sample))

# Cut-offs (k) for which recall@k is reported.
pos_thresh1 = 20
pos_thresh2 = 5
pos_thresh3 = 100
# Running sums of the per-document recall percentages, one per cut-off.
final5 = 0
final20 = 0
final100 = 0
for sample in tqdm(test_data):
    # sample is ((document, true_subjects), candidate_subjects).
    document = sample[0][0]
    document_title = urllib.parse.unquote(document['properties']['title']) + " " + urllib.parse.unquote(document['properties']['content'])
    all_sentences = [urllib.parse.unquote(a['properties']['name']) + " " + urllib.parse.unquote(a['properties']['classification_name']) for a in sample[1]]
    actual_related = [urllib.parse.unquote(a['properties']['name']) + " " + urllib.parse.unquote(a['properties']['classification_name']) for a in sample[0][1]]
    # Put the document text first so that embeddings[0] is the document.
    all_sentences.insert(0, document_title)
    tokenized_sentences = tokenizer(all_sentences, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**tokenized_sentences)

    embeddings = mean_pooling(model_output, tokenized_sentences['attention_mask'])

    # Rank the candidates by similarity to the document, best first. Scores are
    # converted to plain floats so the sort does not compare tensors.
    similarity_scores = [(i, a.item()) for i, a in enumerate(compute_similarity(embeddings[0], embeddings[1:]))]
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    # +1 skips the document itself, which sits at index 0 of all_sentences.
    top_scored = [all_sentences[i + 1] for i, _ in similarity_scores]

    # Build each top-k set once instead of re-slicing for every true subject.
    top5 = set(top_scored[:pos_thresh2])
    top20 = set(top_scored[:pos_thresh1])
    top100 = set(top_scored[:pos_thresh3])

    count_actuals5 = sum(actual in top5 for actual in actual_related)
    count_actuals20 = sum(actual in top20 for actual in actual_related)
    count_actuals100 = sum(actual in top100 for actual in actual_related)

    # Hits divided by the number of true subjects is recall@k (precision@k
    # would divide by k). Percentages are accumulated unrounded; rounding only
    # happens when the averages are printed.
    final5 += count_actuals5 / len(actual_related) * 100
    final20 += count_actuals20 / len(actual_related) * 100
    final100 += count_actuals100 / len(actual_related) * 100

print(f"Recall of the sentence transformer over first 5 matches: {final5/len(test_data):.2f}")
print(f"Recall of the sentence transformer over first 20 matches: {final20/len(test_data):.2f}")
print(f"Recall of the sentence transformer over first 100 matches: {final100/len(test_data):.2f}")




100%|██████████| 50/50 [00:09<00:00,  5.18it/s]
100%|██████████| 50/50 [19:13<00:00, 23.06s/it]

Precision of the sentence transformer over first 5 matches: 33.85
Precision of the sentence transformer over first 20 matches: 58.72
Precision of the sentence transformer over first 100 matches: 90.41





In [14]:
import urllib

def grab_top_embeddings(doc_emb, n):
    """Return the ``label_code`` values of the ``n`` label embeddings closest to ``doc_emb``.

    Closeness is measured with the ``<=>`` operator on the ``embedding``
    column. Ordering by ascending distance returns the same ranking as the
    previous ``1 - distance ... DESC`` form (ties aside) and is the shape
    pgvector documents for index-assisted nearest-neighbour queries.

    :param doc_emb: query vector in pgvector text form, e.g. ``"[0.1,0.2]"``.
    :param n: maximum number of label codes to return.
    :return: list of label codes, closest first.
    """
    # Values are bound as parameters instead of being formatted into the SQL text.
    query = """
        SELECT label_code
        FROM label_embeddings
        ORDER BY embedding <=> %s::vector
        LIMIT %s;
    """

    conn = connect_to_db()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query, (doc_emb, n))
            top_labels = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    return [row[0] for row in top_labels]

# Retrieval experiment: embed each of 100 random documents, fetch the 100
# closest label codes from the label_embeddings table, and measure which share
# of the document's true subject codes is among them.
new_docs = random_sample_documents(100)
with tqdm(new_docs, desc="Starting..") as pbar:
    # Running sum of the per-document percentages.
    overall_acc = 0
    # The loop iterates new_docs directly; the bar is advanced manually below.
    for i,doc in enumerate(new_docs):
        document = doc[0]
        related_subjects = [a['properties']['code'] for a in doc[1]]
        document_content = urllib.parse.unquote(document['properties']['title']) + " " + urllib.parse.unquote(document['properties']['content'])
        tokenized_text = tokenizer([document_content], padding=True, truncation=True, return_tensors='pt')
        # token_ids = tokenized_text["input_ids"]
        # att_mask = tokenized_text["attention_mask"]
        with torch.no_grad():
            model_output = model(**tokenized_text)
    
        embeddings = mean_pooling(model_output, tokenized_text['attention_mask'])
        
        # Serialise the embedding as "[v1,v2,...]" text for the ::vector cast
        # performed in grab_top_embeddings.
        emb_vector = embeddings.cpu().numpy().flatten()
        embedding_list = emb_vector.tolist()
        embedding_str = '[' + ','.join(map(str, embedding_list)) + ']'

        top_embeddings = grab_top_embeddings(embedding_str, 100)
        # Count how many of the true subject codes were retrieved.
        doc_acc = 0
        for subject_code in related_subjects:
            doc_acc += 1 if subject_code in top_embeddings else 0
        
        # Share of true subjects retrieved, in percent (divides by the number
        # of true subjects, not by the 100 retrieved).
        doc_acc = round(doc_acc/len(related_subjects) * 100, 2)
        overall_acc += doc_acc

        pbar.set_description(f"Current computed accuracy: {overall_acc / (i+1):.2f}")
        pbar.update()

    print(f"Actual subjects accuracy in top 100 closest in the dataset: {overall_acc/len(new_docs):.2f}")


Current computed accuracy: 12.68: 100%|██████████| 100/100 [00:15<00:00,  6.32it/s]

Actual subjects accuracy in top 100 closest in the dataset: 12.68



