In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import psycopg2
from tqdm import tqdm
import json
from psycopg2 import sql
import os  # imported here so this configuration block is self-contained

# Connection settings consumed by connect_to_db().
# Every value can be overridden with the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT); when a variable is unset
# the previous hard-coded value is used, so existing setups keep working.
# NOTE(review): the fallback password and host are still committed to the
# notebook -- prefer setting PGPASSWORD/PGHOST and dropping these defaults.
database = {
    "database": os.environ.get("PGDATABASE", "postgres"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "password"),
    "host": os.environ.get("PGHOST", "192.168.1.16"),
    "port": os.environ.get("PGPORT", "5432"),
}

def connect_to_db():
    """Open a new psycopg2 connection from the module-level ``database`` settings.

    Returns:
        The connection object produced by ``psycopg2.connect``. Closing it is
        the caller's responsibility.
    """
    # The settings dict keeps the database name under "database"; it is handed
    # to psycopg2 as the ``dbname`` keyword.
    connect_kwargs = {
        "dbname": database["database"],
        "user": database["user"],
        "password": database["password"],
        "host": database["host"],
        "port": database["port"],
    }
    return psycopg2.connect(**connect_kwargs)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open a connection and close it right away; presumably a quick check that the
# database settings above are usable before running the heavier cells.
conn = connect_to_db()
conn.close()

In [3]:
from dataclasses import dataclass

@dataclass
class Subject:
    """Plain record holding the fields of one subject entry.

    NOTE(review): the only construction site visible in this notebook is the
    commented-out loader cell, which fills the fields from the "Code", "Name",
    "Classification Name", "Alternate Name" and "Related Subjects" keys of the
    subjects JSON -- confirm the class is still needed by active code.
    """
    code: str  # subject code
    name: str  # primary subject name
    classification_name: str  # name of the classification the subject belongs to
    alternative_names: list[str]  # alternate names of the subject
    related: list[str]  # related subjects

In [4]:
# import json, urllib
# from tqdm import tqdm
# from psycopg2 import sql
# with open("../llms4subjects/shared-task-datasets/GND/dataset/GND-Subjects-all.json", "r") as gnd_subjects_file:
#     gnd_subjects = json.load(gnd_subjects_file)

# subjects = [
#     Subject(
#         code=subject["Code"],
#         name=urllib.parse.quote_plus(subject["Name"]),
#         classification_name=urllib.parse.quote_plus(subject["Classification Name"]),
#         alternative_names=[
#             urllib.parse.quote_plus(alternative_name)
#             for alternative_name in subject["Alternate Name"]
#         ],
#         related=[
#             urllib.parse.quote_plus(related)
#             for related in subject["Related Subjects"]
#         ]
#     )
#     for subject in gnd_subjects
# ]

# conn = connect_to_db()
# cursor = conn.cursor()
# cursor.execute("SET search_path TO ag_catalog;")
# for subject in tqdm(subjects):
#     query = sql.SQL("""
#         SELECT * FROM cypher('gnd', $$
#             MATCH (s:Subject {{code: {code}}})
#             SET s.name = {name}
#             RETURN s.name
#         $$) AS (s text);
#     """).format(
#         code=sql.Literal(subject.code),
#         name=sql.Literal(subject.name)
#     )
#     cursor.execute(query)
# conn.commit()
# conn.close()

In [6]:
def random_sample_documents(amount):
    connection = connect_to_db()
    cursor = connection.cursor()

    cursor.execute(
        f"""
        SET search_path TO ag_catalog;
        SELECT * FROM (
                SELECT * FROM cypher('gnd', $$
                MATCH (d:Document)-[:DOC_SUBJECT]->(s:Subject)
                RETURN d
            $$) AS (d agtype)
        ) AS subquery
        ORDER BY random()
        LIMIT {amount};
        """
    )
    documents = [json.loads(a[0].replace("::vertex", "")) for a in cursor.fetchall() ]
    to_ret = []
    for document in documents:
        doc_id = document["id"]
        query = f"""
        SELECT * FROM cypher('gnd', $$
            MATCH (d:Document)-[:DOC_SUBJECT]->(s:Subject)
            WHERE id(d) = {doc_id}
            RETURN s
        $$) AS (s agtype);
        """
        cursor.execute(query)

        related_subjects = [json.loads(a[0].replace("::vertex", "")) for a in cursor.fetchall() ]
        to_ret.append((document, related_subjects))

    connection.close()
    
    return to_ret
    
# Draw a small batch of (document, related_subjects) pairs; the next cell
# builds its test set from this list.
docs = random_sample_documents(4)

In [7]:
from typing import List
# Size of the candidate pool built per document: the document's own subjects
# plus randomly drawn subjects up to this total.
SUBJ_SAMPLE_NR = 100


def sample_random_subjects(sample_size: int):
    """Draw random Subject vertices from the 'gnd' graph.

    Args:
        sample_size: Number of subjects to draw. Zero or negative values
            (e.g. when a document already has more subjects than the pool
            size) return an empty list instead of sending an invalid LIMIT
            to the database.

    Returns:
        A list of parsed Subject vertices (dicts), at most ``sample_size`` long.
    """
    sample_size = int(sample_size)
    if sample_size <= 0:
        return []

    connection = connect_to_db()
    try:
        with connection.cursor() as cursor:
            # LIMIT is bound as a parameter rather than formatted into the SQL.
            cursor.execute(
                """
                SET search_path TO ag_catalog;
                SELECT * FROM
                (
                    SELECT * FROM cypher('gnd', $$
                        MATCH (s:Subject)
                        RETURN s
                    $$) AS (s agtype)
                ) AS subquery
                ORDER BY random()
                LIMIT %s
                """,
                (sample_size,),
            )
            rows = cursor.fetchall()
    finally:
        # Close the connection even if the query raises.
        connection.close()

    # Each row carries the vertex as JSON text followed by "::vertex".
    return [json.loads(row[0].replace("::vertex", "")) for row in rows]

# For every sampled document, build a candidate pool made of randomly drawn
# subjects plus the document's own subjects (appended last).
test_data = []
for doc in tqdm(docs):
    linked_subjects = doc[1]
    # Clamp at zero: a document with more than SUBJ_SAMPLE_NR subjects would
    # otherwise request a negative number of rows and make the query fail.
    subj_to_sample = max(SUBJ_SAMPLE_NR - len(linked_subjects), 0)
    sample = sample_random_subjects(subj_to_sample)
    sample.extend(linked_subjects)
    test_data.append((doc, sample))
    

100%|██████████| 4/4 [00:00<00:00,  7.25it/s]


In [8]:
from torch import nn

class FineTunedModel(nn.Module):
    """Wrap an encoder so that ``forward`` returns mask-aware mean-pooled embeddings."""

    def __init__(self, base_model):
        """Store ``base_model``; it is called with ``input_ids`` and ``attention_mask`` keywords."""
        super().__init__()
        self.base_model = base_model

    def _mean_pooling(self, model_output, attention_mask):
        """Average the first element of ``model_output`` over dim 1, weighted by the mask.

        Assumes ``model_output[0]`` holds per-token embeddings with one more
        trailing dimension than ``attention_mask`` -- TODO confirm shapes.
        """
        token_embeddings = model_output[0]
        # Broadcast the mask across the embedding dimension.
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        masked_sum = (token_embeddings * mask).sum(dim=1)
        # The clamp keeps the division finite when a row is fully masked.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return masked_sum / token_counts

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return one pooled vector per input row."""
        model_output = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        return self._mean_pooling(model_output, attention_mask)

In [9]:
from transformers import AutoTokenizer, AutoModel
import torch
import base64


def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean over dim 1 of ``model_output[0]``.

    Positions where ``attention_mask`` is 0 contribute nothing to the sum; the
    divisor is clamped to ``1e-9`` so fully masked rows do not divide by zero.
    Assumes ``model_output[0]`` has one more trailing dimension than
    ``attention_mask`` -- TODO confirm shapes.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts


tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# The checkpoint holds the whole pickled FineTunedModel (not just a state
# dict), so the FineTunedModel class must already be defined and
# weights_only has to be False. It is passed explicitly because the implicit
# default triggers a warning and differs between torch versions.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints
# from a trusted source.
# map_location="cpu" matches how the model is used below: tokenizer output is
# fed to it directly, without moving tensors to another device.
model = torch.load("fine_tuned_model_complete.pth", map_location="cpu", weights_only=False)
model.eval()

  model = torch.load("fine_tuned_model_complete.pth")


FineTunedModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(250037, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, ele

In [None]:
import torch.nn.functional as F
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import torch
from datetime import datetime
import urllib

def compute_similarity(base, targets):
    """Cosine similarity between ``base`` and every element of ``targets``.

    Args:
        base: Reference tensor.
        targets: Iterable of tensors, each compared with ``base`` along dim 0.

    Returns:
        A list with one similarity tensor per target, in input order.
    """
    return [F.cosine_similarity(base, candidate, dim=0) for candidate in targets]

# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded; import it explicitly so this cell does not depend on another
# library having imported it first.
import urllib.parse

# Candidate pool size per document for this run (true subjects + random fillers).
SUBJ_SAMPLE_NR = 500
# Cut-offs of the ranked candidate list at which hits are counted.
pos_thresh1 = 20
pos_thresh2 = 5
pos_thresh3 = 100


def _subject_text(subject):
    """Return the URL-decoded "name classification_name" text of a subject vertex."""
    props = subject['properties']
    return urllib.parse.unquote(props['name']) + " " + urllib.parse.unquote(props['classification_name'])


def _hit_percentage(actual, ranked, k):
    """Percentage (rounded to 2 decimals) of ``actual`` items present in the first ``k`` of ``ranked``."""
    if not actual:
        # Nothing to find; avoid dividing by zero.
        return 0.0
    top_k = set(ranked[:k])
    hits = sum(1 for item in actual if item in top_k)
    return round(hits / len(actual) * 100, 2)


new_docs = random_sample_documents(50)

test_data = []
for doc in tqdm(new_docs):
    # Clamp at zero so a document with more subjects than the pool size does
    # not request a negative number of random subjects.
    subj_to_sample = max(SUBJ_SAMPLE_NR - len(doc[1]), 0)
    sample = sample_random_subjects(subj_to_sample)
    sample.extend(doc[1])
    test_data.append((doc, sample))

final5 = 0
final20 = 0
final100 = 0
for sample in tqdm(test_data):
    # sample is ((document, linked_subjects), candidate_subjects).
    document, linked_subjects = sample[0]
    document_title = (
        urllib.parse.unquote(document['properties']['title'])
        + " "
        + urllib.parse.unquote(document['properties']['content'])
    )
    candidate_sentences = [_subject_text(a) for a in sample[1]]
    actual_related = [_subject_text(a) for a in linked_subjects]

    # The document text goes first so that embeddings[0] is the query and
    # embeddings[1:] line up with candidate_sentences.
    all_sentences = [document_title] + candidate_sentences
    tokenized_sentences = tokenizer(all_sentences, padding=True, truncation=True, return_tensors='pt')
    token_ids = tokenized_sentences["input_ids"]
    att_mask = tokenized_sentences["attention_mask"]
    with torch.no_grad():
        embeddings = model(token_ids, att_mask)

    # Rank candidates by descending similarity to the document (stable for ties).
    scores = compute_similarity(embeddings[0], embeddings[1:])
    ranking = sorted(range(len(scores)), key=lambda idx: float(scores[idx]), reverse=True)
    top_scored = [candidate_sentences[idx] for idx in ranking]

    final5 += _hit_percentage(actual_related, top_scored, pos_thresh2)
    final20 += _hit_percentage(actual_related, top_scored, pos_thresh1)
    final100 += _hit_percentage(actual_related, top_scored, pos_thresh3)

# NOTE(review): the value reported is (true subjects found in the top k) /
# (number of true subjects), i.e. recall@k, although the messages say
# "Precision". The wording is left as-is to match previously recorded output.
doc_count = len(test_data) or 1  # avoid ZeroDivisionError when nothing was sampled
print(f"Precision of the sentence transformer over first 5 matches: {final5/doc_count:.2f}")
print(f"Precision of the sentence transformer over first 20 matches: {final20/doc_count:.2f}")
print(f"Precision of the sentence transformer over first 100 matches: {final100/doc_count:.2f}")




100%|██████████| 50/50 [00:08<00:00,  5.85it/s]
100%|██████████| 50/50 [18:08<00:00, 21.76s/it]

Precision of the sentence transformer over first 5 matches: 39.22
Precision of the sentence transformer over first 20 matches: 68.34
Precision of the sentence transformer over first 100 matches: 90.24





In [14]:
import urllib
def grab_top_embeddings(doc_emb, n):
    """Return the ``label_code`` values of the ``n`` closest rows in ``label_embeddings``.

    Args:
        doc_emb: Query embedding in the textual form accepted by the
            ``::vector`` cast (the caller passes a string like "[0.1,0.2,...]").
        n: Maximum number of label codes to return.

    Returns:
        A list of label codes ordered by increasing ``<=>`` distance to
        ``doc_emb`` (presumably pgvector's cosine distance -- confirm).
    """
    # Both values are bound as parameters instead of being formatted into the
    # SQL text, so the embedding string cannot alter the statement.
    query = """
        SELECT label_code
        FROM label_embeddings
        ORDER BY embedding <=> %s::vector
        LIMIT %s;
    """

    conn = connect_to_db()
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, (doc_emb, n))
            top_labels = cursor.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    return [row[0] for row in top_labels]

# `import urllib` alone does not guarantee that the `parse` submodule is
# loaded; import it explicitly so this cell works on a fresh kernel.
import urllib.parse

new_docs = random_sample_documents(1000)
with tqdm(new_docs, desc="Starting..") as pbar:
    overall_acc = 0
    # Iterating the bar itself advances it; no manual update() is needed.
    for i, doc in enumerate(pbar):
        document = doc[0]
        related_subjects = [a['properties']['code'] for a in doc[1]]
        document_content = (
            urllib.parse.unquote(document['properties']['title'])
            + " "
            + urllib.parse.unquote(document['properties']['content'])
        )
        tokenized_text = tokenizer([document_content], padding=True, truncation=True, return_tensors='pt')
        token_ids = tokenized_text["input_ids"]
        att_mask = tokenized_text["attention_mask"]
        with torch.no_grad():
            embeddings = model(token_ids, att_mask)

        # Serialise the embedding as "[v1,v2,...]" for the ::vector cast in the query.
        embedding_list = embeddings.cpu().numpy().flatten().tolist()
        embedding_str = '[' + ','.join(map(str, embedding_list)) + ']'

        # Set membership keeps the per-subject lookup constant-time.
        top_embeddings = set(grab_top_embeddings(embedding_str, 100))
        hits = sum(1 for subject_code in related_subjects if subject_code in top_embeddings)

        # Share of the document's true subjects found among the retrieved
        # labels, as a percentage; guarded against an empty subject list.
        doc_acc = round(hits / len(related_subjects) * 100, 2) if related_subjects else 0.0
        overall_acc += doc_acc

        pbar.set_description(f"Current computed accuracy: {overall_acc / (i+1):.2f}")

    # NOTE(review): this is (true subjects retrieved) / (true subjects), i.e.
    # recall@100, reported under the name "accuracy"; wording kept unchanged.
    doc_count = len(new_docs) or 1  # avoid ZeroDivisionError on an empty sample
    print(f"Actual subjects accuracy in top 100 closest in the dataset: {overall_acc/doc_count:.2f}")
        

Current computed accuracy: 21.22: 100%|██████████| 1000/1000 [02:59<00:00,  5.58it/s]

Actual subjects accuracy in top 100 closest in the dataset: 21.22



