In [138]:
import fasttext
import fasttext.util
import numpy as np
import heapq
import pandas as pd
from scipy.optimize import linear_sum_assignment
import random
from pprint import pprint

# Load FastText embeddings.
# The English model is requested with if_exists='ignore' (presumably skipping the
# download when the file is already present — confirm against the fasttext docs),
# then the 'cc.en.300.bin' binary is loaded and kept as the notebook-wide `en_model`.
fasttext.util.download_model('en', if_exists='ignore')  # English model
en_model = fasttext.load_model('cc.en.300.bin')

In [2]:
# Reduce the embedding dimensionality to 128. The return value is not assigned, so
# this cell relies on reduce_model modifying `en_model` in place — TODO confirm
# against the fasttext documentation.
fasttext.util.reduce_model(en_model, 128)

<fasttext.FastText._FastText at 0x10b788280>

In [55]:
import json
from pprint import pprint
from functools import lru_cache
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import Levenshtein

In [4]:
# Shared Porter stemmer instance (used by get_stem and by the similarity-table
# generation cell).
stemmer = PorterStemmer()
# NLTK resources required by word_tokenize ('punkt') and stopwords.words ('stopwords').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tomi_owolabi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tomi_owolabi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Locations of the precomputed word-distance table, one entry per line:
# SIM_WORDS_PATH holds the vocabulary, SIM_VALUES_PATH the flattened
# len(words) x len(words) distance values written by the generation cell.
SIM_VALUES_PATH = "similarity_data/sim.values.txt"
SIM_WORDS_PATH = "similarity_data/sim.words.txt"

In [6]:
class WordDistance():
    """Cosine distance between words, backed by an optional precomputed table.

    When both `words_path` and `sim_path` are given, the vocabulary (one word
    per line) and the flattened square distance table (one float per line,
    row-major: ``row * num_words + col``) are loaded eagerly. Word pairs that
    are not both in the vocabulary fall back to the embedding model.

    Args:
        emb_model: Object exposing ``get_word_vector(word)``.
        words_path: Optional path to the vocabulary file.
        sim_path: Optional path to the flattened distance table.
    """

    def __init__(self, emb_model, words_path=None, sim_path=None):
        self.model = emb_model
        self.words = []
        self.sim_values = []
        if words_path and sim_path:
            with open(words_path, "r") as f:
                # Strip line endings; otherwise every stored word ends in "\n"
                # and no lookup can ever match it.
                self.words = [line.strip() for line in f]
            with open(sim_path, "r") as f:
                # Parse to float so cached and computed distances share a type.
                self.sim_values = [float(line) for line in f]
        self.num_words = len(self.words)
        # Word -> row/column index; the first occurrence wins, as list.index would.
        self._word2idx = {}
        for idx, word in enumerate(self.words):
            self._word2idx.setdefault(word, idx)

    def embed_word(self, word):
        """Get the FastText embedding for a word."""
        return self.model.get_word_vector(word)

    @staticmethod
    def cosine_distance(vec1, vec2):
        """Compute cosine distance between two vectors."""
        # The epsilon keeps the division finite when either vector has zero norm.
        return 1 - np.dot(vec1, vec2) / ((np.linalg.norm(vec1) * np.linalg.norm(vec2)) + 1e-8)

    # NOTE: the cache key includes `self`, so cached entries keep the instance alive.
    @lru_cache(maxsize=1024)
    def _word_cosine_distance(self, word1, word2):
        """Embed both words and return their cosine distance rounded to 6 decimals."""
        word1_emb = self.embed_word(word1)
        word2_emb = self.embed_word(word2)
        return round(self.cosine_distance(word1_emb, word2_emb), 6)

    def c_word_cosine_distance(self, word1, word2):
        """Return the distance for a word pair, preferring the precomputed table.

        Falls back to the embedding-based distance when either word is not in
        the loaded vocabulary.
        """
        w1_index = self._word2idx.get(word1)
        w2_index = self._word2idx.get(word2)
        # Explicit None checks: index 0 is a valid table position.
        if w1_index is None or w2_index is None:
            return self._word_cosine_distance(word1, word2)
        return self.sim_values[w1_index * self.num_words + w2_index]

In [7]:
def load_word_data(words_path=SIM_WORDS_PATH, sim_path=SIM_VALUES_PATH):
    """Read the vocabulary and the flattened distance table from disk.

    Args:
        words_path: File with one word per line.
        sim_path: File with one float per line.

    Returns:
        (words, sim_values): whitespace-stripped words and parsed floats.
        Both are empty lists when either path is falsy.
    """
    if not (words_path and sim_path):
        return [], []

    with open(words_path, "r") as words_file:
        vocabulary = [line.strip() for line in words_file]
    with open(sim_path, "r") as values_file:
        distances = list(map(float, values_file))
    return vocabulary, distances

In [None]:
# Module-level state shared by the distance helpers defined in this cell.
__model = en_model
__words, __sim_values = load_word_data(words_path=SIM_WORDS_PATH, sim_path=SIM_VALUES_PATH)
__num_words = len(__words)
# Word -> position in the vocabulary, used to index the flattened table.
__word2idx = dict(zip(__words, range(__num_words)))

# Table hit / miss counters, updated by cached_word_cosine_distance.
__hits = 0
__misses = 0

def embed_word(word):
    """Return the vector the module-level embedding model assigns to `word`."""
    vector = __model.get_word_vector(word)
    return vector

def cosine_distance(vec1, vec2):
    """Return 1 minus the cosine similarity of `vec1` and `vec2`.

    A small epsilon is added to the product of the norms so that a zero-norm
    input yields a finite result instead of a division by zero.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    similarity = np.dot(vec1, vec2) / (norm_product + 1e-8)
    return 1 - similarity

@lru_cache(maxsize=1024)
def word_cosine_distance(word1, word2):
    """Embed both words and return the cosine distance between the vectors.

    Results are memoised per (word1, word2) pair for up to 1024 pairs.
    """
    first_vec, second_vec = (embed_word(w) for w in (word1, word2))
    return cosine_distance(first_vec, second_vec)

@lru_cache(maxsize=10000)
def get_stem(word):
    """Return the stem of `word` from the shared stemmer, memoised per distinct word."""
    stemmed = stemmer.stem(word)
    return stemmed

@lru_cache(maxsize=2048)
def cached_word_cosine_distance(word1, word2):
    """Return the cosine distance for a word pair, preferring the precomputed table.

    When both words are in the loaded vocabulary, the value is read from the
    flattened table at ``w1_index * __num_words + w2_index`` and counted as a
    hit. Otherwise the distance is computed from the embeddings and counted as
    a miss.

    Because this function is itself wrapped in ``lru_cache``, the hit/miss
    counters only advance the first time a given pair is seen.
    """
    global __hits, __misses
    w1_index = __word2idx.get(word1)
    w2_index = __word2idx.get(word2)
    # Compare against None explicitly: index 0 is a valid table position, and a
    # plain truthiness test would send the first vocabulary word to the slow path.
    if w1_index is not None and w2_index is not None:
        __hits += 1
        return __sim_values[w1_index * __num_words + w2_index]

    __misses += 1
    return word_cosine_distance(word1, word2)

In [9]:
# Number of entries in the flattened distance table; pair lookups index it as
# w1_index * len(__words) + w2_index.
len(__sim_values)

45927729

In [10]:
def clean_text(text):
    """
    Removes stop words and punctuation from the given text.

    A token is treated as punctuation when every character in it is an ASCII
    punctuation character, so multi-character tokens the tokenizer emits for
    quotes, ellipses and dashes are removed as well as single marks.

    Args:
        text (str): The input text.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    punctuation = set(string.punctuation)

    def _is_punctuation(token):
        # True for ".", "``", "''", "...", "--", etc.
        return all(char in punctuation for char in token)

    filtered_tokens = [
        token for token in tokens
        if token.lower() not in stop_words and not _is_punctuation(token)
    ]

    return ' '.join(filtered_tokens)

In [11]:
import zipfile

def read_file_from_zip(zip_path, file_name):
    """
    Return the UTF-8 decoded content of one member of a zip archive.

    Only the requested member is read; nothing is extracted to disk.

    Args:
        zip_path (str): Path to the zip file.
        file_name (str): Name of the member inside the archive.

    Returns:
        str: The member's content decoded as UTF-8.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        raw_bytes = archive.read(file_name)
    return raw_bytes.decode('utf-8')

In [140]:
# Zip archives of the knowledge store. The train and test archives are split by
# claim-id range; the numeric suffix of each constant is the upper claim id that
# get_knowledge_store_for_claim routes to that archive.
# NOTE(review): all but the dev path are absolute, machine-specific paths.
DEV_KNOWLEDGE_STORE = "baseline/AVeriTeC/data_store/knowledge_store/dev_knowledge_store.zip"
TRAIN_KNOWLEDGE_STORE_999 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/train/train_0_999.zip"
TRAIN_KNOWLEDGE_STORE_1999 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/train/train_1000_1999.zip"
TRAIN_KNOWLEDGE_STORE_3067 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/train/train_2000_3067.zip"
TEST_KNOWLEDGE_STORE_499 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/test_updated/output_test_0_499.zip"
TEST_KNOWLEDGE_STORE_999 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/test_updated/output_test_500_999.zip"
TEST_KNOWLEDGE_STORE_1499 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/test_updated/output_test_1000_1499.zip"
TEST_KNOWLEDGE_STORE_1999 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/test_updated/output_test_1500_1999.zip"
TEST_KNOWLEDGE_STORE_2214 = "/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data_store/knowledge_store/test_updated/output_test_2000_2214.zip"

In [None]:
def _knowledge_store_member(claim_id, div):
    """Return (zip_path, member_name) locating a claim's JSON-lines file."""
    if div == "train":
        if claim_id <= 999:
            zip_path = TRAIN_KNOWLEDGE_STORE_999
        elif claim_id <= 1999:
            zip_path = TRAIN_KNOWLEDGE_STORE_1999
        else:
            zip_path = TRAIN_KNOWLEDGE_STORE_3067
        return zip_path, f"{claim_id}.json"
    if div == "val":
        # Dev files live under an "output_dev/" folder inside the archive.
        return DEV_KNOWLEDGE_STORE, f"output_dev/{claim_id}.json"
    if claim_id <= 499:
        zip_path = TEST_KNOWLEDGE_STORE_499
    elif claim_id <= 999:
        zip_path = TEST_KNOWLEDGE_STORE_999
    elif claim_id <= 1499:
        zip_path = TEST_KNOWLEDGE_STORE_1499
    elif claim_id <= 1999:
        zip_path = TEST_KNOWLEDGE_STORE_1999
    else:
        zip_path = TEST_KNOWLEDGE_STORE_2214
    return zip_path, f"{claim_id}.json"


def get_knowledge_store_for_claim(claim_id, div="train"):
    """Load the knowledge-store documents for one claim.

    The claim's file is read from the zip archive for the given split and
    parsed as JSON lines. Blank lines are skipped; lines that fail to parse
    are reported with ``print`` and skipped.

    Args:
        claim_id: Numeric claim id; selects the archive and the member name.
        div: Dataset split, one of "train", "val" or "test".

    Returns:
        list: One parsed JSON value per valid line, or ``None`` when the
        archive or member could not be read (the error is printed).

    Raises:
        ValueError: If `div` is not one of the supported splits.
    """
    if div not in ("train", "val", "test"):
        # Previously an unknown split fell through and failed later with an
        # unbound-variable error; fail early with a clear message instead.
        raise ValueError(f"Unknown div {div!r}; expected 'train', 'val' or 'test'")

    try:
        zip_path, member_name = _knowledge_store_member(claim_id, div)
        claim_file = read_file_from_zip(zip_path, member_name)
    except Exception as e:
        # Best effort: a missing archive or member is reported, not raised.
        print(e)
        return None

    references = []
    for line in claim_file.splitlines():
        if not line.strip():
            # Blank lines are not records; skip them without logging noise.
            continue
        try:
            references.append(json.loads(line))
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(e)
    return references

In [102]:
def optimal_matching(query_embeddings, text_embeddings):
    """Score a query against a text by minimum-cost assignment of their embeddings.

    Builds the pairwise cosine-distance matrix (rows: query, columns: text),
    solves the assignment problem on it, and returns the negated total
    distance of the chosen pairs, so a higher score means a closer match.
    """
    n_query, n_text = len(query_embeddings), len(text_embeddings)
    pairwise = [
        cosine_distance(q_emb, t_emb)
        for q_emb in query_embeddings
        for t_emb in text_embeddings
    ]
    # Reshape explicitly so empty inputs still produce a 2-D matrix.
    costs = np.array(pairwise, dtype=float).reshape(n_query, n_text)

    rows, cols = linear_sum_assignment(costs)
    return -costs[rows, cols].sum()


cost_matrix = None  # scratch buffer reused between calls; reallocated when the shape changes
def word_optimal_matching(query_words, text_words):
    """Score how well `query_words` can be matched one-to-one onto `text_words`.

    Fills a (len(query_words), len(text_words)) matrix with
    cached_word_cosine_distance values, solves the assignment problem on it,
    and returns the negated sum of the chosen distances, so a higher score
    means a closer match.

    Args:
        query_words: Sequence of query tokens.
        text_words: Sequence of tokens from the candidate window.

    Returns:
        The negated total assignment cost; 0.0 when either sequence is empty.
    """
    global cost_matrix
    shape = (len(query_words), len(text_words))
    if 0 in shape:
        # Nothing to match; previously this crashed on max()/min() of an empty assignment.
        return 0.0
    if cost_matrix is None or cost_matrix.shape != shape:
        cost_matrix = np.zeros(shape)
    # Every cell is overwritten below, so a reused buffer needs no reset.
    for i, word1 in enumerate(query_words):
        for j, word2 in enumerate(text_words):
            cost_matrix[i, j] = cached_word_cosine_distance(word1, word2)

    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    return -cost_matrix[row_ind, col_ind].sum()

def find_top_n_matches(query_text, target_text, patch_size=50, overlap=25, top_n=5, ret_string=False):
    """Find the top N scoring spans of `target_text` for `query_text` with a sliding window.

    Both texts are split on whitespace. Windows of `patch_size` words advance
    by ``patch_size - overlap`` words; if the regular stride would leave the
    end of the text unscanned, one extra window anchored at the end is added
    so the tail is always covered by a full-size window.

    Args:
        query_text: Text whose words are matched against each window.
        target_text: Text to scan.
        patch_size: Window length in words (must be positive).
        overlap: Words shared by consecutive windows (must be < patch_size).
        top_n: Maximum number of spans to return.
        ret_string: When True, return span text instead of word offsets.

    Returns:
        A list sorted by descending score of ``(score, start, end)`` tuples, or
        ``(score, span_text)`` tuples when `ret_string` is True. Empty when
        either text contains no words.

    Raises:
        ValueError: If `patch_size` is not positive or `overlap >= patch_size`.
    """
    if patch_size <= 0 or overlap >= patch_size:
        raise ValueError("patch_size must be positive and overlap must be smaller than patch_size")

    target_words = target_text.split()
    query_words = query_text.split()
    if not target_words or not query_words:
        return []

    step = patch_size - overlap
    last_start = max(0, len(target_words) - patch_size)
    starts = list(range(0, last_start + 1, step))
    if starts[-1] != last_start:
        # Anchor a final window at the end so trailing words are not skipped.
        starts.append(last_start)

    scored_windows = (
        (word_optimal_matching(query_words, target_words[start:start + patch_size]), start, start + patch_size)
        for start in starts
    )
    # nlargest returns the best top_n windows already sorted by descending score.
    top_matching = heapq.nlargest(top_n, scored_windows)

    if not ret_string:
        return top_matching
    return [(score, " ".join(target_words[start:end])) for score, start, end in top_matching]
        

In [14]:
# Load the dev split task list from JSON. `dev_tasks` is pre-set to None so the
# name exists even if opening or parsing the file fails.
# NOTE(review): the path is absolute and machine-specific.
dev_tasks = None
with open("/Users/tomi_owolabi/projects/cpsc601/baseline/AVeriTeC/data/dev.json") as f:
    dev_tasks = json.load(f)

In [15]:
def get_claim_by_id(claim_list, claim_id):
    """Return the claim at position `claim_id` in `claim_list`, or None.

    The id is used as a zero-based position. Any position outside
    ``0 <= claim_id < len(claim_list)`` yields None; this includes all
    negative ids, which the previous slice-based lookup handled inconsistently
    (-1 gave None while other negatives returned an element).
    """
    if 0 <= claim_id < len(claim_list):
        return claim_list[claim_id]
    return None

In [16]:
def get_span_text_from_claim_doc(claim_doc, span):
    """Return the words of a document that fall inside a span.

    The document's "url2text" entries are joined with spaces and split on
    whitespace; the last two items of `span` are used as the start and end
    word offsets into that word list.
    """
    bounds = span[-2:]
    words = " ".join(claim_doc.get("url2text")).split()
    selected = words[bounds[0]: bounds[1]]
    return " ".join(selected)

In [17]:
def filter_claim_doc(claim, claim_doc_dict, patch_size):
    """Run span matching for `claim` over the start of a document and return the document.

    The document's "url2text" entries are joined, lower-cased and truncated to
    the first 1500 characters before being scanned with non-overlapping
    windows of `patch_size` words. The match results are currently not stored:
    `claim_doc_dict` is returned unchanged.
    """
    pages = claim_doc_dict.get("url2text")
    snippet = " ".join(pages).lower()[:1500]
    find_top_n_matches(claim, snippet, patch_size=patch_size, overlap=0)
    return claim_doc_dict

In [18]:
# claim_id = 4
# claim = get_claim_by_id(dev_tasks, claim_id)
# pprint(claim.get("label"))
# claim = claim.get("claim")
# claim_knowledge = get_knowledge_store_for_claim(claim_id)
# print(len(claim_knowledge))
# claim_knowledge = filter_claim_doc(claim, claim_knowledge[5])

In [19]:
def process_claim(claim_list, claim_id, div="train", patch_size=96):
    """Run span matching for one claim over all of its knowledge-store documents.

    Args:
        claim_list: List of claim dicts; `claim_id` is used as a position in it.
        claim_id: Position of the claim, also used to locate its knowledge store.
        div: Knowledge-store split passed to get_knowledge_store_for_claim.
            Defaults to "train" for backward compatibility; pass the split
            that `claim_list` actually came from.
        patch_size: Window length in words passed to filter_claim_doc.

    Returns:
        list: The claim's documents after filter_claim_doc; empty when the
        knowledge store could not be read or holds no documents.

    Raises:
        ValueError: If `claim_list` has no claim at `claim_id`.
    """
    claim = get_claim_by_id(claim_list, claim_id)
    if claim is None:
        raise ValueError(f"No claim found at position {claim_id}")
    claim_text = claim.get("claim", "").lower()

    claim_docs = get_knowledge_store_for_claim(claim_id, div)
    if not claim_docs:
        # The loader returns None when the archive or member cannot be read.
        return []
    return [filter_claim_doc(claim_text, doc, patch_size) for doc in claim_docs]

In [20]:
# Report table hit/miss counts with their share of all counted lookups.
# The ':.2%' format multiplies by 100, so the printed figure really is a
# percentage; max(..., 1) only guards the zero-call case and leaves the ratio exact.
_total_calls = __hits + __misses
_rate_denominator = max(_total_calls, 1)
print(f"calls: {_total_calls}")
print(f"hits: {__hits}, {__hits / _rate_denominator:.2%}")
print(f"misses: {__misses}, {__misses / _rate_denominator:.2%}")

calls: 0
hits: 0, 0.00%
misses: 0, 0.00%


In [27]:
import time
# Wall-clock timing of one process_claim run for position 5 of the dev tasks.
# NOTE(review): process_claim calls get_knowledge_store_for_claim with its
# default div="train", so the train archive is read even though dev tasks are
# passed here — confirm this is intended.
start_time = time.time()
g = process_claim(dev_tasks, 5)
end_time = time.time()
print(f"Execution time: {end_time - start_time:.2f} seconds")

Unterminated string starting at: line 1 column 29355 (char 29354)
Expecting value: line 1 column 1 (char 0)
Execution time: 7.05 seconds


In [28]:
# Example Usage
# Demo: score a short passage against a query and print each returned span.
query_text = "graph matching algorithm in bipartite graphs"
target_text = "This is a robust graph matching algorithm for string search. We apply it to find patterns in text efficiently."

top_matches = find_top_n_matches(query_text, target_text)
target_words = target_text.split()  # split once instead of once per match
for score, start, end in top_matches:
    span_text = ' '.join(target_words[start:end])
    print(f"Match score: {score:.4f}, Span: {span_text}")

Match score: -1.3169, Span: This is a robust graph matching algorithm for string search. We apply it to find patterns in text efficiently.


In [139]:
"""
Code for generating similarity_data 
"""

# Vocabulary: the first N entries of the "word" column, lower-cased, stemmed,
# de-duplicated and sorted.
# keep_default_na=False / na_values=[] presumably keep words such as "null" or
# "nan" as strings instead of missing values — confirm against the pandas docs.
N = 10000
words = pd.read_csv("unigram_freq.csv", dtype={"word": 'object'}, keep_default_na=False, na_values=[])["word"]
words = [x.lower() for x in words]
words = words[:N]
words = [stemmer.stem(word) for word in words]
words = sorted(list(set(words)))

# Flattened, row-major distance table: entry idx * len(words) + idx2 holds the
# distance between words[idx] and words[idx2].
# NOTE(review): this calls word_cosine_distance len(words) ** 2 times, and its
# 1024-entry LRU cache cannot hold a full row, so the cell is expensive to run.
similarity_index = [-1] * (len(words) * len(words))
for idx, word in enumerate(words):
    for idx2, word2 in enumerate(words):
        similarity_index[idx * len(words) + idx2] = word_cosine_distance(word, word2)

# NOTE(review): values are stored with two decimals, so table lookups are
# coarser than the full-precision distances computed on a table miss.
with open("similarity_data/sim.values.txt", "w") as f:
    for sim in similarity_index:
        f.write(f"{sim:.2f}\n")

# One vocabulary word per line, in the same order used to index the table.
with open("similarity_data/sim.words.txt", "w") as f: 
    for word in words:
        f.write(f"{word}\n")

In [22]:
# similarity_index = None
# with open("similarity_index.txt", "r") as f:
#     similarity_index = [float(x) for x in f.readlines()]

# similarity_dict = {}
# for idx, word in enumerate(words):
#     for idx2, word2 in enumerate(words):
#         similarity_dict[f"{word}#{word2}"] = similarity_index[idx * len(words) + idx2]

In [28]:
from docx import Document

def read_docx(file_path):
    """
    Return the text of a .docx file, one non-blank paragraph per line.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        str: Paragraph texts joined by newlines; paragraphs that are empty or
        contain only whitespace are omitted.
    """
    document = Document(file_path)
    non_blank = [para.text for para in document.paragraphs if para.text.strip()]
    return '\n'.join(non_blank)

In [29]:
# Path to the .docx document read by the (commented-out) read_docx experiment below.
# NOTE(review): absolute, machine-specific path.
ZOL = "/Users/tomi_owolabi/projects/starting_afresh/Sandoz_Concordia_chatbot/product_monographs/Zoledronic Acid - Z PMe 20170721.docx"

In [None]:
# zol_text = read_docx(ZOL)

In [None]:
# s_strings = ["How does zoledronic acid work in the body?",
# "What is the pharmacological mechanism behind zoledronic acid?",
# "By what mechanism does zoledronic acid exert its effects?",
# "What is the mode of action of zoledronic acid?",
# "How does zoledronic acid produce its therapeutic effects?"]

In [None]:
# query_text = "what are the contraindications of zoledronic acid"
# t = find_top_n_matches(query_text, target_text=zol_text, ret_string=True, top_n=10, overlap=0, patch_size=120)
# pprint(t, width=120)

[(np.float64(-0.46955166902160644),
  'acid is contraindicated in breast-feeding women (see CONTRAINDICATIONS). There is no clinical experience with '
  'zoledronic acid in lactating women and it is not known whether zoledronic acid passes into breast milk. A study in '
  'lactating rats has shown that another bisphosphonate, pamidronate, passes into the milk. Mothers treated with '
  'zoledronic acid should therefore not breast feed their infants. Fertility The fertility was decreased in rats dosed '
  'subcutaneously with 0.01 mg/kg/day of zoledronic acid, with systemic exposures of 0.12 times the human systemic '
  'exposure following an intravenous dose of 4 mg (based on AUC). The effects observed included an increase in '
  'pre-implantation losses and a decrease in the number of implantations and live foetuses. There are no data '
  'available in humans.'),
 (np.float64(-0.5317228898010253),
  'acid is administered in conjunction with drugs that are potentially nephrotoxic (e.g. 

In [137]:
# Spot check of the embedding-based (non-table) distance for a single word pair.
word_cosine_distance("theater", "sing")

np.float32(0.73509175)

In [None]:
def get_top_sentences_for_claim(div="train"):
    
    