## sec 1

In [17]:
# Extracting data from the OCR output
rankings = {
    "Algorithm A": {
        "Query 1": [1, 2, 6, 5, 9, 10, 7, 4, 8, 3],
        "Query 2": [1, 2, 4, 5, 7, 10, 8, 3, 9, 6]
    },
    "Algorithm B": {
        "Query 1": [10, 9, 8, 7, 5, 4, 6, 2, 1, 3],
        "Query 2": [1, 3, 2, 4, 5, 6, 8, 7, 10, 9]
    },
    "Ground truth": {
        "Query 1": [1, 4, 5],
        "Query 2": [3, 6]
    }
}

# Helper functions to calculate metrics
def calculate_precision_at_k(ranking, ground_truth, k):
    relevant_docs = set(ground_truth)
    retrieved_docs_at_k = ranking[:k]
    relevant_at_k = [doc for doc in retrieved_docs_at_k if doc in relevant_docs]
    precision_at_k = len(relevant_at_k) / k
    return precision_at_k

def calculate_average_precision(ranking, ground_truth):
    relevant_docs = set(ground_truth)
    relevant_found = 0
    precision_sum = 0

    for rank, doc in enumerate(ranking, start=1):
        if doc in relevant_docs:
            relevant_found += 1
            precision_sum += relevant_found / rank

    average_precision = precision_sum / len(relevant_docs)
    return average_precision

def calculate_reciprocal_rank(ranking, ground_truth):
    relevant_docs = set(ground_truth)
    for rank, doc in enumerate(ranking, start=1):
        if doc in relevant_docs:
            return 1 / rank
    return 0  # If no relevant document is found

def calculate_mean_reciprocal_rank(rankings, ground_truths):
    rr_sum = 0
    for query in ground_truths:
        rr_sum += calculate_reciprocal_rank(rankings[query], ground_truths[query])
    mean_reciprocal_rank = rr_sum / len(ground_truths)
    return mean_reciprocal_rank

# Calculating the required metrics
p_at_5_query_1_A = calculate_precision_at_k(rankings["Algorithm A"]["Query 1"], rankings["Ground truth"]["Query 1"], 5)
avg_precision_query_1_A = calculate_average_precision(rankings["Algorithm A"]["Query 1"], rankings["Ground truth"]["Query 1"])
reciprocal_rank_query_2_B = calculate_reciprocal_rank(rankings["Algorithm B"]["Query 2"], rankings["Ground truth"]["Query 2"])
mean_reciprocal_rank_B = calculate_mean_reciprocal_rank(rankings["Algorithm B"], rankings["Ground truth"])

# Comparing Average Precision between algorithms (for Query 1 only, as no data for Query 2)
avg_precision_query_1_B = calculate_average_precision(rankings["Algorithm B"]["Query 1"], rankings["Ground truth"]["Query 1"])
avg_precision_query_2_A = calculate_average_precision(rankings["Algorithm A"]["Query 2"], rankings["Ground truth"]["Query 2"])
avg_precision_query_2_B = calculate_average_precision(rankings["Algorithm B"]["Query 2"], rankings["Ground truth"]["Query 2"])
mean_avg_precision_A = (avg_precision_query_1_A + avg_precision_query_2_A) / 2
mean_avg_precision_B = (avg_precision_query_1_B + avg_precision_query_2_B) / 2
higher_map = "Algorithm A" if mean_avg_precision_A > mean_avg_precision_B else "Algorithm B"

print("mean average precision for Algorithm A: {:.3f}".format(mean_avg_precision_A))
print("mean average precision for Algorithm B: {:.3f}".format(mean_avg_precision_B))

p_at_5_query_1_A, avg_precision_query_1_A, reciprocal_rank_query_2_B, mean_reciprocal_rank_B, higher_map


mean average precision for Algorithm A: 0.394
mean average precision for Algorithm B: 0.353


(0.4, 0.625, 0.5, 0.35, 'Algorithm A')

In [18]:
import math

# Given values
Le3 = 2  # Number of incoming links to entity e3
Le6 = 4  # Number of incoming links to entity e6
Le3_inter_e6 = 1  # Number of common incoming links
E = 6  # Total number of entities

# WLM formula calculation using base 2 logs
WLM_e3_e6 = 1 - (math.log(max(Le3, Le6), 2) - math.log(Le3_inter_e6, 2)) / (math.log(E, 2) - math.log(min(Le3, Le6), 2))

WLM_e3_e6


-0.26185950714291506

## section2 

In [19]:
import re
from typing import List, Dict

def create_entity_repr(docs: List[str], window_size: int) -> Dict[str, str]:
    entity_repr = {}

    for doc in docs:
        # Find all entity annotations in the document
        entities = re.findall(r'\[\[(.*?)\|(.*?)\]\]', doc)
        
        # Replace the entity annotations with a placeholder to prevent overlapping
        for entity in entities:
            doc = doc.replace(f'[[{entity[0]}|{entity[1]}]]', f' {entity[1]} ', 1)
        
        # Split the document into terms after replacing entities
        terms = doc.split()
        
        # Iterate over the entities and create the representation
        for entity in entities:
            entity_id, entity_text = entity
            entity_repr[entity_id] = entity_repr.get(entity_id, '')

            # Find all occurrences of the entity text in terms
            for i, term in enumerate(terms):
                if term == entity_text:
                    # Get the window of terms around the entity mention
                    window_terms = terms[max(0, i - window_size):i] + \
                                   terms[i + 1:min(len(terms), i + window_size + 1)]
                    # Add the entity mention itself
                    window_terms.insert(window_size, entity_text)
                    # Add to the entity representation
                    entity_repr[entity_id] += ' ' + ' '.join(window_terms)
                    entity_repr[entity_id] = entity_repr[entity_id].strip()

    return entity_repr

# Test the function with the provided example input
docs_example = [
    "first document that mentions [[entity1|entity-one]] alone",
    "2nd document with [[entity2|ABC]] and [[entity1|entity-one]] together",
    "xxx yyy zzz [[entity2|ABC]] aaa bbb ccc ddd [[entity3|ZZZ]] eee fff",
]

# Expected output with window_size=2
# {
#     "entity1": "that mentions entity one-alone ABC and entity-one together",
#     "entity2": "document with ABC and entity-one yyy zzz ABC aaa bbb",
#     "entity3": "ccc ddd ZZZ eee fff",
# }

# We'll test the function again with the same example input.
create_entity_repr(docs_example, window_size=2)


{'entity1': 'that mentions entity-one alone ABC and entity-one together',
 'entity2': 'document with ABC and entity-one yyy zzz ABC aaa bbb',
 'entity3': 'ccc ddd ZZZ eee fff'}

In [20]:
import random

def generate_skipgram_examples(sentence, window_size=2):
    """
    Generate positive training examples for Skip-gram model from a given sentence.

    :param sentence: A string representing the input sentence.
    :param window_size: The size of the context window.
    :return: A list of tuples, each containing the target word and a context word.
    """
    # Tokenize the sentence
    words = sentence.split()

    # Generate positive context pairs
    positive_pairs = []
    for index, target in enumerate(words):
        # Define the context window range
        start = max(0, index - window_size)
        end = min(len(words), index + window_size + 1)

        # Generate pairs (target, context)
        for context_index in range(start, end):
            if context_index != index:
                positive_pairs.append((target, words[context_index]))

    return positive_pairs

# Example usage
sentence = "The quick brown fox jumps over the lazy dog"
examples = generate_skipgram_examples(sentence)
print(examples)


[('The', 'quick'), ('The', 'brown'), ('quick', 'The'), ('quick', 'brown'), ('quick', 'fox'), ('brown', 'The'), ('brown', 'quick'), ('brown', 'fox'), ('brown', 'jumps'), ('fox', 'quick'), ('fox', 'brown'), ('fox', 'jumps'), ('fox', 'over'), ('jumps', 'brown'), ('jumps', 'fox'), ('jumps', 'over'), ('jumps', 'the'), ('over', 'fox'), ('over', 'jumps'), ('over', 'the'), ('over', 'lazy'), ('the', 'jumps'), ('the', 'over'), ('the', 'lazy'), ('the', 'dog'), ('lazy', 'over'), ('lazy', 'the'), ('lazy', 'dog'), ('dog', 'the'), ('dog', 'lazy')]


## sec 3

In [21]:
import math

def sdm_with_dirichlet_smoothing(query, documents, mu=6, window_size=4, weight_single=0.85, weight_ordered=0.1, weight_unordered=0.05):

    query_terms = query.split()
    query_ordered_pairs = [(query_terms[i], query_terms[i + 1]) for i in range(len(query_terms) - 1)]
    query_unordered_pairs = [(query_terms[i], query_terms[j]) for i in range(len(query_terms)) for j in range(i + 1, min(i + window_size, len(query_terms)))]

    # Initialize scores for each document
    scores = {doc: 0 for doc in documents}

    # Calculate the length of the corpus
    corpus_length = sum(len(doc.split()) for doc in documents)

    # Calculate term and pair frequencies in the corpus
    corpus_term_freq = {}
    corpus_ordered_pair_freq = {}
    corpus_unordered_pair_freq = {}
    for doc in documents:
        words = doc.split()
        for term in query_terms:
            corpus_term_freq[term] = corpus_term_freq.get(term, 0) + words.count(term)
        for pair in query_ordered_pairs:
            corpus_ordered_pair_freq[pair] = corpus_ordered_pair_freq.get(pair, 0) + sum(1 for i in range(len(words) - 1) if words[i] == pair[0] and words[i + 1] == pair[1])
        for i in range(len(words)):
            for j in range(i + 1, min(i + window_size, len(words))):
                pair = (words[i], words[j])
                if pair in query_unordered_pairs or pair[::-1] in query_unordered_pairs:
                    corpus_unordered_pair_freq[pair] = corpus_unordered_pair_freq.get(pair, 0) + 1

    # Scoring documents
    for doc in documents:
        doc_length = len(doc.split())
        for term in query_terms:
            term_freq_in_doc = doc.split().count(term)
            term_prob = (term_freq_in_doc + mu * (corpus_term_freq[term] / corpus_length)) / (doc_length + mu)
            scores[doc] += weight_single * math.log(term_prob)

        for pair in query_ordered_pairs:
            pair_freq_in_doc = sum(1 for i in range(len(doc.split()) - 1) if doc.split()[i] == pair[0] and doc.split()[i + 1] == pair[1])
            pair_prob = (pair_freq_in_doc + mu * (corpus_ordered_pair_freq.get(pair, 0) / corpus_length)) / (doc_length + mu)
            scores[doc] += weight_ordered * math.log(pair_prob)

        for pair in query_unordered_pairs:
            pair_freq_in_doc = 0
            words = doc.split()
            for i in range(len(words)):
                for j in range(i + 1, min(i + window_size, len(words))):
                    if (words[i], words[j]) == pair or (words[j], words[i]) == pair:
                        pair_freq_in_doc += 1
            pair_prob = (pair_freq_in_doc + mu * (corpus_unordered_pair_freq.get(pair, 0) / corpus_length)) / (doc_length + mu)
            scores[doc] += weight_unordered * math.log(pair_prob)

    return scores


In [22]:
docs = [
    "t1 t2 t3 t4 t5 t6",
    "t3 t4 t3 t1 t8 t2 t2 t7",
    "t2 t9 t4 t1 t8 t2 t3 t1 t4"
]
query = "t4 t3"

sdm_with_dirichlet_smoothing(query, docs)




{'t1 t2 t3 t4 t5 t6': -3.5049499402474984,
 't3 t4 t3 t1 t8 t2 t2 t7': -3.264782173133931,
 't2 t9 t4 t1 t8 t2 t3 t1 t4': -3.579170015890382}

In [24]:
a = [1, 0, 0, 1, 1, 0, 1, 1, 0, 1]
b = [1,1,0,1,0,0,1,0,1,1]
import numpy as np

def calculate_similarity(vec1, vec2):
    # Ensure the vectors are NumPy arrays
    vec1, vec2 = np.array(vec1), np.array(vec2)

    # Jaccard Similarity
    intersection = np.sum(np.logical_and(vec1, vec2))
    union = np.sum(np.logical_or(vec1, vec2))
    jaccard_similarity = intersection / union if union != 0 else 0

    # Cosine Similarity
    dot_product = np.dot(vec1, vec2)
    magnitude_vec1 = np.linalg.norm(vec1)
    magnitude_vec2 = np.linalg.norm(vec2)
    cosine_similarity = dot_product / (magnitude_vec1 * magnitude_vec2) if magnitude_vec1 != 0 and magnitude_vec2 != 0 else 0

    return jaccard_similarity, cosine_similarity

# Example usage

jaccard_sim, cosine_sim = calculate_similarity(b, a)
print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarity:", cosine_sim)


Jaccard Similarity: 0.5
Cosine Similarity: 0.6666666666666667


In [None]:
def create_query_rewrites(topics: pd.DataFrame) -> pd.DataFrame:
    """
    Create query rewrites.
    
    Args:
        topics: A dataframe containing the queries for each topic.

    Returns:
        Modified dataframe with the query rewrites.
    """
    rewrites = pd.DataFrame()
    
    # Iterate through each group of topics
    for _, topic in topics.groupby("topic_number"):
        topic.reset_index(inplace=True, drop=True)
        
        # Keep track of the previous query
        previous_query = None
        
        for i, row in topic.iterrows():
            # Skip the first query of each topic since there's no previous query to rewrite from
            if i == 0:
                previous_query = row["query"]
                topic.at[i, "rewrite"] = row["query"]
                continue
            
            # Encode the previous and current query and concatenate them with the separator
            input_text = f"{previous_query} {SEPARATOR} {row['query']}"
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            
            # Generate the rewritten query
            outputs = model.generate(input_ids, max_length=512, early_stopping=True)
            
            # Decode the generated tokens to get the rewritten string
            rewrite = tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Assign the rewrite to the current row
            topic.at[i, "rewrite"] = rewrite
            
            # Update the previous query
            previous_query = rewrite

        # Append the rewrites of the current topic to the main rewrites dataframe
        rewrites = pd.concat([rewrites, topic])
    
    return rewrites
