## Adding system paths

In [1]:
# `sc` is not defined anywhere in this notebook; presumably the SparkContext
# injected by the PySpark kernel/shell — TODO confirm how the kernel is started
sc

In [2]:
import os
import sys

# Make the project packages (e.g. `assignment3`) importable from this notebook.
# The default is the original author's checkout; set the LWMD_ROOT environment
# variable to point at a different checkout without editing the notebook.
REPO_ROOT = os.environ.get('LWMD_ROOT', '/home/sebaq/Documents/GitHub/LWMD_assignments')

# Guard against piling up duplicate entries when the cell is re-run
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [3]:
# Name of the dataset to process; used below to load both the document vectors
# and the exact-solution file
DATA_NAME = 'toy'

In [5]:
# Similarity threshold: passed to get_documents_info() and used to filter the
# computed pairs (a pair is kept only when its similarity is strictly greater)
SIMILARITY = 0.9

## Loading document info

In [6]:
from assignment3.model.documents import DocumentVectors

In [7]:
# Load the vectorized documents of the selected dataset.
# NOTE(review): idf_order=True presumably orders terms by inverse document
# frequency — confirm in DocumentVectors
docs_vet = DocumentVectors(data_name=DATA_NAME, idf_order=True)

Loading vectors... 
Loading mapping... 
Loading inverse mapping... 


In [8]:
# Show the collection's repr (dataset name and number of documents)
docs_vet

toy Vector Documents [10]

In [9]:
# Per-document input for the map step. document_map() unpacks each item as
# (doc-id, term-threshold, sparse-entries).
# NOTE(review): the threshold presumably depends on `similarity` — confirm in
# DocumentVectors.get_documents_info
docs_info = docs_vet.get_documents_info(similarity=SIMILARITY)

## Spark

In [10]:
# Distribute the document information as a Spark RDD.
# NOTE(review): `rdd` is not referenced by any later visible cell; the pipeline
# below builds and uses an equivalent `myrdd` instead — consider keeping only one.
rdd = sc.parallelize(docs_info)

In [11]:
def document_map(doc_info: tuple[str, int, list[tuple[int, float]]]) -> list[tuple[int, tuple[str, int, list[tuple[int, float]]]]]:
    """
    Map step: emit the document once for every term above its threshold.

    :param doc_info: triple (doc-id, term-threshold, sparse-entries), where
        sparse-entries is a list of (term-id, value) pairs
    :return: one key-value pair (term-id, (doc-id, term-id, sparse-entries)) for
        each entry whose term-id is strictly greater than term-threshold; every
        emitted value carries the document's full list of sparse entries
    """

    identifier, threshold, entries = doc_info

    emitted = []

    for term, _weight in entries:
        # Only terms strictly above the threshold are used as keys
        if term > threshold:
            emitted.append((term, (identifier, term, entries)))

    return emitted

In [24]:
def documents_reduce(docs: list[tuple[str, int, list[tuple[int, float]]]]) -> list[tuple[tuple[str, str], float]]:
    """
    Reduce function
    :param docs: documents grouped under the same term-id key, each one a triple
        (doc-id ; term-id ; list(term-id, value)) as emitted by the map step
    :return: list of pairs ((doc1-id, doc2-id), similarity), where similarity is
        the dot product of the two sparse vectors. A pair is emitted only when the
        group's term-id is the largest term shared by the two documents, so the
        same pair is not recomputed under every term they have in common; pairs
        with no shared term are skipped.
    """

    # One term -> value lookup per document, built once instead of once per pair
    vectors = [dict(sparse_entries) for _, _, sparse_entries in docs]

    pairs = []

    for i, (doc1_id, term_id, _) in enumerate(docs):

        vector1 = vectors[i]

        for j in range(i + 1, len(docs)):

            doc2_id = docs[j][0]
            vector2 = vectors[j]

            shared_terms = vector1.keys() & vector2.keys()

            # Only the group keyed by the largest shared term is responsible
            # for this pair; every other group skips it
            if not shared_terms or max(shared_terms) != term_id:
                continue

            # Accumulate in ascending term order so the result is deterministic
            sim = 0.0
            for term in sorted(shared_terms):
                sim += vector1[term] * vector2[term]

            pairs.append(((doc1_id, doc2_id), sim))

    return pairs

In [25]:
# Distribute the per-document information as a Spark RDD; this is the input of
# the map/reduce pipeline in the next cell
myrdd = sc.parallelize(docs_info)

In [26]:
# Similarity pipeline:
#   1. map each document to (term-id, document) pairs
#   2. group the documents that were emitted under the same term-id into a list
#   3. compute the similarity of the document pairs inside each group
#   4. keep the pairs whose similarity is strictly above the threshold
#   5. drop the term-id key and the score, keeping each surviving pair once
out = (
    myrdd
    .flatMap(document_map)
    .combineByKey(
        lambda doc: [doc],
        lambda grouped, doc: grouped + [doc],
        lambda left, right: left + right,
    )
    .flatMapValues(documents_reduce)
    .filter(lambda keyed: keyed[1][1] > SIMILARITY)
    .map(lambda keyed: keyed[1][0])
    .distinct()
)

In [27]:
# Run the Spark job and bring the resulting document pairs back to the driver
collected = out.collect()

                                                                                

In [28]:
# Show the document pairs found by the Spark pipeline
collected

[('doc0', 'doc3'), ('doc1', 'doc3'), ('doc0', 'doc1')]

In [29]:
from assignment3.io_ import load_evaluation, get_exact_solution_file

# Load the reference pairs for this dataset and turn each one into a tuple so
# that the pairs are hashable and can be placed in a set
exact_path = get_exact_solution_file(data_name=DATA_NAME)
exact = [(first, second) for first, second in load_evaluation(path_=exact_path)['pairs']]

In [30]:
# Show the reference pairs loaded from the exact-solution file
exact

[('doc0', 'doc1'), ('doc0', 'doc3'), ('doc1', 'doc3')]

In [31]:
from assignment3.utils import jaccard

# Compare the Spark result with the reference solution as sets, so the order of
# the pairs does not matter.
# NOTE(review): 1.0 presumably means the two sets are identical — confirm
# against assignment3.utils.jaccard
jaccard(set(collected), set(exact))

1.0