In [1]:
import sys
# NOTE(review): absolute, machine-specific path. The `assignment3` imports used later in this
# notebook presumably resolve through it — adjust it to the local checkout before running.
sys.path.append('/home/sebaq/Documents/GitHub/LWMD_assignments')

## Definition of Map and Reduce functions

In [2]:
def document_map(
        doc_info: tuple[str, int, list[tuple[int, float]]]
) -> list[tuple[str, tuple[str, int, list[tuple[int, float]]]]]:
    """
    Map step: emits one key-value pair for every term of the document lying after its threshold
    :param doc_info: triple describing one document:
        - doc-id, a string
        - term-threshold, a column index; terms at or before it are not emitted
        - document vector, a sparse list of pairs (column, value) where the column is a term-id
    :return: list of key-value pairs:
        - key: the term-id converted to a string
        - value: triple (doc-id, term-id as integer, document vector);
            the vector is the very same list object received as input
    """

    identifier, threshold, entries = doc_info

    emitted: list[tuple[str, tuple[str, int, list[tuple[int, float]]]]] = []

    # the stored value is irrelevant here, only the column (term-id) drives the mapping
    for column, _ in entries:
        # OPTIMIZATION 1: only columns strictly after the threshold are mapped
        if column > threshold:
            emitted.append((str(column), (identifier, column, entries)))

    return emitted

In [3]:
def _sparse_dot(vec_a: list[tuple[int, float]], vec_b: list[tuple[int, float]]) -> float:
    """
    Dot product of two sparse vectors computed with a merge-join over their columns
    :param vec_a: first vector as a list of pairs (column, value); the merge assumes
        ascending column order
    :param vec_b: second vector, same representation
    :return: sum of the products of the values stored under the same column (0. if none match)
    """

    iter_a, iter_b = iter(vec_a), iter(vec_b)
    total = 0.

    try:
        col_a, val_a = next(iter_a)
        col_b, val_b = next(iter_b)

        while True:
            if col_a == col_b:  # shared column: add its contribution and advance both pointers
                total += val_a * val_b
                col_a, val_a = next(iter_a)
                col_b, val_b = next(iter_b)
            elif col_a < col_b:  # first vector is behind
                col_a, val_a = next(iter_a)
            else:  # second vector is behind
                col_b, val_b = next(iter_b)
    except StopIteration:
        # one of the vectors is exhausted, so no further column can be shared
        pass

    return total


def documents_reduce(docs: list[tuple[str, int, list[tuple[int, float]]]]) -> list[tuple[tuple[str, str], float]]:
    """
    Reduce function: computes the similarity of the document pairs grouped under one term-id
    :param docs: list of triplets sharing the same aggregation key:
        - doc-id
        - term-id (actually a column index of the vector), i.e. the aggregation key
        - document vector as a sparse list of pairs (column, value)
    :return: list of tuples:
        - the first element is the pair of documents represented by their doc-id
        - the second element is the dot product of their vectors (it equals the
            cosine similarity only for unit-norm vectors — presumably ensured upstream)
        A pair is emitted only when the aggregation key is the largest term-id the two
        documents have in common, so that each pair is evaluated by a single key.
    """

    # list of output pairs
    pairs = []

    # term-ids of every document, computed once instead of once per pair
    term_sets = [{term for term, _ in vector} for _, _, vector in docs]

    n_docs = len(docs)

    # loop among all possible pairs
    for i in range(n_docs - 1):

        # since the operation is an aggregation by key, term_id is expected
        # to be the same for every document of the group
        doc1_id, term_id, doc1 = docs[i]

        for j in range(i + 1, n_docs):

            doc2_id, _, doc2 = docs[j]

            # OPTIMIZATION 2: skip the pair unless this key is its maximum common term-id
            # (pairs without any common term are skipped too instead of failing on max())
            common_terms = term_sets[i] & term_sets[j]
            if not common_terms or max(common_terms) != term_id:
                continue

            pairs.append(((doc1_id, doc2_id), _sparse_dot(doc1, doc2)))

    return pairs

In [4]:
from assignment3.utils import jaccard
from assignment3.io_ import load_evaluation, get_exact_solution_file
from typing import List, Tuple

def compare_with_exact(data_name: str, collected_: List[Tuple[str, str]]) -> float:
    """
    Measures how close the pairs found with spark are to the stored exact solution
    :param data_name: name of dataset, used to locate the exact solution file
    :param collected_: pairs of similar docs from spark
    :return: jaccard similarity between the set of collected pairs and the set of exact pairs
    """

    solution_path = get_exact_solution_file(data_name=data_name)
    evaluation = load_evaluation(path_=solution_path)

    # every stored pair is rebuilt as a tuple so that it can be a set member
    exact_pairs = {(first, second) for first, second in evaluation['pairs']}

    return jaccard(set(collected_), exact_pairs)


## Evaluation over small example

In [5]:
# Dataset identifier, passed to DocumentVectors and to compare_with_exact below
DATA_NAME = 'small'

In [6]:
# Similarity threshold: passed to get_documents_info and used as the strict lower bound
# when filtering the pairs produced by the Spark pipeline
SIMILARITY = 0.8

In [7]:
# presumably the value intended for the `idf_order` argument of DocumentVectors — TODO confirm
IDF_ORDER = True

### Loading document info

In [8]:
from assignment3.model.documents import DocumentVectors
# Build the document vectors for the selected dataset; the ordering flag is taken from the
# IDF_ORDER configuration constant instead of a hard-coded literal
docs_vet = DocumentVectors(data_name=DATA_NAME, idf_order=IDF_ORDER)

Loading vectors... 
Loading mapping... 
Loading inverse mapping... 


In [9]:
docs_vet

small Vector Documents [4735]

In [10]:
# Input records for the map phase; document_map expects triples
# (doc-id, term-threshold, sparse vector) — presumably the threshold depends on SIMILARITY, TODO confirm
docs_info = docs_vet.get_documents_info(similarity=SIMILARITY)

### Spark

In [11]:
# NOTE(review): `sc` is not created in this notebook; it is presumably the SparkContext
# injected by the PySpark kernel/shell — confirm before a fresh Run All
rdd = sc.parallelize(docs_info)

In [12]:
out = (
    rdd
    .flatMap(document_map)  # emit (term-id key, (doc-id, term-id, vector)) pairs
    .combineByKey(  # gather every value sharing a key into a single list
        lambda first: [first],
        lambda acc, item: acc + [item],
        lambda left, right: left + right,
    )
    .flatMapValues(documents_reduce)  # (key, ((doc-id, doc-id), similarity))
    .filter(lambda kv: kv[1][1] > SIMILARITY)  # keep pairs strictly above the threshold
    .map(lambda kv: kv[1][0])  # keep only the pair of doc-ids
    .distinct()  # drop repeated pairs
)

In [None]:
# collect() is the action that actually runs the lazy pipeline and returns the pairs to the driver
collected = out.collect()

23/05/24 18:21:40 WARN TaskSetManager: Stage 0 contains a task of very large size (2112 KiB). The maximum recommended task size is 1000 KiB.
[Stage 1:>                                                          (0 + 4) / 4]

In [15]:
collected

[('doc0', 'doc3'), ('doc1', 'doc3'), ('doc0', 'doc1')]

In [None]:
compare_with_exact(data_name=DATA_NAME, collected_=collected)

## Evaluation over medium example

In [3]:
# Dataset identifier, passed to DocumentVectors and to compare_with_exact below
DATA_NAME = 'medium'

In [4]:
# Similarity threshold: passed to get_documents_info and used as the strict lower bound
# when filtering the pairs produced by the Spark pipeline
SIMILARITY = 0.85

In [None]:
# presumably the value intended for the `idf_order` argument of DocumentVectors — TODO confirm
IDF_ORDER = True

### Loading document info

In [5]:
from assignment3.model.documents import DocumentVectors
# Build the document vectors for the selected dataset; the ordering flag is taken from the
# IDF_ORDER configuration constant instead of a hard-coded literal
docs_vet = DocumentVectors(data_name=DATA_NAME, idf_order=IDF_ORDER)

In [7]:
docs_vet

toy Vector Documents [10]

In [8]:
# Input records for the map phase; document_map expects triples
# (doc-id, term-threshold, sparse vector) — presumably the threshold depends on SIMILARITY, TODO confirm
docs_info = docs_vet.get_documents_info(similarity=SIMILARITY)

### Spark

In [9]:
# NOTE(review): `sc` is not created in this notebook; it is presumably the SparkContext
# injected by the PySpark kernel/shell — confirm before a fresh Run All
rdd = sc.parallelize(docs_info)

In [13]:
out = (
    rdd
    .flatMap(document_map)  # emit (term-id key, (doc-id, term-id, vector)) pairs
    .combineByKey(  # gather every value sharing a key into a single list
        lambda first: [first],
        lambda acc, item: acc + [item],
        lambda left, right: left + right,
    )
    .flatMapValues(documents_reduce)  # (key, ((doc-id, doc-id), similarity))
    .filter(lambda kv: kv[1][1] > SIMILARITY)  # keep pairs strictly above the threshold
    .map(lambda kv: kv[1][0])  # keep only the pair of doc-ids
    .distinct()  # drop repeated pairs
)

In [14]:
# collect() is the action that actually runs the lazy pipeline and returns the pairs to the driver
collected = out.collect()

                                                                                

In [15]:
collected

[('doc0', 'doc3'), ('doc1', 'doc3'), ('doc0', 'doc1')]

In [None]:
compare_with_exact(data_name=DATA_NAME, collected_=collected)

## Evaluation over large example

In [3]:
# Dataset identifier, passed to DocumentVectors and to compare_with_exact below
DATA_NAME = 'large'

In [4]:
# Similarity threshold: passed to get_documents_info and used as the strict lower bound
# when filtering the pairs produced by the Spark pipeline
SIMILARITY = 0.9

In [None]:
# presumably the value intended for the `idf_order` argument of DocumentVectors — TODO confirm
IDF_ORDER = True

### Loading document info

In [5]:
from assignment3.model.documents import DocumentVectors
# Build the document vectors for the selected dataset; the ordering flag is taken from the
# IDF_ORDER configuration constant instead of a hard-coded literal
docs_vet = DocumentVectors(data_name=DATA_NAME, idf_order=IDF_ORDER)

In [7]:
docs_vet

toy Vector Documents [10]

In [8]:
# Input records for the map phase; document_map expects triples
# (doc-id, term-threshold, sparse vector) — presumably the threshold depends on SIMILARITY, TODO confirm
docs_info = docs_vet.get_documents_info(similarity=SIMILARITY)

### Spark

In [9]:
# NOTE(review): `sc` is not created in this notebook; it is presumably the SparkContext
# injected by the PySpark kernel/shell — confirm before a fresh Run All
rdd = sc.parallelize(docs_info)

In [13]:
out = (
    rdd
    .flatMap(document_map)  # emit (term-id key, (doc-id, term-id, vector)) pairs
    .combineByKey(  # gather every value sharing a key into a single list
        lambda first: [first],
        lambda acc, item: acc + [item],
        lambda left, right: left + right,
    )
    .flatMapValues(documents_reduce)  # (key, ((doc-id, doc-id), similarity))
    .filter(lambda kv: kv[1][1] > SIMILARITY)  # keep pairs strictly above the threshold
    .map(lambda kv: kv[1][0])  # keep only the pair of doc-ids
    .distinct()  # drop repeated pairs
)

In [14]:
# collect() is the action that actually runs the lazy pipeline and returns the pairs to the driver
collected = out.collect()

                                                                                

In [15]:
collected

[('doc0', 'doc3'), ('doc1', 'doc3'), ('doc0', 'doc1')]

In [None]:
compare_with_exact(data_name=DATA_NAME, collected_=collected)