# Assignment 3: All Pairs Similarity

## Setup & Dataset Retrieval

In [1]:
import os.path
import pickle
import random

from tqdm import tqdm

import src.utilities.io as io
import src.utilities.preprocess as pp
import src.utilities.similarity_sequential as sim_seq

  from tqdm.autonotebook import tqdm
[nltk_data] Downloading package punkt to /Users/a/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/a/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/a/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
documents, data_path = io.get_data() #trec-covid as default

2023-05-26 23:13:46 - Loading Corpus...


  0%|          | 0/171332 [00:00<?, ?it/s]

2023-05-26 23:13:49 - Loaded 171332 TEST Documents.
2023-05-26 23:13:49 - Doc Example: {'text': 'OBJECTIVE: This retrospective chart review describes the epidemiology and clinical features of 40 patients with culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia. METHODS: Patients with positive M. pneumoniae cultures from respiratory specimens from January 1997 through December 1998 were identified through the Microbiology records. Charts of patients were reviewed. RESULTS: 40 patients were identified, 33 (82.5%) of whom required admission. Most infections (92.5%) were community-acquired. The infection affected all age groups but was most common in infants (32.5%) and pre-school children (22.5%). It occurred year-round but was most common in the fall (35%) and spring (30%). More than three-quarters of patients (77.5%) had comorbidities. Twenty-four isolates (60%) were associated with pneumonia, 14 (35%) with upper respiratory tract 

In [3]:
for doc_id, doc in list(documents.items())[:10]:
    num_words = len(doc["text"].split())
    print(f"Document {doc_id}: {num_words} words")

Document ug7v899j: 262 words
Document 02tnwd4m: 142 words
Document ejv2xln0: 219 words
Document 2b73a28n: 68 words
Document 9785vg6d: 110 words
Document zjufx4fo: 174 words
Document 5yhe786e: 113 words
Document 8zchiykl: 83 words
Document 8qnrcgnk: 188 words
Document jg13scgo: 189 words


Sort the documents by their length (Longest to Shortest)

In [4]:
sort_docs = sorted(documents.items(), key=lambda x: len(x[1]['text']), reverse=True)
sorted_docs: pp.Documents = {}
for doc_id, doc in sort_docs:
    sorted_docs[doc_id] = doc

In [5]:
len(sorted_docs)

171332

In [6]:
for doc_id, doc in list(sorted_docs.items())[:10]:
    num_words = len(doc["text"].split())
    print(f"Document {doc_id}: {num_words} words")

Document ij3ncdb6: 18000 words
Document c4pt07zk: 15448 words
Document 1vimqhdp: 12745 words
Document pd1g119c: 4642 words
Document gvh0wdxn: 3507 words
Document jkrj0lbm: 3783 words
Document jkvr9lq2: 3679 words
Document y00pz4fd: 1705 words
Document lqdgvsaq: 1627 words
Document vslevssr: 1504 words


This dictionary is used to map the index of a document to the corresponding text_id in the original dataset.
This index-to-ID mappings is useful for keeping track of the original text_id of the documents.

In [7]:
doc_idx_to_id = {i: doc_id for i, doc_id in enumerate(sorted_docs.keys())}

In [8]:
doc_idx_to_id

{0: 'ij3ncdb6',
 1: 'c4pt07zk',
 2: '1vimqhdp',
 3: 'pd1g119c',
 4: 'gvh0wdxn',
 5: 'jkrj0lbm',
 6: 'jkvr9lq2',
 7: 'y00pz4fd',
 8: 'lqdgvsaq',
 9: 'vslevssr',
 10: 'o1vxgzpj',
 11: 'bt59txxp',
 12: 'yt9rj42s',
 13: 'qapwb5e4',
 14: 'rff3ikak',
 15: '0prx11md',
 16: '84hh0mg9',
 17: 'uumzl37q',
 18: 'vr7vm64u',
 19: 'v45cb2xq',
 20: 'yn7pu9i8',
 21: 't473tbm9',
 22: 'nq0qe8qf',
 23: '16b8drw2',
 24: '4cbjbo7t',
 25: 'tvs1snq8',
 26: '2wb007gf',
 27: 'st1epx6u',
 28: 'syl7wq15',
 29: 'dt3li3bk',
 30: 'ec798c6s',
 31: 'xvnxkcxq',
 32: 'rxvxf34z',
 33: '735dkles',
 34: 'oq2jgo6z',
 35: '23ovz927',
 36: 'mtjughzl',
 37: '1uwyrst6',
 38: '6ngxo8ff',
 39: 'yysf3sxe',
 40: 'tpic8ddl',
 41: 'l7hdrmbf',
 42: 'n9x8k008',
 43: '40hube0m',
 44: 'xclgf1kk',
 45: 'zu7i1et3',
 46: 'c8l1p66u',
 47: '6xdo6op6',
 48: 'p2jp5fiq',
 49: 'zev4fpe5',
 50: '0x02bmti',
 51: 't1gxo4mw',
 52: 'skahzbni',
 53: 'jdsn8vej',
 54: 'aacp7221',
 55: 'qwy3xfth',
 56: 'uix6b0tm',
 57: 'zy19qf3c',
 58: 'pqegqjyu',
 59: 'q

## Due to the size, we will use different dataset samples
(1000 longest docs, 1000 shortest docs, 1000 random docs)

### Define the paths to save and read the samples and their mappings

In [9]:
samples_dir = os.path.join(data_path, "samples")
if not os.path.exists(samples_dir):
    os.mkdir(samples_dir)

longest_docs_path = os.path.join(samples_dir, "longest_docs.jsonl")
shortest_docs_path = os.path.join(samples_dir, "shortest_docs.jsonl")
random_docs_path = os.path.join(samples_dir, "random_docs.jsonl")
random_docs_idx_path = os.path.join(samples_dir, "random_docs_idx.pkl")

In [10]:
sample_size = 100 #1000

if not all([os.path.exists(p) for p in [longest_docs_path, shortest_docs_path, random_docs_path, random_docs_idx_path]]):
    longest_docs = pp.Documents = {}
    for doc_id, doc in list(sorted_docs.items())[:sample_size]:
        longest_docs[doc_id] = doc
    longest_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(range(sample_size))}

    shortest_docs = pp.Documents = {}
    for doc_id, doc in list(sorted_docs.items())[-sample_size:]:
        shortest_docs[doc_id] = doc
    shortest_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(range(len(sorted_docs)-sample_size, len(sorted_docs)))}

    random_docs = pp.Documents = {}
    s_docs = list(sorted_docs.items())
    random_docs_idx = random.sample(range(len(sorted_docs)), k=sample_size)
    for index in random_docs_idx:
        doc_id, doc = s_docs[index]
        random_docs[doc_id] = doc
    random_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(random_docs_idx)}

    # io.save_sample_data(longest_docs_path, longest_docs)
    # io.save_sample_data(shortest_docs_path, shortest_docs)
    # io.save_sample_data(random_docs_path, random_docs)

    with open(random_docs_idx_path, "wb") as f:
        pickle.dump(random_doc_idx_to_id, f)
else:
    # longest_docs = io.get_sample_data(longest_docs_path)
    # shortest_docs = io.get_sample_data(shortest_docs_path)
    # random_docs = io.get_sample_data(random_docs_path)



    longest_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(range(sample_size))}
    shortest_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(range(len(sorted_docs)-sample_size, len(sorted_docs)))}
    with open(random_docs_idx_path, "rb") as f:
        random_docs_idx = pickle.load(f)
    random_doc_idx_to_id = {i:doc_idx_to_id[idx] for i, idx in enumerate(random_docs_idx)}



    longest_docs = pp.Documents = {}
    for doc_id, doc in list(sorted_docs.items())[:sample_size]:
        longest_docs[doc_id] = doc

    shortest_docs = pp.Documents = {}
    for doc_id, doc in list(sorted_docs.items())[-sample_size:]:
        shortest_docs[doc_id] = doc

    random_docs = pp.Documents = {}
    s_docs = list(sorted_docs.items())
    for index in random_docs_idx:
        doc_id, doc = s_docs[index]
        random_docs[doc_id] = doc

In [11]:
print(len(longest_docs))
print(len(shortest_docs))
print(len(random_docs))

100
100
100


In [12]:
samples = [("longest_docs", longest_docs, longest_doc_idx_to_id),
           ("shortest_docs", shortest_docs, shortest_doc_idx_to_id),
           ("random_docs", random_docs, random_doc_idx_to_id)]

## Tokenization
Document cleaning and tokenization

In [13]:
tokens_dir = os.path.join(data_path, "tokens")
if not os.path.exists(tokens_dir):
    os.mkdir(tokens_dir)

docs_tokens_path = os.path.join(tokens_dir, "doc_tokens.pkl")

In [14]:
if not os.path.exists(docs_tokens_path):
    tokenized_samples = []
    for name, docs, idx_to_id in samples:
        tokenized_docs: pp.TokenizedDocuments = pp.get_tokenized_documents(docs)
        tokenized_samples.append((name, tokenized_docs, idx_to_id))

    with open(docs_tokens_path, "wb") as f:
        pickle.dump(tokenized_samples, f)
else:
    with open(docs_tokens_path, "rb") as f:
        tokenized_samples = pickle.load(f)

## VECTORIZE

In [15]:
tfidf_dir = os.path.join(samples_dir, "tfidf")
if not os.path.exists(tfidf_dir):
    os.mkdir(tfidf_dir)

tfidf_results_path = os.path.join(tfidf_dir, "tfidf_docs.pkl")

In [16]:
if not os.path.exists(tfidf_results_path):
    tfidf_docs = []
    for name, docs, idx_to_id in tqdm(tokenized_samples):
        vectorized_docs, time_taken = pp.vectorize(docs)
        tfidf_docs.append((name, vectorized_docs, time_taken, idx_to_id))

    with open(tfidf_results_path, "wb") as f:
        pickle.dump(tfidf_docs, f)
else:
    with open(tfidf_results_path, "rb") as f:
        tfidf_docs = pickle.load(f)

In [17]:
for name, _, time_taken, _ in tfidf_docs:
    print(f"Time taken to vectorize {name}: {time_taken:.2f} seconds")

Time taken to vectorize longest_docs: 0.17 seconds
Time taken to vectorize shortest_docs: 0.01 seconds
Time taken to vectorize random_docs: 0.01 seconds


In [18]:
for _, mat, _, _ in tfidf_docs:
    print(mat)

  (0, 10637)	0.0062706793201121125
  (0, 17207)	0.0062706793201121125
  (0, 6611)	0.0062706793201121125
  (0, 20086)	0.0062706793201121125
  (0, 1620)	0.004871028227207774
  (0, 7383)	0.0062706793201121125
  (0, 16639)	0.0062706793201121125
  (0, 9853)	0.0062706793201121125
  (0, 18578)	0.0062706793201121125
  (0, 19635)	0.0062706793201121125
  (0, 7400)	0.0062706793201121125
  (0, 12767)	0.0062706793201121125
  (0, 17842)	0.0062706793201121125
  (0, 13136)	0.0062706793201121125
  (0, 7397)	0.0062706793201121125
  (0, 13937)	0.0062706793201121125
  (0, 8726)	0.0062706793201121125
  (0, 6779)	0.0062706793201121125
  (0, 1624)	0.004871028227207774
  (0, 7395)	0.0062706793201121125
  (0, 15897)	0.0062706793201121125
  (0, 17261)	0.0062706793201121125
  (0, 18755)	0.0062706793201121125
  (0, 7392)	0.0062706793201121125
  (0, 13379)	0.0062706793201121125
  :	:
  (99, 1050)	0.37303556662688425
  (99, 30)	0.009180980713333987
  (99, 18)	0.01681966279786275
  (99, 128)	0.005866291564547258
  (

## Sequential APDS

In [19]:
thresholds = [0.5, 0.6, 0.7, 0.8, 0.9] # Chosen thresholds

In [20]:
seq_results_dir = os.path.join(samples_dir, "seq_result")
if not os.path.exists(seq_results_dir):
    os.mkdir(seq_results_dir)

seq_results_path = os.path.join(seq_results_dir, "seq_results.pkl")

In [21]:
all_seq_results = {} #sample: pairs indices, pairs docs id, additional information

if not os.path.exists(seq_results_path):
    for name, vectorized_docs, _, idx_to_id in tqdm(tfidf_docs):
        seq_results = []
        for threshold in thresholds:
            similar_pairs, information = sim_seq.compute_all_pairs_docs_sim(vectorized_docs, name, threshold)
            mapped_pairs = sim_seq.map_doc_idx_to_id(similar_pairs, idx_to_id)
            seq_results.append((similar_pairs, mapped_pairs, information))
        all_seq_results[name] = seq_results

    with open(seq_results_path, "wb") as f:
        pickle.dump(all_seq_results, f)
else:
    with open(seq_results_path, "rb") as f:
        all_seq_results = pickle.load(f)

### longest 1000

In [22]:
sim_seq.print_results(all_seq_results, "longest_docs")

--Run info--
sample_name: longest_docs
threshold: 0.5
pairs_count: 40
cosine_time: 0.006117820739746094
find_time: 0.0012700557708740234
--Similarity pairs--
1vimqhdp and pd1g119c have 0.5466 similarity
vslevssr and yt9rj42s have 0.9542 similarity
o1vxgzpj and bt59txxp have 1.0000 similarity
qapwb5e4 and rff3ikak have 0.9315 similarity
0prx11md and yn7pu9i8 have 0.9895 similarity

--Run info--
sample_name: longest_docs
threshold: 0.6
pairs_count: 35
cosine_time: 0.006940126419067383
find_time: 0.001611948013305664
--Similarity pairs--
vslevssr and yt9rj42s have 0.9542 similarity
o1vxgzpj and bt59txxp have 1.0000 similarity
qapwb5e4 and rff3ikak have 0.9315 similarity
0prx11md and yn7pu9i8 have 0.9895 similarity
uumzl37q and vr7vm64u have 0.9882 similarity

--Run info--
sample_name: longest_docs
threshold: 0.7
pairs_count: 34
cosine_time: 0.0063512325286865234
find_time: 0.001544952392578125
--Similarity pairs--
vslevssr and yt9rj42s have 0.9542 similarity
o1vxgzpj and bt59txxp have 1.0

In [23]:
sim_seq.print_pairs(all_seq_results, "longest_docs", tokenized_samples)

1vimqhdp and pd1g119c have 0.5466 similarity

1vimqhdp:

pd1g119c:
abstract 3rd international genomic medicine conference 3rd igmc 2015 jeddah kingdom saudi arabia 30 november 3 december 2015 o1 regulation gene telomere length long distance jerry w. shay o2 microtubule destabilizer kif2a regulates postnatal establishment neuronal circuit addition prenatal cell survival cell migration axon elongation loss leading malformation cortical development severe epilepsy noriko homma ruyun zhou muhammad imran naseer adeel g. chaudhary mohammed al-qahtani nobutaka hirokawa o3 integration metagenomics metabolomics gut microbiome research maryam goudarzi albert j. fornace jr. o4 unique integrated system discern pathogenesis central nervous system tumor saleh baeesa deema hussain mohammed bangash fahad alghamdi hans-juergen schulten angel carracedo ishaq khan hanadi qashqari nawal madkhali mohamad saka kulvinder s. saini awatif jamal jaudah al-maghrabi adel abuzenadah adeel chaudhary mohammed al qah

### shortest 1000

In [24]:
sim_seq.print_results(all_seq_results, "shortest_docs")

--Run info--
sample_name: shortest_docs
threshold: 0.5
pairs_count: 3
cosine_time: 0.0009410381317138672
find_time: 0.0015442371368408203
--Similarity pairs--
8gadvw16 and y7f7ktrm have 0.5141 similarity
4slsrd3n and qcjrdjf3 have 0.5131 similarity
0tchw1x2 and qcjrdjf3 have 0.5224 similarity

--Run info--
sample_name: shortest_docs
threshold: 0.6
pairs_count: 0
cosine_time: 0.0012841224670410156
find_time: 0.0007967948913574219
--Similarity pairs--

--Run info--
sample_name: shortest_docs
threshold: 0.7
pairs_count: 0
cosine_time: 0.0005869865417480469
find_time: 0.0007932186126708984
--Similarity pairs--

--Run info--
sample_name: shortest_docs
threshold: 0.8
pairs_count: 0
cosine_time: 0.0005128383636474609
find_time: 0.0007917881011962891
--Similarity pairs--

--Run info--
sample_name: shortest_docs
threshold: 0.9
pairs_count: 0
cosine_time: 0.0004818439483642578
find_time: 0.0007929801940917969
--Similarity pairs--



In [25]:
sim_seq.print_pairs(all_seq_results, "shortest_docs", tokenized_samples)

8gadvw16 and y7f7ktrm have 0.5141 similarity

8gadvw16:
double trouble methanol outbreak wake covid-19 pandemic iran—a cross-sectional assessment

y7f7ktrm:
la obesidad el coronavirus 2019-ncov una relación de riesgo

There are 0 pairs
There are 0 pairs
There are 0 pairs
There are 0 pairs



### random 1000

In [26]:
sim_seq.print_results(all_seq_results, "random_docs")

--Run info--
sample_name: random_docs
threshold: 0.5
pairs_count: 0
cosine_time: 0.0008769035339355469
find_time: 0.0010578632354736328
--Similarity pairs--

--Run info--
sample_name: random_docs
threshold: 0.6
pairs_count: 0
cosine_time: 0.0010838508605957031
find_time: 0.0013451576232910156
--Similarity pairs--

--Run info--
sample_name: random_docs
threshold: 0.7
pairs_count: 0
cosine_time: 0.0015711784362792969
find_time: 0.0014379024505615234
--Similarity pairs--

--Run info--
sample_name: random_docs
threshold: 0.8
pairs_count: 0
cosine_time: 0.0016367435455322266
find_time: 0.0014851093292236328
--Similarity pairs--

--Run info--
sample_name: random_docs
threshold: 0.9
pairs_count: 0
cosine_time: 0.0013909339904785156
find_time: 0.0018270015716552734
--Similarity pairs--



In [27]:
sim_seq.print_pairs(all_seq_results, "random_docs", tokenized_samples)

There are 0 pairs
There are 0 pairs
There are 0 pairs
There are 0 pairs
There are 0 pairs



## Parallel APDS

# MIRACLE PARALLEL?

In [12]:
from typing import Dict, List, Tuple
import csv

In [13]:
def create_doc_sim_csv(pairs_list: List[Tuple[str, str, float]], ds_name: str,
                       threshold: float, type: str | None, workers: None | int = None ) -> None:
    '''
    PURPOSE: create the .csv file sotring the list of similar documents pairs with the cosine similarity
    ARGUMENTS:
        - pairs_list (List[Tuple[str, str, float]]): list of unique similar pair with the similarity
        - ds_name: (str): dataset name
        - threshold (float): used threshold
        - type (str | None): type of sequential version
        - workers (None | int): number of workers used
    RETURN: None
    '''

    path = ''
    if type is not None:
        if not os.path.exists(f'./results/{ds_name}/{threshold}'): os.makedirs(f'./results/{ds_name}/{threshold}')
        path = f'./results/{ds_name}/{threshold}/{type}_sequential.csv'
    else:
        if not os.path.exists(f'./results/{ds_name}/{threshold}/pyspark/'): os.makedirs(f'./results/{ds_name}/{threshold}/pyspark/')
        path = f'./results/{ds_name}/{threshold}/pyspark/{workers}_workers.csv'
    if not os.path.exists(path): # If there is already a file, return
        with open(path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(pairs_list)

In [14]:
import itertools
from src.utilities.similarity_parallel_spark import pyspark_APDS
import pandas as pd
from typing import Tuple, Type, List, Dict, Any

In [None]:
# datasets = ['nfcorpus'] # Choosen datasets
# thresholds: List[float] = [0.5, 0.6, 0.7, 0.8, 0.9] # Choosen thresholds
# numslices_factor = [1, 2, 5, 10, 15] # Choosen numslices factors N_PARTITIONS
# max_workers = [1, 2, 5, 10, 15] # n_executors
# considered_docs = 750

# # Download datasets
# datasets_data = {dataset: download_dataset(dataset) for dataset in datasets}
#
# # Pre-process and sample with the original datasets
# pre_processed_data = {dataset: sample_dict(documents_preprocessing(dataset, docs_dict), considered_docs)
#                       for dataset, docs_dict in datasets_data.items()}

In [None]:
# for collection_name, collection_docs, _ in samples:
pyspark_results = []

thresholds: List[float] = [0.001] # Choosen thresholds
numslices_factor = [5] # Choosen numslices factors N_PARTITIONS
max_workers = 5# n_executors

for ds_name, sampled_dict, _ in samples:
# for ds_name, sampled_dict in pre_processed_data.items():

    print(f'\n------------ALL PAIRS DOCUMENTS SIMILARITY - {ds_name}------------')

    for threshold in thresholds:
        print(f'\n--------Running with threshold: {threshold}--------')

        for s_factor, workers in itertools.product(numslices_factor, range(1, max_workers + 1)):
            # PySpark Execution

            print(f'\nPySpark Parallel Execution with {workers} workers and slice factor of {s_factor}')
            sim_doc_ps, ps_res = pyspark_APDS(ds_name=ds_name, sampled_dict=sampled_dict, threshold=threshold, workers=workers, s_factor=s_factor)
            pyspark_results.append(ps_res)
            create_doc_sim_csv(sim_doc_ps, ds_name, threshold, None, workers)
            print(' Done')

        print('\n')


In [20]:

print('\nSaving pyspark_results')
dio = pd.DataFrame.from_dict(
    dict(zip(range(len(pyspark_results)), pyspark_results)),
    orient='index',
    columns=[
        'ds_name',
        'elapsed',
        'threshold',
        'unique_pairs_sim_docs',
        'workers',
        'slice_factor'
    ],
)
dio.to_csv('./results/pyspark_results.csv', index=False)
print( 'Done')


Saving pyspark_results
Done


In [21]:
dio

Unnamed: 0,ds_name,elapsed,threshold,uniqie_pairs_sim_docs,workers,slice_factor
0,longest_docs,7.082828,0.001,0,1,5
1,longest_docs,3.8728,0.001,0,2,5
2,longest_docs,4.391929,0.001,0,3,5
3,longest_docs,4.689024,0.001,0,4,5
4,longest_docs,5.829722,0.001,0,5,5
5,shortest_docs,2.616789,0.001,0,1,5
6,shortest_docs,3.228978,0.001,0,2,5
7,shortest_docs,3.729301,0.001,0,3,5
8,shortest_docs,4.766977,0.001,0,4,5
9,shortest_docs,7.235433,0.001,0,5,5


In [17]:
pyspark_results

[['longest_docs', 7.082828044891357, 0.001, 0, 1, 5],
 ['longest_docs', 3.87280011177063, 0.001, 0, 2, 5],
 ['longest_docs', 4.3919291496276855, 0.001, 0, 3, 5],
 ['longest_docs', 4.689023971557617, 0.001, 0, 4, 5],
 ['longest_docs', 5.829722166061401, 0.001, 0, 5, 5],
 ['shortest_docs', 2.6167891025543213, 0.001, 0, 1, 5],
 ['shortest_docs', 3.228977918624878, 0.001, 0, 2, 5],
 ['shortest_docs', 3.7293009757995605, 0.001, 0, 3, 5],
 ['shortest_docs', 4.766976594924927, 0.001, 0, 4, 5],
 ['shortest_docs', 7.235433101654053, 0.001, 0, 5, 5],
 ['random_docs', 2.5393941402435303, 0.001, 0, 1, 5],
 ['random_docs', 2.735858201980591, 0.001, 0, 2, 5],
 ['random_docs', 4.076803922653198, 0.001, 0, 3, 5],
 ['random_docs', 4.270469903945923, 0.001, 0, 4, 5],
 ['random_docs', 5.21570897102356, 0.001, 0, 5, 5]]