# Assignment 3 - All Pair Documents Similarity Search

## Setup

In [10]:
import os
import sys

# Environment for the local Spark run, set in one go:
#   * PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON: use the interpreter running this
#     kernel for both the Python workers and the driver.
#   * SPARK_LOCAL_IP: bind Spark to the loopback address.
#   * PYARROW_IGNORE_TIMEZONE: avoids the following pyspark warning
#       UserWarning: 'PYARROW_IGNORE_TIMEZONE' environment variable was not set.
#       It is required to set this environment variable to '1' in both driver
#       and executor sides if you use pyarrow>=2.0.0.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'SPARK_LOCAL_IP': "127.0.0.1",
    "PYARROW_IGNORE_TIMEZONE": "1",
})

import pyspark.sql as psql
from pyspark import SparkFiles

# IMPORTANT: create session prior to importing pyspark.pandas, else
#   spark won't use all specified cores
from src.utils import AVAILABLE_CORES
# Local-mode master with AVAILABLE_CORES worker threads, see
# https://cloudxlab.com/assessment/displayslide/500/apache-spark-running-on-cluster-local-mode
session_builder = (
    psql.SparkSession.builder
    .master(f"local[{AVAILABLE_CORES}]")
    .appName("APDSS")
)

# Driver settings, applied one by one in the listed order.
for option_name, option_value in (
    ("spark.driver.host", "localhost"),
    ("spark.driver.memory", "40g"),
):
    session_builder = session_builder.config(option_name, option_value)

# The session used by every cell below.
spark: psql.SparkSession = session_builder.getOrCreate()

# Add local dependencies (local python source files) to SparkContext and sys.path.
# The relative path is turned into an absolute one against the current working
# directory, so this cell depends on where the kernel was started.
src_zip_path = os.path.abspath("../../src.zip")
# Register the zip as a Python dependency of this SparkContext.
spark.sparkContext.addPyFile(src_zip_path)
# Give Spark's files root directory top priority on the driver's import path.
# NOTE(review): `src.utils` is already imported above, before this insert, so
#   `src` appears to be importable without it — confirm the insert is still needed.
sys.path.insert(0, SparkFiles.getRootDirectory())


import src.tokenization as tok
import src.apdss.map_reduce as mr
import src.apdss.sequential as seq
import src.apdss.spark_dataframe as spdf
import src.io_ as io

# Similarity threshold passed as `threshold` to the APDSS runs below.
# NOTE(review): presumably the minimum similarity a document pair needs in order
#   to be reported — confirm against the APDSS implementations.
SIM_THRESHOLD = 0.3

In [11]:
# Presumably downloads the BEIR dataset (if needed) and returns where it was stored.
# NOTE(review): `corpus_path` is not read by any later cell visible in this
#   notebook; the feature-extraction cell builds its own path from io.DATA_DIR and
#   io.DEFAULT_DATASET_NAME. The call may only be kept for its side effect — confirm.
corpus_path = io.download_beir_dataset()

## Feature Extraction/Tokenization

In [12]:
# Location of the trial corpus file below the project's data directory.
trial_corpus_path = os.path.join(io.DATA_DIR, io.DEFAULT_DATASET_NAME, "corpus-trial.jsonl")

# Build the per-document features and mark them for caching, since the same
# frame is handed to each APDSS approach further down.
docs_scores_df = tok.get_document_features(
    spark=spark,
    corpus_json_path=trial_corpus_path,
).cache()

# Preview: one field per line, values cut off after 50 characters.
docs_scores_df.show(truncate=50, vertical=True)

[32m2023-05-26 01:09:47.865[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m53[0m - [1mLoading corpus...[0m
[32m2023-05-26 01:09:48.067[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m64[0m - [1mCompacting document texts (merging title and actual text)...[0m
[32m2023-05-26 01:09:48.212[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m67[0m - [1mSplitting texts into words...[0m
[32m2023-05-26 01:09:48.387[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m70[0m - [1mCalculating Scores (TF-IDF)...[0m
[32m2023-05-26 01:09:49.967[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m75[0m - [1mNormalizing Scores...[0m
[32m2023-05-26 01:09:50.146[0m | [1mINFO    [0m | [36msrc.tokenization[0m:[36mget_document_features[0m:[36m78[0m - [1mDocs successfully tokenized[0m

-RECORD 0----------------------------------------------------
 _id    | 7e8r61e7                                           
 scores | (233,[10,16,17,23,33,175,195],[0.12322741652357... 
-RECORD 1----------------------------------------------------
 _id    | lxk6b297                                           
 scores | (233,[0,1,2,3,4,5,6,7,9,10,15,19,20,23,26,30,31... 
-RECORD 2----------------------------------------------------
 _id    | pnl9th2c                                           
 scores | (233,[0,4,6,12,16,17,103,146],[0.03791543455503... 
-RECORD 3----------------------------------------------------
 _id    | 08gqn86z                                           
 scores | (233,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,19,... 
-RECORD 4----------------------------------------------------
 _id    | p6uqakzs                                           
 scores | (233,[0,2,3,44,136,155],[0.01774194162798637,0.... 
-RECORD 5----------------------------------------------------
 _id    

                                                                                

## APDSS - Different Approaches

### Low-level MapReduce

In [13]:
# Low-level MapReduce approach, run on the cached document features.
# NOTE(review): the log recorded for this cell ends with "Un-persisting input df",
#   so `docs_scores_df` is presumably no longer cached for later cells — confirm.
map_reduce_apdss = mr.MapReduceAPDSS()
docs_similarities_mr = map_reduce_apdss.apdss(
    spark=spark,
    docs_scores_df=docs_scores_df,
    threshold=SIM_THRESHOLD,
    num_partitions=AVAILABLE_CORES,
)

[32m2023-05-26 01:09:54.601[0m | [34m[1mDEBUG   [0m | [36msrc.apdss.map_reduce[0m:[36m_get_d_star[0m:[36m139[0m - [34m[1m_get_d_star partitions 4[0m
[32m2023-05-26 01:10:07.405[0m | [34m[1mDEBUG   [0m | [36msrc.apdss.map_reduce[0m:[36m_get_b_map[0m:[36m167[0m - [34m[1m_get_b_map partitions 4[0m
[32m2023-05-26 01:10:07.872[0m | [34m[1mDEBUG   [0m | [36msrc.apdss.map_reduce[0m:[36mapdss[0m:[36m101[0m - [34m[1m_apply_prefix_filtering partitions 4[0m
[32m2023-05-26 01:10:07.895[0m | [34m[1mDEBUG   [0m | [36msrc.apdss.map_reduce[0m:[36mapdss[0m:[36m111[0m - [34m[1mterm_doc_sequence_rdd partitions 4[0m
[32m2023-05-26 01:10:08.948[0m | [34m[1mDEBUG   [0m | [36msrc.apdss.map_reduce[0m:[36mapdss[0m:[36m126[0m - [34m[1mUn-persisting input df[0m


In [14]:
# The run time is printed explicitly, since the notebook only renders the last
# expression of a cell on its own (presumably seconds — confirm).
print(docs_similarities_mr.time)

# At most the first ten entries of the result.
# NOTE(review): the recorded output lists 3 pairs here, while the PySpark
#   DataFrame run records 5 pairs for the same threshold — worth checking.
docs_similarities_mr.similar_docs[:10]

14.380352020263672


[('lxk6b297', '6jittbis', 0.34844077926115397),
 ('rcwck1y3', '5oew0vrr', 0.33299106742968887),
 ('5nw3828d', '6jittbis', 0.3590313026467476)]

### PySpark DataFrame

In [15]:
# PySpark DataFrame approach, called with the same inputs, threshold and
# partition count as the MapReduce run.
dataframe_apdss = spdf.SparkDataFrameAPDSS()
docs_similarity_spdf = dataframe_apdss.apdss(
    spark=spark,
    docs_scores_df=docs_scores_df,
    threshold=SIM_THRESHOLD,
    num_partitions=AVAILABLE_CORES,
)

                                                                                

In [16]:
# The run time is printed explicitly, since the notebook only renders the last
# expression of a cell on its own (presumably seconds — confirm).
print(docs_similarity_spdf.time)

# At most the first ten entries of the result.
# NOTE(review): the recorded output lists 5 pairs here, while the MapReduce run
#   records only 3 pairs for the same threshold — worth checking.
docs_similarity_spdf.similar_docs[:10]

12.806149005889893


[('5nw3828d', '6jittbis', 0.3590312898159027),
 ('5oew0vrr', 'rcwck1y3', 0.3329910635948181),
 ('6jittbis', 'lxk6b297', 0.3484407663345337),
 ('fvhq8yud', 'ma3ndg41', 0.3124379813671112),
 ('fvhq8yud', 'pnl9th2c', 0.5505577325820923)]

In [9]:
# Stop the Spark session once all approaches have been run.
# NOTE(review): the recorded execution count of this cell (9) is lower than that
#   of the cells above (10-16), so it was not re-run after them — re-run the
#   notebook top to bottom before sharing.
spark.stop()