In [1]:
# Install a headless OpenJDK 8 JDK quietly; standard output is discarded.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.2.0 (Hadoop 3.2 build) tarball from the Apache archive.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
# Unpack the tarball into the current working directory.
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
# Install the findspark helper package quietly.
!pip install -q findspark

In [44]:
import os
# Point JAVA_HOME at the OpenJDK 8 location (the JDK package is installed by
# the apt-get step in the setup cell).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Resolve the extracted Spark directory to an absolute path right away, so the
# setting keeps pointing at the same directory even if the working directory
# changes later in the session.
os.environ["SPARK_HOME"] = os.path.abspath("spark-3.2.0-bin-hadoop3.2")

In [45]:
import findspark
# Initialise findspark after JAVA_HOME and SPARK_HOME have been set.
# NOTE(review): presumably this is what makes pyspark importable from the
# unpacked Spark directory — confirm against the findspark documentation.
findspark.init()

In [46]:
from google.colab import drive
# Mount Google Drive at /content/drive; the stopword file and the documents
# folder used later in this notebook are read from /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [56]:
import string
from collections import defaultdict

In [57]:
def load_stopwords(path='/content/drive/MyDrive/stopwords.txt'):
    """Read a stopword list stored one word per line.

    Args:
        path: Location of the stopword file. Defaults to the Google Drive
            path this notebook originally hard-coded, so calls without
            arguments behave as before.

    Returns:
        list: The stopwords in file order, with surrounding whitespace
        removed and blank lines dropped.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, "r") as file:
        lines = file.read().splitlines()
    # Tokens produced by str.split() never carry whitespace and are never
    # empty, so padded or blank entries could not match any token anyway.
    stripped = (line.strip() for line in lines)
    return [word for word in stripped if word]

In [63]:
# Normalise text for indexing/searching: lowercase it and drop punctuation
def preprocess_text(text):
    """Return `text` lowercased with every ASCII punctuation character deleted.

    Punctuation is removed outright (not replaced by a space), so
    "black-hole" becomes "blackhole".
    """
    # Mapping each punctuation character to None makes translate() delete it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    lowered = text.lower()
    return lowered.translate(drop_punctuation)

In [59]:
# Indexing the documents (version 1 without lambda function)
def index_v1(documents_folder):
    """Build an inverted index over the files directly inside a folder.

    Each regular file is read as text (undecodable bytes are ignored),
    normalised with preprocess_text, split on whitespace, and every token
    that is not a stopword is counted per file.

    Args:
        documents_folder: Path of the directory holding the documents.

    Returns:
        defaultdict(dict): {word: {filename: frequency}}, where filename is
        the bare entry name returned by os.listdir.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    # A set gives constant-time membership tests; the list returned by
    # load_stopwords() would otherwise be scanned once per token.
    stopwords = set(load_stopwords())
    index = defaultdict(dict)  # {word: {doc_id: frequency}}

    for filename in os.listdir(documents_folder):
        path = os.path.join(documents_folder, filename)
        # os.listdir also returns sub-directories; open() would raise on them.
        if not os.path.isfile(path):
            continue

        with open(path, "r", errors='ignore') as file:
            text = file.read()

        for word in preprocess_text(text).split():
            if word not in stopwords:
                postings = index[word]
                postings[filename] = postings.get(filename, 0) + 1

    return index

In [60]:
# Indexing the documents (version 2 using lambda function)
def index_v2(documents_folder):
    """Build an inverted index over the files directly inside a folder.

    Same counting as a plain nested-dict index, but the inner counters are
    created on demand by a lambda-backed defaultdict.

    Args:
        documents_folder: Path of the directory holding the documents.

    Returns:
        defaultdict of defaultdict(int): {word: {filename: frequency}}.
        Because both levels are defaultdicts, subscripting a missing key
        inserts it; use `in` or `.get()` to probe without growing the index.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
    """
    # A set gives constant-time membership tests; the list returned by
    # load_stopwords() would otherwise be scanned once per token.
    stopwords = set(load_stopwords())
    index = defaultdict(lambda: defaultdict(int))  # {word: {doc_id: frequency}}

    for filename in os.listdir(documents_folder):
        path = os.path.join(documents_folder, filename)
        # os.listdir also returns sub-directories; open() would raise on them.
        if not os.path.isfile(path):
            continue

        with open(path, "r", errors='ignore') as file:
            text = file.read()

        for word in preprocess_text(text).split():
            if word not in stopwords:
                index[word][filename] += 1

    return index

In [61]:
def search(query, index):
    """Rank documents against a query, print the ranking, and return it.

    The query is normalised with preprocess_text and split on whitespace.
    For every query word that is not a stopword and is present in `index`,
    each document's frequency for that word is added to its score. A word
    repeated in the query contributes once per occurrence.

    Args:
        query: Free-text query string.
        index: Mapping {word: {doc_id: frequency}}.

    Returns:
        list: (doc_id, score) tuples with a positive score, highest score
        first; ties keep the order in which documents were first scored.
        The same entries are printed, one per line.
    """
    # A set gives constant-time membership tests for each query word.
    stopwords = set(load_stopwords())
    scores = defaultdict(int)

    for word in preprocess_text(query).split():
        # Check `word in index` before subscripting so that a defaultdict
        # index is not grown by looking up an unknown word.
        if word in stopwords or word not in index:
            continue
        for doc_id, freq in index[word].items():
            scores[doc_id] += freq

    # Sort documents by score in descending order, keeping positive scores only
    ranked = [
        (doc_id, score)
        for doc_id, score in sorted(scores.items(), key=lambda x: x[1], reverse=True)
        if score > 0
    ]

    # Print document IDs and scores
    for doc_id, score in ranked:
        print(f"Document ID: {doc_id}, Score: {score}")

    return ranked

In [62]:
if __name__ == "__main__":
    # Folder containing the documents to index; change to your own path.
    documents_folder = '/content/drive/MyDrive/Assignment1_data'

    # Build both index variants before prompting for the query.
    index1 = index_v1(documents_folder)
    index2 = index_v2(documents_folder)

    query = input("Enter a set of words to search: ")

    # Run the same query against each index under its own heading.
    for heading, built_index in (
        ("\nSearch Results (Version 1 without lambda function):", index1),
        ("\nSearch Results (Version 2 using lambda function):", index2),
    ):
        print(heading)
        search(query, built_index)

Enter a set of words to search: Sharapova black hole

Search Results (Version 1 without lambda function):
Document ID: 14.txt, Score: 10
Document ID: 15.txt, Score: 3
Document ID: 20.txt, Score: 1

Search Results (Version 2 using lambda function):
Document ID: 14.txt, Score: 10
Document ID: 15.txt, Score: 3
Document ID: 20.txt, Score: 1
