In [5]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m153.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [6]:
from collections import defaultdict, Counter
import numpy as np
import re
import os
import PyPDF2
from math import log


In [7]:
def preprocess(text):
    """Cleans text by removing special characters and converting it to lowercase."""
    text = re.sub(r'\W+', ' ', text)  # Remove non-alphanumeric characters
    return text.lower()


In [8]:
def load_documents(path):
    """Loads and preprocesses all documents (PDFs) from a given path."""
    documents = []
    for filename in os.listdir(path):
        if filename.endswith('.pdf'):
            with open(os.path.join(path, filename), 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                text = ''
                for page in reader.pages:
                    text += page.extract_text()
                documents.append(preprocess(text))
    return documents


In [9]:
def load_queries(file_path):
    """Loads queries from a specified text file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        queries = [line.strip() for line in file.readlines()]
    return queries


In [10]:
def compute_statistics(documents):
    """Computes term frequencies, document frequencies, and document lengths."""
    term_frequency = defaultdict(Counter)
    document_frequency = defaultdict(int)
    document_lengths = []

    for i, doc in enumerate(documents):
        words = doc.split()
        doc_length = len(words)
        document_lengths.append(doc_length)

        for word in words:
            term_frequency[word][i] += 1  # Increment term count in doc i

        for word in set(words):
            document_frequency[word] += 1  # Increment document frequency for the word

    avg_doc_length = np.mean(document_lengths)
    return term_frequency, document_frequency, document_lengths, avg_doc_length


In [11]:
def compute_corpus_probabilities(documents):
    """Calculates corpus-wide word probabilities."""
    corpus_frequency = defaultdict(int)
    total_words = 0

    for doc in documents:
        words = doc.split()
        for word in words:
            corpus_frequency[word] += 1
        total_words += len(words)

    # Probability of each word in the entire corpus
    corpus_prob = {word: freq / total_words for word, freq in corpus_frequency.items()}
    return corpus_prob


In [12]:
def compute_bm25_scores(query, documents, term_frequency, document_frequency, document_lengths, avg_doc_length, k1=1.5, b=0.75):
    """Calculates BM25 scores for the query and documents."""
    N = len(documents)  # Number of documents
    scores = {}

    for i, doc in enumerate(documents):
        score = 0
        doc_length = document_lengths[i]
        for word in query.split():
            if word in document_frequency:
                idf = log((N - document_frequency[word] + 0.5) / (document_frequency[word] + 0.5) + 1)
                tf = term_frequency[word][i]
                score += idf * ((tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (doc_length / avg_doc_length))))
        scores[i] = score
    return scores


In [13]:
def compute_jm_scores(query, documents, term_frequency, document_lengths, corpus_word_prob, lambda_param=0.7):
    """Calculates Jelinek-Mercer smoothing scores for the query and documents."""
    scores = {}

    for i, doc in enumerate(documents):
        score = 1
        for word in query.split():
            term_prob_doc = term_frequency[word][i] / document_lengths[i] if term_frequency[word][i] else 0
            term_prob_corpus = corpus_word_prob[word] if word in corpus_word_prob else 0
            score *= (lambda_param * term_prob_doc) + ((1 - lambda_param) * term_prob_corpus)
        scores[i] = score
    return scores


In [14]:
path_to_documents = "/content/drive/My Drive/News"
documents = load_documents(path_to_documents)

queries_file_path = "/content/drive/My Drive/News/Queries.txt"
queries = load_queries(queries_file_path)


In [15]:
term_frequency, document_frequency, document_lengths, avg_doc_length = compute_statistics(documents)
corpus_prob = compute_corpus_probabilities(documents)


In [16]:
for query in queries:
    bm25_scores = compute_bm25_scores(query, documents, term_frequency, document_frequency, document_lengths, avg_doc_length)
    jm_scores = compute_jm_scores(query, documents, term_frequency, document_lengths, corpus_prob)
    print(f"Query: {query}\nBM25 Scores: {bm25_scores}\nJelinek-Mercer Scores: {jm_scores}\n")


Query: Top news in Nepal
BM25 Scores: {0: 0.9880331176070418, 1: 0.0, 2: 0.9863711296388644, 3: 0.996674819285081, 4: 0.9848585684005589, 5: 0.3621647943600512, 6: 0.8096808926081307, 7: 0.995424544652235, 8: 0.9143220083959823, 9: 0.9974168710128559}
Jelinek-Mercer Scores: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}

Query: Impact of economic recession in Nepal
BM25 Scores: {0: 0.7192482580523278, 1: 0.0, 2: 0.7266062252047922, 3: 3.3560448193991577, 4: 0.7227144340578676, 5: 2.7169509438860757, 6: 3.0692929248818657, 7: 0.7258347389034157, 8: 3.1207195107217647, 9: 0.7244760263530663}
Jelinek-Mercer Scores: {0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}

Query: Political news in Nepal
BM25 Scores: {0: 0.9880331176070418, 1: 0.0, 2: 0.9863711296388644, 3: 0.996674819285081, 4: 0.9848585684005589, 5: 0.3621647943600512, 6: 0.8096808926081307, 7: 0.995424544652235, 8: 0.9143220083959823, 9: 0.9974168710128559}
Jeline