In [None]:
import math
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
stemmer = PorterStemmer()

In [18]:
def preprocess(doc):
  tokens = word_tokenize(doc.lower())
  return [stemmer.stem(token) for token in tokens if token.isalnum()]

from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(subset = "train", categories = ["sci.space", "comp.graphics"], remove = ("headers", "footers", "quotes"))
corpus = dataset.data[:10]

In [20]:
k1 = 1.25 # Term frequency Scaling
b = 0.80 # Document Length Scaling

preprocessed_corpus = [preprocess(doc) for doc in corpus]
document_lengths = [len(doc) for doc in preprocessed_corpus]
avgdl = sum(document_lengths)/len(document_lengths)

term_frequencies = [Counter(doc) for doc in preprocessed_corpus]

In [21]:
def compute_document_frequencies(corpus):
  df = Counter()
  for doc in corpus:
    unique_terms = set(doc)
    for term in unique_terms:
      df[term] += 1
  return df

In [22]:
document_frequencies = compute_document_frequencies(preprocessed_corpus)
N = len(corpus)

def compute_idf(term, df, N):
  return math.log((N - df[term] + 0.5)/(df[term] + 0.5) + 1)

idf = {term : compute_idf(term, document_frequencies, N) for term in document_frequencies}

def compute_bm25_score(query, doc_index):
  query_terms = preprocess(query)
  score = 0
  for term in query_terms:
    if term not in idf:
      continue

    tf = term_frequencies[doc_index][term]
    dl = document_lengths[doc_index]

    numerator = tf * (k1 + 1)
    denominator = tf + k1 * (1 - b + b * dl/avgdl)
    score += idf[term] * (numerator/denominator)

  return score

In [23]:
def rank_documents(query):
  scores = []
  for doc_index in range (len(corpus)):
    score = compute_bm25_score(query, doc_index)
    scores.append((doc_index, corpus[doc_index]))

  return sorted(scores, key = lambda x: x[0], reverse = True)

query = "space technology"
ranked_results = rank_documents(query)

print(f"Query: {query}'\n")
print("Ranked results: ")
for score, doc in ranked_results:
  print(f"Score: {score:.4f} | Document: {doc[:100]}...")


Query: space technology'

Ranked results: 
Score: 9.0000 | Document: 

THANKS!  It did work, and it is just what I needed thanks......
Score: 8.0000 | Document: 
I don't know about that...I've used Photoshop 2.5 on both a 486dx-50 and a Quadra
950...I'd say the...
Score: 7.0000 | Document: ====
If that were true, I'd go for it.. I have a few friends who we could pool our
resources and do ...
Score: 6.0000 | Document: 
Hi,
It might be nice to know, what's possible on different hard ware platforms.
But usually the har...
Score: 5.0000 | Document: I'm a mac user who wants to use some of the rayshade models I've built
using macrayshade (rayshade-M...
Score: 4.0000 | Document: [Lots of stuff about how the commerical moonbase=fantasyland]

Then what do you believe will finally...
Score: 3.0000 | Document: I read it refered to as the "parabolic cross-section" rule;
the idea was that if you plot the area o...
Score: 2.0000 | Document: I am currently using POVRay on Mac and was wondering if any

In [None]:
import math
import nltk
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
stemmer = PorterStemmer()

def preprocess(doc):
    tokens = word_tokenize(doc.lower())
    return [stemmer.stem(token) for token in tokens if token.isalnum()]

from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(subset="train", categories=["sci.space", "comp.graphics"], remove=("headers", "footers", "quotes"))
corpus = dataset.data[:10]

k1 = 1.25
b = 0.80

preprocessed_corpus = [preprocess(doc) for doc in corpus]
document_lengths = [len(doc) for doc in preprocessed_corpus]
avgdl = sum(document_lengths) / len(document_lengths)

term_frequencies = [Counter(doc) for doc in preprocessed_corpus]

def compute_document_frequencies(corpus):
    df = Counter()
    for doc in corpus:
        unique_terms = set(doc)
        for term in unique_terms:
            df[term] += 1
    return df

document_frequencies = compute_document_frequencies(preprocessed_corpus)
N = len(corpus)

def compute_idf(term, df, N):
    return math.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)

idf = {term: compute_idf(term, document_frequencies, N) for term in document_frequencies}

def compute_bm25_score(query, doc_index):
    query_terms = preprocess(query)
    score = 0
    for term in query_terms:
        if term not in idf:
            continue
        tf = term_frequencies[doc_index][term]
        dl = document_lengths[doc_index]
        numerator = tf * (k1 + 1)
        denominator = tf + k1 * (1 - b + b * dl / avgdl)
        score += idf[term] * (numerator / denominator)
    return score

def rank_documents(query):
    scores = []
    for doc_index in range(len(corpus)):
        score = compute_bm25_score(query, doc_index)
        scores.append((score, corpus[doc_index]))
    return sorted(scores, key=lambda x: x[0], reverse=True)

query = "space technology"
ranked_results = rank_documents(query)

print(f"Query: {query}\n")
print("Ranked results: ")
for score, doc in ranked_results:
    print(f"Score: {score:.4f} | Document: {doc[:100]}...")
