In [32]:
from nltk.tokenize import word_tokenize
import math
import torch
from torch.nn import Softmax

In [50]:
def score(query: str, docs: list, *, tokenizer=None) -> list:
    """Score every document against a query with a simplified Lucene formula.

    An adaptation of Lucene's 'practical scoring function'. For each
    document the score is::

        coord(query, doc) * sum(tf(term, doc) * idf(term) * norm(doc))

    where the sum runs over the query terms and

    * ``tf`` is the number of times the term occurs in the document,
    * ``idf`` is ``1 + log(num_docs / (doc_freq + 1))``,
    * ``norm`` is ``1 / number_of_tokens_in_doc`` (field-length norm),
    * ``coord`` is the fraction of query terms present in the document.

    queryNorm and boost from the original formula are skipped.

    Parameters
    ----------
    query : str
        The word (or words) from the sentence.
    docs : list
        List of values from triples. Each value is an object string from a
        triple.
    tokenizer : callable, optional
        Function mapping a string to an iterable of tokens. Defaults to
        ``word_tokenize`` imported at the top of the file.

    Returns
    -------
    list
        List of matching scores, one float per entry of ``docs`` and in the
        same order. Empty when ``docs`` is empty; all zeros when the query
        has no tokens. A document without tokens scores 0.0.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    query_tok = list(tokenizer(query))
    # Tokenize every document exactly once and reuse the result below.
    docs_tok = [list(tokenizer(doc)) for doc in docs]

    if not docs_tok:
        return []
    if not query_tok:
        # No query terms: nothing can match, and coord would divide by zero.
        return [0.0] * len(docs_tok)

    # Per-document vocabularies give O(1) token membership tests.
    docs_vocab = [set(doc_tok) for doc_tok in docs_tok]
    n_docs = len(docs_tok)

    # idf value for each query term, aligned with query_tok.
    idf_scores = []
    for term in query_tok:
        doc_freq = sum(1 for vocab in docs_vocab if term in vocab)
        idf_scores.append(1 + math.log(n_docs / (doc_freq + 1)))

    scores = []
    for doc_tok, vocab in zip(docs_tok, docs_vocab):
        if not doc_tok:
            # An empty document matches nothing; also avoids 1 / 0 in norm.
            scores.append(0.0)
            continue

        # Field-length norm is based on the token count, not on characters,
        # so it is consistent with the token-based tf and idf.
        norm = 1 / len(doc_tok)
        doc_score = 0.0
        for term, idf in zip(query_tok, idf_scores):
            doc_score += doc_tok.count(term) * idf * norm

        # Coordination uses whole-token matches; testing against the raw
        # string would also count substrings such as "at" inside "cat".
        query_coord = sum(1 for term in query_tok if term in vocab) / len(query_tok)
        scores.append(doc_score * query_coord)

    return scores
    

In [51]:
# Demo: score a two-word query against three candidate object strings.
query = 'cricketer 2020'
docs = ['cricketer 2020', 'footballer', 'cricketer of the year 2020']
# `result` holds one score per entry of `docs`, in the same order.
result = score(query=query, docs=docs)
# Disabled experiment: convert the scores to a tensor and apply a softmax
# over the last dimension (presumably to normalise them -- confirm intent
# before re-enabling).
# result = torch.tensor(result)
# S = Softmax(dim=-1)
# S(result)