In [1]:
!pip install unidecode
!pip install contractions
!pip install nltk
!pip install numpy
import re
import string
import nltk
import unidecode
import contractions
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
class TextNormalizer:
    def normalize(self, text, verbose=False):
        raise NotImplementedError()

class SimpleTextNormalizer:
    def __init__(self):
        self.punctuation_table = str.maketrans('','', string.punctuation)
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    
    def normalize(self, text, verbose=False):
        # remove extra withespaces
        text = re.sub(' +', ' ', text)

        # convert unicode characters to ascii
        text = unidecode.unidecode(text)

        # convert text to lowercase
        text = text.lower()

        # expand the contractions so this words can be removed
        text = contractions.fix(text)

        # remove punctuation symbols
        text = text.translate(self.punctuation_table)

        # tokenize the text
        words = word_tokenize(text)

        result = []
        for w in words:
            w = w.strip() # remove possible trailing spaces 
            if w in self.stop_words: # discard stopword
                continue
            w = self.lemmatizer.lemmatize(w) # lemmatize the word
            if w not in self.stop_words: # discard stopword
                result.append(w)

        return result       

In [3]:
from collections import Counter
class FrequencyModel:
    def __init__(self, text_normalizer):
        self.text_normalizer = text_normalizer
        self.tf_table = {}
        self.idf_table = {}
        self.doc_len = {}
        self.doc_max_tf = {}
        self.corpus_len = 0
        self.avg_doc_len = 0

    def fit(self, corpus, normalized_tf=True):
        for doc in corpus:
            self.corpus_len += 1
            self.doc_max_tf[doc.doc_id] = 0

            terms = self.text_normalizer.normalize(doc.text)
            doc_tf = Counter(terms)
            self.doc_len[doc.doc_id] = len(doc_tf)
            self.avg_doc_len += len(doc_tf)

            for term, freq in doc_tf.items():
                try:
                    self.tf_table[term][doc.doc_id] = freq
                except KeyError:
                    self.tf_table[term] = {doc.doc_id : freq}
                
                self.doc_max_tf[doc.doc_id] = max(self.doc_max_tf[doc.doc_id], freq)

        self.avg_doc_len /= self.corpus_len
            
        for term in self.tf_table:
            self.idf_table[term] = self._compute_idf(term)

            if normalized_tf:
                for doc, freq in self.tf_table[term].items():
                    self.tf_table[term][doc] = freq / self.doc_max_tf[doc]

    def _compute_idf(self, term):
        raise NotImplementedError()

    
    def retrieve(self, query):
        raise NotImplementedError()  

In [4]:
class VectorSpaceModel(FrequencyModel):
    def fit(self, corpus):
        super().fit(corpus)

        self.doc_norm = {}
        for term, docs in self.tf_table.items():
            for doc, freq in docs.items():
                self.doc_norm[doc] = self.doc_norm.get(doc, 0) + (self.tf_table[term][doc] * self.idf_table[term]) ** 2

    def retrieve(self, query):
        a = 0.4
        terms = self.text_normalizer.normalize(query)
        counter = Counter(terms)
        max_freq = max(counter.values())

        score = {}
        query_norm = 0
        for term, freq in counter.items():
            if term not in self.tf_table:
                continue
            # compute weight of the term in the query
            w_iq = self.idf_table[term] *(a + (1-a) * freq / max_freq) 
            query_norm += w_iq ** 2

            # only check documents where appears the term
            for doc, tf in self.tf_table[term].items():
                try:
                    score[doc] += w_iq * tf * self.idf_table[term]
                except:
                    score[doc] = w_iq * tf * self.idf_table[term]
        
        for doc in score:
            score[doc] = score[doc] / np.sqrt(w_iq * self.doc_norm[doc]) 
        return sorted(score.items(), key=lambda item: item[1], reverse=True)
         
    def _compute_idf(self, term):
        return np.log10(self.corpus_len / len(self.tf_table[term]))
  

In [5]:
class OkapiBM25Model(FrequencyModel):
    def __init__(self, text_normalizer, k1=1.5, b=0.75):
        super().__init__(text_normalizer)
        self.k1 = k1
        self.b = b

    def retrieve(self, query):
        terms = self.text_normalizer.normalize(query)
        counter = Counter(terms)
        max_freq = max(counter.values())

        score = {}
        for term, freq in counter.items():
            if term not in self.tf_table:
                continue
            for doc, tf in self.tf_table[term].items():
                numerator = self.idf_table[term] * self.tf_table[term][doc] * (self.k1 + 1)
                denominator = self.tf_table[term][doc] + self.k1 * (1 - self.b + self.b * self.doc_len[doc] / self.avg_doc_len)
                try:
                    score[doc] += numerator/denominator
                except:
                    score[doc] = numerator/denominator
        
        return sorted(score.items(), key=lambda item: item[1], reverse=True)
         
    def _compute_idf(self, term):
        return np.log10( 1 + (self.corpus_len - len(self.tf_table[term]) + 0.5) / len(self.tf_table[term]) + 0.5)
  

In [6]:
!pip install ir_datasets
!pip install pandas
import ir_datasets
import pandas as pd
cranfield = ir_datasets.load('cranfield')



In [7]:
cranfield.qrels_defs()

{-1: 'References of no interest.',
 1: 'References of minimum interest, for example, those that have been included from an historical viewpoint.',
 2: 'References which were useful, either as general background to the work or as suggesting methods of tackling certain aspects of the work.',
 3: 'References of a high degree of relevance, the lack of which either would have made the research impracticable or would have resulted in a considerable amount of extra work.',
 4: 'References which are a complete answer to the question.'}

In [8]:
# some documents relevant to query 1 used for evaluation
df = pd.DataFrame(filter(lambda x: x.query_id == "1", cranfield.qrels_iter()))
df

Unnamed: 0,query_id,doc_id,relevance,iteration
0,1,184,2,0
1,1,29,2,0
2,1,31,2,0
3,1,12,3,0
4,1,51,3,0
5,1,102,3,0
6,1,13,4,0
7,1,14,4,0
8,1,15,4,0
9,1,57,2,0


In [9]:
text_normalizer = SimpleTextNormalizer()
vector_model = VectorSpaceModel(text_normalizer)
vector_model.fit(cranfield.docs_iter())

In [10]:
query = list(cranfield.queries_iter())[0]
print(query)
list(vector_model.retrieve(query.text))[:11]

GenericQuery(query_id='1', text='what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .')


[('13', 1.0319418735338295),
 ('359', 1.0156193565682672),
 ('56', 0.8098472770603728),
 ('51', 0.7915209937794212),
 ('1186', 0.7204737576574822),
 ('665', 0.6829625562413993),
 ('12', 0.6549532419624435),
 ('1268', 0.5654093726232157),
 ('327', 0.557975578510696),
 ('486', 0.5561497215854606),
 ('746', 0.5273731520717708)]

In [11]:
okapi_model = OkapiBM25Model(text_normalizer)
okapi_model.fit(cranfield.docs_iter())

In [12]:
query = list(cranfield.queries_iter())[0]
print(query)
list(okapi_model.retrieve(query.text))[:11]

GenericQuery(query_id='1', text='what similarity laws must be obeyed when constructing aeroelastic models\nof heated high speed aircraft .')


[('359', 3.44470422594676),
 ('12', 3.399315017424871),
 ('13', 3.370183889780178),
 ('184', 2.928419229117983),
 ('875', 2.758998415206621),
 ('429', 2.42491993791195),
 ('663', 2.410842356650673),
 ('51', 2.3318772977784348),
 ('1186', 2.3301020192507096),
 ('56', 2.305697295289492),
 ('141', 2.3042831156270567)]

In [None]:
query = "enter your query here"

In [None]:
okapi_model.retrieve(query)

In [None]:
vector_model.retrieve(query)