In [28]:
import gensim
from gensim import corpora
import math

In [35]:
class BM25 :
    """TF-IDF and BM25-style scoring over a text file holding one document per line.

    Each line of ``fn_docs`` is stripped and split on ``delimiter`` to obtain
    the document's tokens. The constructor builds the gensim dictionary and
    all per-document statistics eagerly, so the instance is ready for
    ``BM25Score`` / ``TFIDF`` as soon as it is created.
    """

    def __init__(self, fn_docs, delimiter='|') :
        """Index the corpus stored in ``fn_docs``.

        Args:
            fn_docs: Path of the corpus file (one document per line).
            delimiter: Separator used to split a line into tokens.
        """
        self.dictionary = corpora.Dictionary()  # token <-> integer id mapping
        self.DF = {}             # term id -> number of documents containing the term
        self.delimiter = delimiter
        self.DocTF = []          # per document: {term id: count / document length}
        self.DocIDF = {}         # term id -> log((N - df + 0.5) / (df + 0.5))
        self.N = 0               # number of documents read
        self.DocAvgLen = 0       # mean token count per document
        self.fn_docs = fn_docs
        self.DocLen = []         # per document: token count
        self.buildDictionary()
        self.TFIDF_Generator()

    def buildDictionary(self) :
        """Feed every tokenised line of the corpus file into ``self.dictionary``."""
        # The context manager closes the file even if tokenising or
        # add_documents raises.
        with open(self.fn_docs) as corpus :
            self.dictionary.add_documents(
                line.strip().split(self.delimiter) for line in corpus
            )

    def TFIDF_Generator(self, base=math.e) :
        """Compute document lengths, term frequencies, document frequencies and IDF.

        Appends to ``DocLen`` / ``DocTF``, updates ``DF`` and ``N``, then fills
        ``DocIDF`` and ``DocAvgLen``.

        Args:
            base: Logarithm base used for the IDF values.
        """
        docTotalLen = 0
        with open(self.fn_docs) as corpus :
            for line in corpus :
                doc = line.strip().split(self.delimiter)
                # str.split with an explicit separator always yields at least
                # one element, so doc_len is never zero here.
                doc_len = len(doc)
                docTotalLen += doc_len
                self.DocLen.append(doc_len)
                bow = {
                    term: freq * 1.0 / doc_len
                    for term, freq in self.dictionary.doc2bow(doc)
                }
                for term in bow :
                    self.DF[term] = self.DF.get(term, 0) + 1
                self.DocTF.append(bow)
                self.N = self.N + 1
        for term in self.DF :
            self.DocIDF[term] = math.log((self.N - self.DF[term] + 0.5) / (self.DF[term] + 0.5), base)
        # An empty corpus file has no documents; keep the average at 0 rather
        # than raising ZeroDivisionError from inside the constructor.
        self.DocAvgLen = docTotalLen / self.N if self.N else 0

    def BM25Score(self, Query=None, k1=1.5, b=0.75) :
        """Score every indexed document against a tokenised query.

        Args:
            Query: List of query tokens; ``None`` (the default) is treated as
                an empty query.
            k1: Term-frequency saturation parameter.
            b: Document-length normalisation parameter.

        Returns:
            A list with one score per document, in corpus order. Documents
            sharing no term with the query score 0.
        """
        if Query is None :
            Query = []
        # Only the set of query term ids matters for scoring, so build it once
        # instead of once per document.
        query_terms = {term for term, _ in self.dictionary.doc2bow(Query)}
        scores = []
        for idx, doc in enumerate(self.DocTF) :
            commonTerms = query_terms & set(doc)
            # The length-normalisation part of the denominator is the same for
            # every term of a given document.
            length_norm = k1 * (1 - b + b * self.DocLen[idx] / self.DocAvgLen)
            # NOTE(review): doc[term] is the length-normalised frequency stored
            # by TFIDF_Generator, not a raw count as in the textbook BM25
            # formula -- confirm this is intended.
            tmp_score = [
                self.DocIDF[term] * (doc[term] * (k1 + 1)) / (doc[term] + length_norm)
                for term in commonTerms
            ]
            scores.append(sum(tmp_score))
        return scores

    def TFIDF(self) :
        """Return, per document, a list of ``(term id, tf * idf)`` pairs sorted by term id."""
        return [
            sorted((term, tf * self.DocIDF[term]) for term, tf in doc.items())
            for doc in self.DocTF
        ]

    def Items(self) :
        """Return the dictionary's ``(term id, token)`` pairs in sorted order."""
        return sorted(self.dictionary.items())
    

In [36]:
# Index a space-delimited corpus stored with one document per line.
fn_docs = 'doc.txt'
bm25 = BM25(fn_docs, delimiter=' ')

# Show the vocabulary as (term id, token) pairs.
tfidf = bm25.TFIDF()
print(bm25.Items())

# Tokenise the query on whitespace and score every document against it.
Query = 'The intersection graph of paths in trees survey Graph'.split()
scores = bm25.BM25Score(Query)

# Dump each document's TF-IDF vector, prefixed with its position in the corpus.
for i, tfidfscore in enumerate(tfidf):
    print(i, tfidfscore)

[(0, 'Human'), (1, 'abc'), (2, 'applications'), (3, 'computer'), (4, 'for'), (5, 'interface'), (6, 'lab'), (7, 'machine'), (8, 'A'), (9, 'of'), (10, 'opinion'), (11, 'response'), (12, 'survey'), (13, 'system'), (14, 'time'), (15, 'user'), (16, 'EPS'), (17, 'The'), (18, 'management'), (19, 'System'), (20, 'and'), (21, 'engineering'), (22, 'human'), (23, 'testing'), (24, 'Relation'), (25, 'error'), (26, 'measurement'), (27, 'perceived'), (28, 'to'), (29, 'binary'), (30, 'generation'), (31, 'random'), (32, 'trees'), (33, 'unordered'), (34, 'graph'), (35, 'in'), (36, 'intersection'), (37, 'paths'), (38, 'Graph'), (39, 'IV'), (40, 'Widths'), (41, 'ordering'), (42, 'quasi'), (43, 'well'), (44, 'minors')]
0 [(0, 0.2168251319235133), (1, 0.2168251319235133), (2, 0.2168251319235133), (3, 0.13732653608351372), (4, 0.2168251319235133), (5, 0.13732653608351372), (6, 0.2168251319235133), (7, 0.2168251319235133)]
1 [(3, 0.10986122886681099), (8, 0.10986122886681099), (9, -0.1238078416812447), (10, 0