In [2]:
import pickle
import spacy
import codecs
import pprint
import math
from bs4 import BeautifulSoup

iInd = pickle.load(open("scripts/inv_ind_f.p", "rb"))
word_counters = pickle.load(open("scripts/word_count_f.p", "rb"))
unique_word_counters = pickle.load(open("scripts/unique_words_count_f.p", "rb"))

nlp = spacy.load('en')

### load topic to calculate

In [3]:
text_file = codecs.open("data/TREC8all/testtopic.txt", 'r', "iso-8859-1").read()
soup = BeautifulSoup(text_file, "lxml")
topic_words = {}
topic_to_score = 401
pos = topic_to_score - 401
texts = [soup.find_all("top")[pos].desc.text, 
         soup.find_all("top")[pos].narr.text, 
         soup.find_all("top")[pos].title.text]
docs = nlp.pipe(texts)
word_count = 0

for doc in docs:
    for token in doc:
        if token.is_alpha and not token.is_stop and len(token.orth_) > 1:
            word_count = word_count + 1
            strtok = token.lemma_.strip()
            if strtok not in topic_words.keys():
                topic_words[strtok] = 1
            else:
                topic_words[strtok] = topic_words[strtok] + 1



In [4]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(topic_words)

{'house': 2, 'parking': 1}


## Base functions

In [5]:
def tf(word_occ, word_count):
    return word_occ / word_count

def idf(doc_count, contains_count):
    return math.log(doc_count / contains_count)

def avgdl(all_dwords_count, doc_count):
    return all_dwords_count / doc_count

## TF-IDF

In [6]:
def tf_idf(word_occ, word_count, doc_count, contains_count):
    return tf(word_occ, word_count) * idf(doc_count, contains_count)

## BM25

In [7]:
def bm25(word_occ, word_count, doc_count, contains_count, all_dwords_count, b = 0.75, k = 1.2):
    return (idf(doc_count, contains_count) * (tf(word_occ, word_count) * (k + 1)) / 
            (tf(word_occ, word_count) + k * (1 - b + b * word_count / avgdl(all_dwords_count, doc_count))))

## BM25VA

In [8]:
mavgtf_v = None

def avgtf(word_count, unique_word_count):
    return word_count / unique_word_count

def mavgtf(doc_count):
    global mavgtf_v
    if mavgtf_v == None:
        sum_m = 0
        for docno, u_c in unique_word_counters.items():
            sum_m = sum_m + avgtf(word_counters[docno], u_c)
        mavgtf_v = sum_m / doc_count
    return mavgtf_v

def bva(word_count, doc_count, all_dwords_count, unique_word_count):
    mavgtf_l = mavgtf(doc_count)
    return (pow(mavgtf_l, -2) * avgtf(word_count, unique_word_count) + (1 - pow(mavgtf_l, -1)) * 
            word_count / avgdl(all_dwords_count, doc_count))

## Score one topic

In [23]:
def score(scoring_method = "tf-idf"):
    scores = {}
    print(word_counters["doc_counter"])
    print(word_count)
    print(word_counters["word_counter"])
    
    for tk, tv in topic_words.items():
        print("Current word: {} - Occurence in topic: {}".format(tk, tv))
        if scoring_method == "tf-idf":
            t_score = tf_idf(tv, word_count, word_counters["doc_counter"], len(iInd[tk]) + 1)
        elif scoring_method == "bm25":
            t_score = bm25(tv, word_count, word_counters["doc_counter"], len(iInd[tk]) + 1, 
                           word_counters["word_counter"])
        elif scoring_method == "bm25va":
            b = bva(word_count, word_counters["doc_counter"], word_counters["word_counter"], len(topic_words))
            t_score = bm25(tv, word_count, word_counters["doc_counter"], len(iInd[tk]) + 1, 
                           word_counters["word_counter"], b)        
        print("Score of word \"{}\" in topic: \"{}\"".format(tk, t_score))
        print("docs containing the word {}".format(len(iInd[tk]) + 1))
        for dk, dv in iInd[tk].items():
            if scoring_method == "tf-idf":
                d_score = tf_idf(dv, word_counters[dk], word_counters["doc_counter"], len(iInd[tk]) + 1)
            elif scoring_method == "bm25":
                d_score = bm25(dv, word_counters[dk], word_counters["doc_counter"], len(iInd[tk]) + 1, 
                               word_counters["word_counter"])
            elif scoring_method == "bm25va":
                b = bva(word_counters[dk], word_counters["doc_counter"], word_counters["word_counter"], 
                        unique_word_counters[dk])
                d_score = bm25(dv, word_counters[dk], word_counters["doc_counter"], len(iInd[tk]) + 1, 
                               word_counters["word_counter"], b)
            if scores.get(dk) == None:
                scores[dk] = t_score * d_score
            else:
                scores[dk] = scores[dk] + t_score * d_score
            if dk == "LA091990-0103" or dk == "LA012889-0064":
                print("The document \"{}\" has the word \"{}\" \"{}\" times.".format(dk, unique_word_counters[dk], dv))
                print("The score in the doc is {}. The total score of the document is {}.".format(d_score, scores[dk]))
    print(mavgtf_v)
    print("Length LA091990 {} length LA012889 {}".format(word_counters["LA091990-0103"], word_counters["LA012889-0064"]))
    return scores

In [14]:
results = score("bm25")

[(k, results[k])[:1000] for k in sorted(results, key=results.get, reverse=True)]

528106
3
137955388
Current word: house - Occurence in topic: 2
Score of word "house" in topic: "3.4377938095387037"
docs containing the word 53477
The document "LA091990-0103" has the word "house" "4" times.
The score in the doc is 0.02499236737078172. The total score of the document is 0.08591860583299048.
The document "LA012889-0064" has the word "house" "8" times.
The score in the doc is 0.011611290182283367. The total score of the document is 0.03991722150941129.
Current word: parking - Occurence in topic: 1
Score of word "parking" in topic: "5.310809301499194"
docs containing the word 4992
The document "LA091990-0103" has the word "parking" "3" times.
The score in the doc is 0.03820194758211394. The total score of the document is 0.28880186438746586.
The document "LA012889-0064" has the word "parking" "6" times.
The score in the doc is 0.017736583013978388. The total score of the document is 0.13411283155686032.
None
Length LA091990 441 length LA012889 960


[('FT911-856', 16.803873840168738),
 ('FT924-7823', 12.536108789788052),
 ('FT921-10189', 10.687242701341782),
 ('LA090990-0017', 9.812111919819335),
 ('FT933-16081', 8.980695527355419),
 ('LA091390-0139', 8.90572216232051),
 ('LA092090-0080', 8.682822361263293),
 ('LA092790-0091', 8.473301188101333),
 ('LA030189-0054', 7.5373355630210614),
 ('LA121390-0171', 7.247432016390892),
 ('LA081690-0230', 7.144291639146748),
 ('LA011590-0055', 6.961322095426645),
 ('FT934-16935', 6.959338183792705),
 ('FT924-12632', 6.856144572042819),
 ('FT911-1809', 6.856144572042819),
 ('LA122490-0084', 6.731054747788109),
 ('LA100390-0099', 6.712967150902295),
 ('LA051790-0084', 6.500651218325379),
 ('LA070689-0049', 6.465845039339682),
 ('LA031089-0108', 6.290992270286418),
 ('LA032990-0150', 6.077825415680307),
 ('LA062289-0093', 5.979283373332015),
 ('FT934-14934', 5.923313761336297),
 ('FT942-15662', 5.923313761336297),
 ('FT932-17083', 5.923313761336297),
 ('LA033190-0094', 5.911414115879052),
 ('LA06

In [23]:
results = score("tf-idf")

[(k, results[k])[:1000] for k in sorted(results, key=results.get, reverse=True)]

528106
33
137955388
Current word: description - Occurence in topic: 1
Score of word "description" in topic: "0.127709222799227"
docs containing the word 7806
Current word: language - Occurence in topic: 1
Score of word "language" in topic: "0.0602320357724622"
docs containing the word 72359
Current word: cultural - Occurence in topic: 1
Score of word "cultural" in topic: "0.11780397364437657"
docs containing the word 10824
Current word: difference - Occurence in topic: 1
Score of word "difference" in topic: "0.09556651520389511"
docs containing the word 22547
Current word: impede - Occurence in topic: 1
Score of word "impede" in topic: "0.17425767356720442"
docs containing the word 1680
Current word: integration - Occurence in topic: 2
Score of word "integration" in topic: "0.2630978491391039"
docs containing the word 6877
The document "FT942-15632" has the word "integration" "1" times.
The score in the doc is 0.0070131090642895225. The total score of the document is 0.0018451339105925

[('LA102690-0021', 0.13890256674957088),
 ('FR940817-0-00050', 0.12779456394362168),
 ('FBIS3-4730', 0.12370996141167616),
 ('LA070789-0082', 0.11319836271872762),
 ('FT924-14033', 0.11152540239747188),
 ('FT934-791', 0.10734827732388387),
 ('FT931-5571', 0.10734827732388387),
 ('LA122890-0124', 0.10412208298039455),
 ('FT923-7355', 0.10409037557097377),
 ('FT933-10906', 0.10409037557097377),
 ('FT933-2793', 0.10383071733243401),
 ('FT922-8550', 0.10376034802619162),
 ('FT923-1528', 0.10167133285559596),
 ('FT922-8412', 0.10150742013379707),
 ('FR940817-0-00048', 0.10082986714479582),
 ('FBIS4-32758', 0.09942821247526344),
 ('FT941-15226', 0.09874509240791236),
 ('FBIS4-42635', 0.09758472709778791),
 ('FR940817-0-00062', 0.09593905286531769),
 ('FR940817-0-00061', 0.09357234827263484),
 ('FT922-4874', 0.09184444903321215),
 ('LA021289-0171', 0.09184444903321215),
 ('FT924-6537', 0.09114476376556178),
 ('LA040689-0047', 0.09114476376556178),
 ('FBIS4-42105', 0.08945689776990323),
 ('LA0

In [24]:
results = score("bm25va")
# [(k, results[k])[:1000] for k in sorted(results, key=results.get, reverse=True)]

528106
3
137955388
Current word: house - Occurence in topic: 2
Score of word "house" in topic: "2.7379158466823537"
docs containing the word 53477
The document "LA091990-0103" has the word "284" "4" times.
The score in the doc is 0.020510586143401982. The total score of the document is 0.05615625882676379.
The document "LA012889-0064" has the word "577" "8" times.
The score in the doc is 0.005338018059166682. The total score of the document is 0.01461504423406904.
Current word: parking - Occurence in topic: 1
Score of word "parking" in topic: "3.8262229556610348"
docs containing the word 4992
The document "LA091990-0103" has the word "284" "3" times.
The score in the doc is 0.03134436572898966. The total score of the document is 0.17608679050965906.
The document "LA012889-0064" has the word "577" "6" times.
The score in the doc is 0.008151438695920972. The total score of the document is 0.045804266094065516.
1.6746965554278923
Length LA091990 441 length LA012889 960
