# BM25 检索模型
## 流程
- 数据预处理
- 建立倒排索引
- 实现OR操作
- 实现f(q,d)核函数
- 查询操作返回topK relevance 结果

In [100]:
import json
import nltk
import math
from bs4 import BeautifulSoup
import string

### 数据预处理
- 解析json格式数据得到dict_list
- 去非关键性标点，换行符等，split
- 建立vocab（Tokenize）
- 保存结果数据
###### 注意：text 与 texts 中的文档顺序一致；建立 vocab 时，同一个词在一篇文档中只计数一次

In [108]:
# Load the tweet corpus: the file holds one JSON object per line.
# text[i] and docid[i] describe the same tweet, so the two lists stay aligned.
with open('./data/tweets.txt', 'r') as f:  # context manager closes the handle
    lines = f.readlines()
text = []
docid = []
for l in lines:
    tweet = json.loads(l)  # decode each line once rather than once per field
    text.append(tweet['text'])
    docid.append(tweet['tweetId'])

In [111]:
# Tokenize every tweet: delete all ASCII punctuation, then split on whitespace.
# texts[i] is the token list of text[i]; case is left untouched.
texts = []
# NOTE(review): `symbol` is not referenced by any code visible in this file;
# the tokenizer below strips all of string.punctuation instead. Kept in case
# another cell still relies on it.
symbol = [',',':','_','!','\"','*','>','<','@','~','-','(',')','%','=','\\','^'
          ,'&','|','#','$','[',']','+',':','#','|']
# The translation table does not depend on the tweet, so build it once
# instead of once per loop iteration.
trantab = str.maketrans({key: None for key in string.punctuation})
for l in text:
    j = l.translate(trantab)
    j = j.split()
    texts.append(j)

### 建立vocab

In [28]:
# Document frequency of every token: a tweet contributes at most 1 per
# distinct token because its tokens are de-duplicated before counting.
vocab = {}
for line in texts:
    for word in set(line):
        vocab[word] = vocab.get(word, 0) + 1

# Rebuild the dict most-frequent-first so that its iteration order follows
# document frequency (the sort is stable, so ties keep their insertion order).
v_tuple = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
vocab = dict(v_tuple)

In [29]:
# Persist the vocabulary (token -> document frequency) as one JSON object.
# Serialize first so the target file is not truncated if dumping fails, and
# let the context manager close the handle even when the write raises.
line_str = json.dumps(vocab)
with open('./data/vocab.txt', 'w+') as f:
    f.write(line_str)

## 建立倒排索引

In [30]:
# Inverted index: token -> posting list of the positions (indices into
# `texts`) of the tweets that contain the token.
keys = list(vocab)
word2inverted_index = {token: [] for token in keys}  # one empty posting list per token

for i, line in enumerate(texts):
    # De-duplicate so a tweet appears at most once in a posting list; since i
    # only increases, every posting list ends up in ascending order.
    for word in set(line):
        word2inverted_index[word].append(i)

### 建立文档频率（DF）列表，用于计算IDF

In [31]:
# Despite the name, this stores each token's document frequency (the length
# of its posting list), aligned index-for-index with `keys`.
d_idf = [len(word2inverted_index[token]) for token in keys]

### 计算avgD

In [32]:
# Token count of every tweet and their mean, i.e. the average document length.
lend = list(map(len, texts))
avgD = sum(lend) / len(lend)

### OR操作

In [33]:
def OR_op(l1,l2):
    """Merge two ascending posting lists into their ascending union.

    A value present in both lists is emitted once. Duplicates inside a
    single input list are not collapsed.

    Args:
        l1: ascending sequence of document ids.
        l2: ascending sequence of document ids.

    Returns:
        A new list with the merged ids.
    """
    merged = []
    a, b = 0, 0
    n1, n2 = len(l1), len(l2)
    while a < n1 and b < n2:
        left, right = l1[a], l2[b]
        if left == right:
            # Same id on both sides: keep one copy, advance both cursors.
            merged.append(left)
            a += 1
            b += 1
        elif left < right:
            merged.append(left)
            a += 1
        else:
            merged.append(right)
            b += 1
    # At most one of the inputs still has a tail; append whatever is left.
    merged.extend(l1[a:])
    merged.extend(l2[b:])
    return merged

## 实现计算relevance 的核函数f(q,d)
- q:列表 query
- d:列表 document


In [322]:
# Ranking parameters read by fqd:
#   ki - constant in the term-frequency component (controls TF saturation)
#   b  - weight of the document-length normalization
ki, b = 5, 0.5
# Number of documents in the collection.
M = len(texts)
def count(w,d):
    """Return how many elements of the iterable ``d`` compare equal to ``w``."""
    return sum(1 for token in d if w == token)

def fqd(q,d):
    """Score document ``d`` against query ``q`` with a BM25-style function.

    For every distinct query term w that occurs in ``d`` and in the index,
    the score accumulates

        c(w, q) * (ki + 1) * c(w, d) / (c(w, d) + ki * (1 - b + b * |d| / avgD))
                * log2((M + 1) / df(w))

    where c(., .) is a raw occurrence count and df(w) is the number of
    indexed documents that contain w.

    Args:
        q: list of query tokens; repeating a token raises its weight.
        d: list of document tokens.

    Returns:
        The accumulated relevance score; 0 when no query term matches.
    """
    result_relevance = 0
    # The length normalization depends only on the document: compute it once.
    length_norm = ki * (1 - b + b * len(d) / avgD)
    for w in set(q):  # each distinct query term contributes one summand
        # O(1) dict lookup. The posting-list length is the same value as
        # d_idf[keys.index(w)] (both come from word2inverted_index), without
        # scanning the whole vocabulary list twice per term.
        postings = word2inverted_index.get(w)
        if not postings:
            continue  # term is not in the index
        tf = count(w, d)  # computed once and reused below
        if tf == 0:
            continue  # term does not occur in this document
        result_relevance += count(w, q) * (ki + 1) * tf / (tf + length_norm) * math.log2((M + 1.0) / len(postings))
    return result_relevance

### 实现topk查询

In [323]:
# Load the stop-word list: one word per line, trailing newline removed.
# Opened read-only inside a context manager so the handle is closed even if
# reading fails.
with open('stopword', 'r') as f_stop:
    stopwords = [s.strip('\n') for s in f_stop.readlines()]

def _rank_documents(Q):
    """Return the indices (into ``texts``) of all tweets matching ``Q``, best first.

    The query is tokenized like the corpus (ASCII punctuation removed, split
    on whitespace), stop words are dropped, every tweet containing at least
    one remaining indexed term becomes a candidate, and candidates are
    ordered by descending ``fqd`` score.
    """
    trantab = str.maketrans({key: None for key in string.punctuation})
    qs = [term for term in Q.translate(trantab).split() if term not in stopwords]

    # Candidate set: union of the posting lists of every indexed query term.
    ds = []
    for w in qs:
        if w in word2inverted_index:  # dict lookup instead of scanning `keys`
            ds = OR_op(ds, word2inverted_index[w])

    relevance = {d: fqd(qs, texts[d]) for d in ds}  # document index -> score
    # sorted() is stable, so documents with equal scores keep the order in
    # which OR_op produced them.
    ranked = sorted(relevance.items(), key=lambda item: item[1], reverse=True)
    return [d for d, _ in ranked]


def query_topk(Q,k = 20):
    """Return the token lists of the ``k`` highest-scoring tweets for ``Q``.

    Args:
        Q: raw query string.
        k: maximum number of results; fewer are returned when fewer tweets
            match, and a non-positive ``k`` yields an empty list.

    Returns:
        A list of token lists (entries of ``texts``), best match first.
    """
    ranked = _rank_documents(Q)
    return [texts[d] for d in ranked[:max(k, 0)]]


def query_topk_id(Q,k = 20):
    """Return the tweet ids of the ``k`` highest-scoring tweets for ``Q``.

    Args:
        Q: raw query string.
        k: maximum number of results; fewer are returned when fewer tweets
            match, and a non-positive ``k`` yields an empty list.

    Returns:
        A list of entries of ``docid``, best match first.
    """
    ranked = _rank_documents(Q)
    return [docid[d] for d in ranked[:max(k, 0)]]

In [324]:
def inred( s ):
    """Return ``s`` wrapped in the ANSI sequence ESC[31;2m ... ESC[0m.

    31;2 selects red, faint text and 0 resets the attributes, so a terminal
    that honors ANSI escapes shows ``s`` highlighted.
    """
    escape = chr(27)
    return escape + "[31;2m" + str(s) + escape + "[0m"
def print_line_with_important(line , Q):
    """Print the tokens of ``line``, highlighting those that occur in ``Q``.

    Args:
        line: list of string tokens of one tweet.
        Q: raw query string; it is stripped of ASCII punctuation and split on
            whitespace before matching (stop words are not removed here).

    Every token is followed by a single space; matching tokens are passed
    through ``inred`` first.
    """
    strip_punct = str.maketrans({ch: None for ch in string.punctuation})
    query_tokens = Q.translate(strip_punct).split()
    pieces = [
        (inred(tok) if tok in query_tokens else tok) + ' '  # highlight query hits
        for tok in line
    ]
    print(''.join(pieces))

def print_top_k(Q):
    """Print the default-sized top result list for ``Q`` with query terms highlighted."""
    hits = query_topk(Q)
    for tokens in hits:
        print_line_with_important(tokens, Q)

def save_top_k(Qid,Q,k,f):
    """Write the top ``k`` tweet ids for query ``Q`` to ``f``.

    One line is written per result, in ranking order, in the form
    "<Qid> <tweet id>" followed by a newline.

    Args:
        Qid: query identifier written at the start of every line.
        Q: raw query string.
        k: maximum number of results to write.
        f: writable text file object.
    """
    tweet_ids = query_topk_id(Q, k)
    prefix = str(Qid) + " "
    for tweet_id in tweet_ids:
        f.write(prefix + str(tweet_id) + "\n")

In [325]:
# Parse the query file: the text of every <query> element directly under a
# <top> element, and the matching <num> element for its id.
# BeautifulSoup consumes the handle in its constructor, so the file can be
# closed (by the context manager, even on error) before the selects run.
with open('query.txt', 'r') as f:
    Soup = BeautifulSoup(f, 'lxml')
query = [q.getText() for q in Soup.select('top > query')]
# [11:-1] drops a fixed 11-character prefix and the final character of each
# <num> text. NOTE(review): presumably a " Number: MB" prefix and a trailing
# space, leaving the numeric id - verify against query.txt.
queryid = [q.getText()[11:-1] for q in Soup.select('top > num')]

In [326]:
# Write the top 200 tweet ids of every query to result.txt, one
# "<query id> <tweet id>" line per hit. The context manager guarantees the
# file is flushed and closed even if a query raises part-way through.
with open('result.txt', 'w+') as f:
    for i, q in enumerate(query):
        save_top_k(queryid[i], q, 200, f)

In [327]:
# Demo: print the highlighted top results for the query at index 4.
print_top_k(query[4])
# Bare expression: as the last line of a notebook cell it echoes the raw
# query text below the printed results.
query[4]

I am [31;2mnaming[0m this [31;2mstorm[0m Finding [31;2mNemo[0m wheresheat 
They are [31;2mnaming[0m snow storms now Winter [31;2mstorm[0m [31;2mNemo[0m Really 
When the hell did we start [31;2mnaming[0m nonhurricanes Storm [31;2mNemo[0m Isnt that a cartoon fish How scary is a [31;2mstorm[0m named [31;2mNemo[0m 
Did they name the [31;2mstorm[0m [31;2mNemo[0m so every news and weather outlet can use the phrase Finding [31;2mNemo[0m when tracking the [31;2mstorm[0m [31;2mNemo[0m 
NormansCat [31;2mNemo[0m They named the [31;2mstorm[0m [31;2mNemo[0m Wow awesome 
Wait they named this [31;2mstorm[0m [31;2mNemo[0m As in Finding [31;2mNemo[0m 
Winter [31;2mstorm[0m [31;2mNemo[0m Who gives names to these storms and when did they start [31;2mnaming[0m winter storms 
httptcoheJAhFSp is kind of undercutting their terrifying YOU MUST PREPARE NOW headlines by [31;2mnaming[0m the [31;2mstorm[0m [31;2mNemo[0m 
Nycmoon this [31;2mnaming[0m of winte

' commentary on naming storm Nemo '

### 对结果进行evaluate

In [328]:
from eval_hw4.eval_hw4 import *

#### 我的结果


In [332]:
# Evaluate this notebook's ranking (result.txt) against the relevance judgments.
# The helper functions below are presumably provided by the star import from
# eval_hw4.eval_hw4 - confirm.
# Rank cutoff handed to both MAP_eval and NDCG_eval.
k = 200
# Path of the query relevance (qrels) file.
file_qrels_path = 'eval_hw4/qrels.txt'
# qrels_dict = {query_id:{doc_id:gain, doc_id:gain, ...}, ...}
qrels_dict = generate_tweetid_gain(file_qrels_path)
# Result file to evaluate; it must be in the format read_tweetid_test expects
# (alternatively, write your own reader).
file_test_path = 'result.txt'
# test_dict = {query_id:[doc_id, doc_id, ...], ...}
test_dict = read_tweetid_test(file_test_path)
MAP = MAP_eval(qrels_dict, test_dict, k)
print('MAP', ' = ', MAP, sep='')
NDCG = NDCG_eval(qrels_dict, test_dict, k)
print('NDCG', ' = ', NDCG, sep='')

query: 171 ,AP: 0.751931648831
query: 172 ,AP: 0.596615551401
query: 173 ,AP: 0.133333333333
query: 174 ,AP: 0.573125485625
query: 175 ,AP: 0.686814188789
query: 176 ,AP: 0.618201010357
query: 177 ,AP: 0.711951809919
query: 178 ,AP: 0.841574334187
query: 179 ,AP: 0.525652825887
query: 180 ,AP: 0.278287629881
query: 181 ,AP: 0.888888888889
query: 182 ,AP: 0.3861003861
query: 183 ,AP: 0.755045992217
query: 184 ,AP: 0.697272570827
query: 185 ,AP: 0.805110874964
query: 186 ,AP: 0.73929657782
query: 187 ,AP: 0.868070082366
query: 188 ,AP: 0.469349245212
query: 189 ,AP: 0.0432049675471
query: 190 ,AP: 0.46514161684
query: 191 ,AP: 0.553197303165
query: 192 ,AP: 0.457157081692
query: 193 ,AP: 0.526663705071
query: 194 ,AP: 0.372395833333
query: 195 ,AP: 0.424510787276
query: 196 ,AP: 0.761638118757
query: 197 ,AP: 0.79110887911
query: 198 ,AP: 0.495060370516
query: 199 ,AP: 0.472034088134
query: 200 ,AP: 0.663251218227
query: 201 ,AP: 0.713372587697
query: 202 ,AP: 0.795626081736
query: 203 ,

#### 对比结果

In [333]:
# Same evaluation as for result.txt, run on eval_hw4/hisresult.txt for comparison.
# NOTE(review): hisresult.txt is presumably a reference ranking shipped with
# the assignment - confirm its origin.
# Rank cutoff handed to both MAP_eval and NDCG_eval.
k = 200
# Path of the query relevance (qrels) file.
file_qrels_path = 'eval_hw4/qrels.txt'
# qrels_dict = {query_id:{doc_id:gain, doc_id:gain, ...}, ...}
qrels_dict = generate_tweetid_gain(file_qrels_path)
# Result file to evaluate; it must be in the format read_tweetid_test expects
# (alternatively, write your own reader).
file_test_path = 'eval_hw4/hisresult.txt'
# test_dict = {query_id:[doc_id, doc_id, ...], ...}
test_dict = read_tweetid_test(file_test_path)
MAP = MAP_eval(qrels_dict, test_dict, k)
print('MAP', ' = ', MAP, sep='')
NDCG = NDCG_eval(qrels_dict, test_dict, k)
print('NDCG', ' = ', NDCG, sep='')

query: 171 ,AP: 0.94980405976
query: 172 ,AP: 0.682593856655
query: 173 ,AP: 0.997813620072
query: 174 ,AP: 0.567534780035
query: 175 ,AP: 0.778210116732
query: 176 ,AP: 0.827412933877
query: 177 ,AP: 0.465198145065
query: 178 ,AP: 0.915379647212
query: 179 ,AP: 0.971163259061
query: 180 ,AP: 0.220973360037
query: 181 ,AP: 1.0
query: 182 ,AP: 0.3861003861
query: 183 ,AP: 0.851063829787
query: 184 ,AP: 0.953864220785
query: 185 ,AP: 0.565475418125
query: 186 ,AP: 0.986660079051
query: 187 ,AP: 0.909298308663
query: 188 ,AP: 0.818160705003
query: 189 ,AP: 0.37571536147
query: 190 ,AP: 0.93514660103
query: 191 ,AP: 0.878837663343
query: 192 ,AP: 1.0
query: 193 ,AP: 1.0
query: 194 ,AP: 0.977071661998
query: 195 ,AP: 0.515391404296
query: 196 ,AP: 1.0
query: 197 ,AP: 0.998919203817
query: 198 ,AP: 1.0
query: 199 ,AP: 0.475059382423
query: 200 ,AP: 0.712377070122
query: 201 ,AP: 0.740740740741
query: 202 ,AP: 0.924676840984
query: 203 ,AP: 0.713699248416
query: 204 ,AP: 0.899091826819
query: