# 倒排索引与布尔检索——Pivoted Length Normalization VSM and BM25

## 载入基本所需模块

In [1]:
import math
import re
import json

### 将每条推特文本存入数组
#### 这里的数组用于后来布尔检索时的对应的文本内容输出

In [14]:
d=open('tweets.txt','r+')
text1=[]
ID=[]
texts1=d.readlines()
for w in texts1:
    text1.append(json.loads(w)['text'])
    ID.append(json.loads(w)['tweetId'])
print(text1[:3])
print(ID[:20])

['House may kill Arizona-style immigration bill, Rep. Rick Rand says: The House is unlikely to pass the "Ari... http://tinyurl.com/4jrjcdz', "Mourners recall Sarge Shriver's charity, idealism \n    (AP): AP - R. Sargent Shriver was always an optimist, pio... http://bit.ly/gqMcdG", 'Bass Fishing Techniques: 2 Fantastic Tips To Improve Your Casting Skills']
['28965792812892160', '28967095878287360', '28967672074993664', '28967914417688576', '28968479176531969', '28968581949558787', '28969422056071169', '28971749961891840', '28973080491589632', '28974862038994945', '28974904342740992', '28976409057697792', '28976831738683393', '28977078074343425', '28977078074343425', '28977806142603264', '28978641706684416', '28978659108855809', '28978692289994752', '28978703102902273']



## 建立倒排索引

### 文本去标点，分词
#### 去掉指定的一些标点，但保留完整的网站链接

In [3]:

text2=[]
for t in text1:
    line=re.sub("[&!#.,|()-]+",'',t)
    line=line.split()
    z=[]
    for word in line:
        lower=word.lower()
        z.append(lower)
    text2.append(z)   
print(text2[:10])
print(len(text2))
print(type(line))

[['house', 'may', 'kill', 'arizonastyle', 'immigration', 'bill', 'rep', 'rick', 'rand', 'says:', 'the', 'house', 'is', 'unlikely', 'to', 'pass', 'the', '"ari', 'http://tinyurlcom/4jrjcdz'], ['mourners', 'recall', 'sarge', "shriver's", 'charity', 'idealism', 'ap:', 'ap', 'r', 'sargent', 'shriver', 'was', 'always', 'an', 'optimist', 'pio', 'http://bitly/gqmcdg'], ['bass', 'fishing', 'techniques:', '2', 'fantastic', 'tips', 'to', 'improve', 'your', 'casting', 'skills'], ['financial', 'aid', 'proper', 'method', 'of', 'getting', 'financial', 'aid', 'for', 'education', 'http://pingfm/bk0r3', 'applyingforfinancialaid', 'financialaidessay'], ['supreme', 'court:', "nasa's", 'intrusive', 'background', 'checks', 'ok', 'http://bitly/h2jgy9'], ['the', 'mcdonalds', 'music', 'to', 'fireworks', 'is', 'an', 'all', 'time', 'low'], ['@alyce', 'very', 'sweet', 'and', 'quiet', 'if', 'not', 'polished', 'bono', 'hansard', 'at', 'sgt', "shriver's", 'funeral', '2day:', 'http://youtube/bf14xbbcvzg', 'when', 'wa

### 构建词袋

In [4]:
wordsbag={}
for text in text2:
    text=set(text)
    for word in text:
        word=word.lower()
        if word not in wordsbag:
            wordsbag[word]=1
        else:
            wordsbag[word]+=1
L = sorted(wordsbag.items(),key = lambda x:x[1],reverse=True) #按升序排序
print(type(L))
print(L[:10])

<class 'list'>
[('the', 9661), ('to', 7701), ('in', 5867), ('of', 5587), ('a', 5456), ('for', 4358), ('on', 4179), ('and', 4157), ('rt', 3669), ('is', 3203)]


In [5]:
vocab=[]
for m in range(len(L)):
    vocab.append(L[m][0])
print(vocab[:10])
print(len(L))

['the', 'to', 'in', 'of', 'a', 'for', 'on', 'and', 'rt', 'is']
67947


In [6]:
dl=[]
for i in range(len(text2)):
    dl.append(len(text2[i]))
print(dl[:10])
avdl=sum(dl)/len(text2)
print(sum(dl))
print(avdl)

[19, 17, 11, 13, 8, 10, 19, 23, 13, 22]
453521
14.846176509100433


### 找到每个词出现过的位置（位置为所在的tweetID）以及该词分别在对应文档中的个数
### 这里创建了两个字典，一个既存词的所在位置也存词在该tweet中出现的次数，另一个只存词的所在位置。这么做是为了方便之后Pivoted Length Normalization VSM 以及 BM25 相应公式的计算


In [15]:
%%time
#初始化所需的倒排索引字典
inverted_index={}#既存词的所在位置也存词在该tweet中出现的次数
justindex={}#只存词的所在位置
for i in vocab:
    inverted_index[i]=[]
    justindex[i]=[]
    
for t in text2:
    number={}
    for word in t:
        if word not in number:
            number[word]=1
        else:
            number[word]+=1    
    line=set(t)
    for word in line:
        inverted_index[word].append((ID[text2.index(t)],number[word]))
        justindex[word].append(ID[text2.index(t)])

Wall time: 9min 40s


### 输入任意词进行测试

In [21]:
print(inverted_index['win'])#找到任意一个指定词所在的tweetID及其在对应tweet中的出现次数
print(justindex['win'])

[('29072363442147329', 1), ('29123642625368064', 1), ('29168726137901056', 1), ('29208577998594048', 1), ('29771509908111361', 1), ('29852246200029184', 1), ('29917417236860928', 1), ('29956594770907137', 1), ('30037191400890368', 1), ('30091359255789568', 1), ('30101530644316160', 1), ('30110157539123200', 1), ('30141947922358272', 1), ('30229057178177536', 1), ('30544174625333249', 1), ('30566403673165824', 1), ('30601061328756736', 1), ('30601504758964224', 1), ('31098415081332736', 1), ('31223664930193409', 1), ('31345998840209409', 1), ('31457280905973760', 1), ('31467720469905408', 1), ('31469968528769025', 1), ('31629044269191168', 1), ('31632894636068864', 1), ('31659605524226048', 1), ('31675229210152960', 1), ('31681152569835520', 1), ('31684419873935361', 1), ('31803439788855297', 1), ('31909514391322624', 1), ('31911131589120001', 1), ('31924949920784385', 1), ('31925408349814784', 1), ('31932133563961344', 1), ('31936128210702336', 1), ('31963355287650304', 1), ('319901113

## 构建Pivoted Length Normalization VSM 与 BM25相应的计算函数，每个函数中最后将根据query查询到的所有相关tweetID依据ranking funtion值降序排列

In [44]:
def Pivoted(Query):
    w=Query.split()
    score1={}
    label1=[]
    cwq={}
    fdql={}
    b=0.5
    z={}
    for i in w:
        if i not in cwq:
            cwq[i]=1
        else:
            cwq[i]+=1
    
    for i in w:
        score1[i]={}
    
    for i in w:
        for item in justindex[i]:
            if item in label1:
                fdql[item]+=cwq[i]*(math.log((1+math.log((inverted_index[i][justindex[i].index(item)][1]+1),math.e)),math.e))*math.log(len(text2)/len(inverted_index[i]))/(1-b+b*dl[justindex[i].index(item)]/avdl)
            else:
                label1.append(item)
                fdql[item]=cwq[i]*(math.log((1+math.log((inverted_index[i][justindex[i].index(item)][1]+1),math.e)),math.e))*math.log(len(text2)/len(inverted_index[i]))/(1-b+b*dl[justindex[i].index(item)]/avdl)
            score1[i][item]=fdql[item]
            z[item]=fdql[item]
        score1[i]=sorted(score1[i].items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将query中对应word的tweet的（tweetID，ranking fuction值）降序排列
    z=sorted(z.items(),key = lambda x:x[1],reverse=True) #按照计算所得函数值将（tweetID，ranking fuction值）降序排列
    return score1,z,len(z) #返回query中每个词的所有所在位置（tweetID），整个query所查找到的所有相关tweetID，所有相关tweetID的数量      
            
            
def BM25(Query):
    w=Query.split()
    score2={}
    label2=[]
    cwq={}
    fdql={}
    b=0.5
    k=2
    z={}
    for i in w:
        if i not in cwq:
            cwq[i]=1
        else:
            cwq[i]+=1
    
    for i in w:
        score2[i]={}
    
    for i in w:
        for item in justindex[i]:
            if item in label2:
                fdql[item]+=cwq[i]*(inverted_index[i][justindex[i].index(item)][1])*(k+1)*(math.log(len(text2)/len(inverted_index[i])))/(k*(1-b+b*dl[justindex[i].index(item)]/avdl)+(inverted_index[i][justindex[i].index(item)][1]))
            else:
                label2.append(item)
                fdql[item]=cwq[i]*(inverted_index[i][justindex[i].index(item)][1])*(k+1)*(math.log(len(text2)/len(inverted_index[i])))/(k*(1-b+b*dl[justindex[i].index(item)]/avdl)+(inverted_index[i][justindex[i].index(item)][1]))
            score2[i][item]=fdql[item]
            z[item]=fdql[item]
        score2[i]=sorted(score2[i].items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将query中对应word的tweet的（tweetID，ranking fuction值）降序排列
    z=sorted(z.items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将所有相关tweet的（tweetID，ranking fuction值）降序排列
    return score2,z,len(z) #返回query中每个词的所有所在位置（weetID），整个query所查找到的所有相关tweetID，所有相关tweetID的数量       
    


In [37]:
Pivoted('ron weasley birthday')

({'ron': [('307349038309732352', 12.273419462665586),
   ('307497638310326272', 5.800447988407496),
   ('307557365186699264', 5.475423298359763),
   ('307409448899588096', 4.923637084441025),
   ('307388573810819072', 4.472882044318558),
   ('307628148269404161', 4.472882044318558),
   ('307655801345024001', 4.472882044318558),
   ('307462808801513472', 4.459332493604074),
   ('307581469876944897', 4.459332493604074),
   ('33199661187600385', 4.277099522786482),
   ('307407506920067073', 4.277099522786482),
   ('307592941277433856', 4.277099522786482),
   ('307695965991735296', 4.277099522786482),
   ('307494886838530048', 4.097737455196561),
   ('307562306101981184', 4.097737455196561),
   ('307491908907855873', 3.947444769860326),
   ('299228458914037760', 3.9328131879276844),
   ('307364469183483906', 3.9328131879276844),
   ('307410191257841664', 3.9328131879276844),
   ('307488347935358976', 3.9328131879276844),
   ('307360182604820481', 3.8373229896794414),
   ('32921017177341953

In [42]:
BM25('australian fashion designers')

({'australian': [('30899786374709248', 70.8972005302958),
   ('623407150409224192', 11.623292235571743),
   ('623399718098354176', 10.351384414262803),
   ('33903004490928129', 9.917294512667368),
   ('32583035073335298', 9.71362198714862),
   ('299343068283428864', 9.71362198714862),
   ('30477678087770113', 9.002519011077727),
   ('299419735949455360', 8.834368725842735),
   ('299334390289424384', 7.944084522052474),
   ('299354288067592192', 7.944084522052474),
   ('299415768137871360', 7.563003391007437),
   ('30965883492900864', 7.462038407371762),
   ('30517355016622080', 7.443973198661583),
   ('30928608558059520', 7.443973198661583),
   ('31045240588603392', 7.233788137248426),
   ('31348236769820672', 6.6257765443820515),
   ('31673993819856896', 6.6257765443820515),
   ('297812583526846465', 6.6257765443820515),
   ('299382037570281473', 6.6257765443820515),
   ('31524647337988096', 6.44520004820871),
   ('31574200246341633', 6.44520004820871),
   ('31692185296441344', 6.4452