# 倒排索引与布尔检索——Pivoted Length Normalization VSM and BM25

## 载入基本所需模块

In [39]:
import math
import re
import json
import codecs

### 将每条推特文本存入数组
#### 这里的数组用于后来布尔检索时的对应的文本内容输出

In [2]:
d=open('tweets.txt','r+')
text1=[]
ID=[]
texts1=d.readlines()
for w in texts1:
    text1.append(json.loads(w)['text'])
    ID.append(json.loads(w)['tweetId'])
print(text1[:3])
print(ID[:20])

['House may kill Arizona-style immigration bill, Rep. Rick Rand says: The House is unlikely to pass the "Ari... http://tinyurl.com/4jrjcdz', "Mourners recall Sarge Shriver's charity, idealism \n    (AP): AP - R. Sargent Shriver was always an optimist, pio... http://bit.ly/gqMcdG", 'Bass Fishing Techniques: 2 Fantastic Tips To Improve Your Casting Skills']
['28965792812892160', '28967095878287360', '28967672074993664', '28967914417688576', '28968479176531969', '28968581949558787', '28969422056071169', '28971749961891840', '28973080491589632', '28974862038994945', '28974904342740992', '28976409057697792', '28976831738683393', '28977078074343425', '28977078074343425', '28977806142603264', '28978641706684416', '28978659108855809', '28978692289994752', '28978703102902273']



## 建立倒排索引

### 文本去标点，分词
#### 去掉指定的一些标点，但保留完整的网站链接

In [3]:

text2=[]
for t in text1:
    line=re.sub("[&!#.,|()-]+",'',t)
    line=line.split()
    z=[]
    for word in line:
        lower=word.lower()
        z.append(lower)
    text2.append(z)   
print(text2[:10])
print(len(text2))
print(type(line))

[['house', 'may', 'kill', 'arizonastyle', 'immigration', 'bill', 'rep', 'rick', 'rand', 'says:', 'the', 'house', 'is', 'unlikely', 'to', 'pass', 'the', '"ari', 'http://tinyurlcom/4jrjcdz'], ['mourners', 'recall', 'sarge', "shriver's", 'charity', 'idealism', 'ap:', 'ap', 'r', 'sargent', 'shriver', 'was', 'always', 'an', 'optimist', 'pio', 'http://bitly/gqmcdg'], ['bass', 'fishing', 'techniques:', '2', 'fantastic', 'tips', 'to', 'improve', 'your', 'casting', 'skills'], ['financial', 'aid', 'proper', 'method', 'of', 'getting', 'financial', 'aid', 'for', 'education', 'http://pingfm/bk0r3', 'applyingforfinancialaid', 'financialaidessay'], ['supreme', 'court:', "nasa's", 'intrusive', 'background', 'checks', 'ok', 'http://bitly/h2jgy9'], ['the', 'mcdonalds', 'music', 'to', 'fireworks', 'is', 'an', 'all', 'time', 'low'], ['@alyce', 'very', 'sweet', 'and', 'quiet', 'if', 'not', 'polished', 'bono', 'hansard', 'at', 'sgt', "shriver's", 'funeral', '2day:', 'http://youtube/bf14xbbcvzg', 'when', 'wa

### 构建词袋

In [4]:
wordsbag={}
for text in text2:
    text=set(text)
    for word in text:
        word=word.lower()
        if word not in wordsbag:
            wordsbag[word]=1
        else:
            wordsbag[word]+=1
L = sorted(wordsbag.items(),key = lambda x:x[1],reverse=True) #按升序排序
print(type(L))
print(L[:10])

<class 'list'>
[('the', 9661), ('to', 7701), ('in', 5867), ('of', 5587), ('a', 5456), ('for', 4358), ('on', 4179), ('and', 4157), ('rt', 3669), ('is', 3203)]


In [5]:
vocab=[]
for m in range(len(L)):
    vocab.append(L[m][0])
print(vocab[:10])
print(len(L))

['the', 'to', 'in', 'of', 'a', 'for', 'on', 'and', 'rt', 'is']
67947


In [6]:
dl=[]
for i in range(len(text2)):
    dl.append(len(text2[i]))
print(dl[:10])
avdl=sum(dl)/len(text2)
print(sum(dl))
print(avdl)

[19, 17, 11, 13, 8, 10, 19, 23, 13, 22]
453521
14.846176509100433


### 找到每个词出现过的位置（位置为所在的tweetID）以及该词分别在对应文档中的个数
### 这里创建了两个字典，一个既存词的所在位置也存词在该tweet中出现的次数，另一个只存词的所在位置。这么做是为了方便之后Pivoted Length Normalization VSM 以及 BM25 相应公式的计算


In [7]:
%%time
#初始化所需的倒排索引字典
inverted_index={}#既存词的所在位置也存词在该tweet中出现的次数
justindex={}#只存词的所在位置
for i in vocab:
    inverted_index[i]=[]
    justindex[i]=[]
    
for t in text2:
    number={}
    for word in t:
        if word not in number:
            number[word]=1
        else:
            number[word]+=1    
    line=set(t)
    for word in line:
        inverted_index[word].append((ID[text2.index(t)],number[word]))
        justindex[word].append(ID[text2.index(t)])

Wall time: 9min 1s


### 输入任意词进行测试

In [99]:
print(inverted_index['win'])#找到任意一个指定词所在的tweetID及其在对应tweet中的出现次数
print(justindex['win'][:20])

[('29072363442147329', 1), ('29123642625368064', 1), ('29168726137901056', 1), ('29208577998594048', 1), ('29771509908111361', 1), ('29852246200029184', 1), ('29917417236860928', 1), ('29956594770907137', 1), ('30037191400890368', 1), ('30091359255789568', 1), ('30101530644316160', 1), ('30110157539123200', 1), ('30141947922358272', 1), ('30229057178177536', 1), ('30544174625333249', 1), ('30566403673165824', 1), ('30601061328756736', 1), ('30601504758964224', 1), ('31098415081332736', 1), ('31223664930193409', 1), ('31345998840209409', 1), ('31457280905973760', 1), ('31467720469905408', 1), ('31469968528769025', 1), ('31629044269191168', 1), ('31632894636068864', 1), ('31659605524226048', 1), ('31675229210152960', 1), ('31681152569835520', 1), ('31684419873935361', 1), ('31803439788855297', 1), ('31909514391322624', 1), ('31911131589120001', 1), ('31924949920784385', 1), ('31925408349814784', 1), ('31932133563961344', 1), ('31936128210702336', 1), ('31963355287650304', 1), ('319901113

## 构建Pivoted Length Normalization VSM 与 BM25相应的计算函数，每个函数中最后将根据query查询到的所有相关tweetID依据ranking funtion值降序排列

In [94]:
def Pivoted(Query):
    w=Query.split()
        
    score1={}
    label1=[]
    cwq={}
    fdql={}
    b=0.5
    z={}
    for word in w:
        i=word.lower()  #将query中的每个词统一为小写（使输入无论大写小写都能得到相同的结果），下面代码同理
        if i not in vocab:
            w.remove(word) #如果query中存在所有tweet都不存在的单词，就将这个词移出,这样这些多余的查询词便不会影响到查询的结果，下面代码此处同理。
    
    for word in w:
        i=word.lower()
        if i not in cwq:
            cwq[i]=1
        else:
            cwq[i]+=1
    
    for word in w:
        i=word.lower()
        score1[i]={}
    
    for word in w:
        i=word.lower()
        for item in justindex[i]:
            if item in label1:
                fdql[item]+=cwq[i]*(math.log((1+math.log((inverted_index[i][justindex[i].index(item)][1]+1),math.e)),math.e))*math.log(len(text2)/len(inverted_index[i]))/(1-b+b*dl[justindex[i].index(item)]/avdl)
            else:
                label1.append(item)
                fdql[item]=cwq[i]*(math.log((1+math.log((inverted_index[i][justindex[i].index(item)][1]+1),math.e)),math.e))*math.log(len(text2)/len(inverted_index[i]))/(1-b+b*dl[justindex[i].index(item)]/avdl)
            score1[i][item]=fdql[item]
            z[item]=fdql[item]
        score1[i]=sorted(score1[i].items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将query中对应word的tweet的（tweetID，ranking fuction值）降序排列
    z=sorted(z.items(),key = lambda x:x[1],reverse=True) #按照计算所得函数值将（tweetID，ranking fuction值）降序排列
    return z #整个query所查找到的所有相关tweetID及对应的function值     
            
            
def BM25(Query):
    w=Query.split()
    score2={}
    label2=[]
    cwq={}
    fdql={}
    b=0.5
    k=2
    z={}
    for word in w:
        i=word.lower()
        if i not in vocab:
            w.remove(word)
            
    for item in w:
        i=item.lower()
        if i not in cwq:
            cwq[i]=1
        else:
            cwq[i]+=1
    
    for word in w:
        i=word.lower()
        score2[i]={}
    
    for word in w:
        i=word.lower()
        for item in justindex[i]:
            if item in label2:
                fdql[item]+=cwq[i]*(inverted_index[i][justindex[i].index(item)][1])*(k+1)*(math.log(len(text2)/len(inverted_index[i])))/(k*(1-b+b*dl[justindex[i].index(item)]/avdl)+(inverted_index[i][justindex[i].index(item)][1]))
            else:
                label2.append(item)
                fdql[item]=cwq[i]*(inverted_index[i][justindex[i].index(item)][1])*(k+1)*(math.log(len(text2)/len(inverted_index[i])))/(k*(1-b+b*dl[justindex[i].index(item)]/avdl)+(inverted_index[i][justindex[i].index(item)][1]))
            score2[i][item]=fdql[item]
            z[item]=fdql[item]
        score2[i]=sorted(score2[i].items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将query中对应word的tweet的（tweetID，ranking fuction值）降序排列
    z=sorted(z.items(),key = lambda x:x[1],reverse=True)#按照计算所得函数值将所有相关tweet的（tweetID，ranking fuction值）降序排列
    return z #整个query所查找到的所有相关tweetID及对应的function值       
    


### query测试 

In [98]:
Pivoted('Commentary on naming Storm Nemo')

[('625313390299234304', 477.9448924626409),
 ('625788743325122560', 36.65991747902494),
 ('625790899226812416', 18.32995873951247),
 ('625876605609603073', 17.08467140909998),
 ('625838844294791168', 16.097740480022335),
 ('625876928570896384', 15.173808956239908),
 ('623148777118007296', 11.71992333511445),
 ('299889720320720896', 11.513406166826009),
 ('626233998084280320', 11.363110927074588),
 ('622571762501451776', 11.091548137318638),
 ('625315617441218560', 10.998711805251459),
 ('299906824705028096', 10.9933892552836),
 ('625888630670626816', 10.412121558025163),
 ('32820448949444608', 9.877837052435845),
 ('626118176548519936', 9.785385567029337),
 ('299993193800413184', 8.764743767546872),
 ('299665757061652480', 8.758574247010984),
 ('300004723916951552', 8.517813593435362),
 ('299760346992877568', 8.405749939642341),
 ('299578574258720768', 8.351571643683872),
 ('300777138582327297', 8.263316259239323),
 ('623584925980184576', 8.066580463504463),
 ('626431784666791936', 7.8

In [97]:
BM25('muscle pain from statins')

[('625879210268057600', 36.73683713804207),
 ('623164329588838400', 23.06916275384538),
 ('625878685992665088', 20.185517409614707),
 ('306712133272477696', 17.192366328683615),
 ('303616678170857472', 16.811257596429705),
 ('311458067944468480', 13.160351156289355),
 ('626158106322579458', 12.245612379347357),
 ('33642490078691328', 12.221555220234606),
 ('625380046157426689', 11.29769446014931),
 ('30765413604265984', 10.39453355208829),
 ('307073900364308482', 10.301023749504902),
 ('304653875019931649', 9.541591842613393),
 ('308040586785615872', 9.171959859875116),
 ('312536457040523265', 8.709807371426304),
 ('313590837948866560', 8.709807371426304),
 ('305061393634304001', 8.495766975002667),
 ('316656064697356288', 8.097767503229317),
 ('311233077068300289', 8.097767503229317),
 ('314417275241390081', 8.097767503229317),
 ('623253634697023488', 7.83004499849358),
 ('29974270754947072', 7.496266683585069),
 ('32792717511630848', 7.404054980318101),
 ('301874825683689474', 7.4040

### 按照网站上的信息构建了querys.txt，储存编号为171-225共55个query，每一行的格式为（queryID，queryContent）
### 逐行读入并将所有query的内容放到一个list（querylist）中

In [88]:

querylist=[]
with open('querys.txt', "r",encoding="utf-16",errors='ignore') as q:
    queryline=q.readlines()
    for i in queryline:
        querylist.append(i[4:-1])
print(querylist)

['Ron Weasley birthday', 'Merging of US Air and American', 'muscle pain from statins', 'Hubble oldest star', '         commentary on naming storm Nemo', 'book club members', 'Boko Haram kidnapped French tourists', 'Tiger Woods regains title', 'care of Iditarod dogs', 'Sherlock Elementary BBC CBS', 'Costa Concordia shipwreck', 'Chinua Achebe death', 'Evernote hacked', 'Election of Hugo Chavez successor', 'National Zoo Panda insemination', 'Dorner truck compensation', 'Pope washed Muslims feet', 'Bombing police headquarters Kirkuk', 'injuries by pets', 'Organized crime sports doping Australia', 'Irish laundries apology', 'whooping cough epidemic', 'Bulgarian protesters self immolate', 'cherry blossom Washington', 'Argo wins Oscar', 'US fines Google over Street View ', 'Mad Men season 6', 'Hostess bought by Apollo', 'Ed Koch death', 'UK passes marriage bill', 'Higgs Boson discovery', 'Boko Haram Amnesty opposition', 'Eastern Australia Floods', 'Sotomayor prosecutor racial comments', 'Port

### 构建两个字典，字典的每个key为queryID，key下的item为该query所查询到的所有相关tweetID所组成的list，第一个字典存依据BM25查询所得的结果 ，第二个字典存Pivoted Length Normalization VSM的结果。

In [95]:
match={}
for i in range(len(querylist)):
    z=BM25(querylist[i])
    match[171+i]=[r[0] for r in z]

match_2={}
for i in range(len(querylist)):
    o=Pivoted(querylist[i])
    match_2[171+i]=[r[0] for r in o]

### 将依据两种方法查询所得的结果分别存入两个txt文件中  txt每一行为:(queryID    查询到的相应tweetID)

In [96]:
with open('result.txt','w+',encoding='utf-16') as f:
    for i in range(171,226):
        for j in range(len(match[i])):
            f.writelines((str(i),'\t',match[i][j],'\n'))

with open('result_2.txt','w+',encoding='utf-16') as f:
    for i in range(171,226):
        for j in range(len(match_2[i])):
            f.writelines((str(i),'\t',match_2[i][j],'\n'))
    
    
