# 倒排索引与布尔检索

## 载入基本所需模块

In [1]:

import re
import json

### 将每条推特文本存入数组
#### 这里的数组用于后来布尔检索时的对应的文本内容输出

In [2]:
d=open('tweets.txt','r+')
text1=[]
texts1=d.readlines()
for w in texts1:
    text1.append(json.loads(w)['text'])
print(text1[:3])

['House may kill Arizona-style immigration bill, Rep. Rick Rand says: The House is unlikely to pass the "Ari... http://tinyurl.com/4jrjcdz', "Mourners recall Sarge Shriver's charity, idealism \n    (AP): AP - R. Sargent Shriver was always an optimist, pio... http://bit.ly/gqMcdG", 'Bass Fishing Techniques: 2 Fantastic Tips To Improve Your Casting Skills']



## 建立倒排索引

### 文本去标点，分词
#### 去掉指定的一些标点，但保留完整的网站链接

In [3]:

text2=[]
for t in text1:
    line=re.sub("[&!#.,|()-]+",'',t)
    line=line.split()
    z=[]
    for word in line:
        lower=word.lower()
        z.append(lower)
    text2.append(z)   
print(text2[:10])
print(len(text2))
print(type(line))

[['house', 'may', 'kill', 'arizonastyle', 'immigration', 'bill', 'rep', 'rick', 'rand', 'says:', 'the', 'house', 'is', 'unlikely', 'to', 'pass', 'the', '"ari', 'http://tinyurlcom/4jrjcdz'], ['mourners', 'recall', 'sarge', "shriver's", 'charity', 'idealism', 'ap:', 'ap', 'r', 'sargent', 'shriver', 'was', 'always', 'an', 'optimist', 'pio', 'http://bitly/gqmcdg'], ['bass', 'fishing', 'techniques:', '2', 'fantastic', 'tips', 'to', 'improve', 'your', 'casting', 'skills'], ['financial', 'aid', 'proper', 'method', 'of', 'getting', 'financial', 'aid', 'for', 'education', 'http://pingfm/bk0r3', 'applyingforfinancialaid', 'financialaidessay'], ['supreme', 'court:', "nasa's", 'intrusive', 'background', 'checks', 'ok', 'http://bitly/h2jgy9'], ['the', 'mcdonalds', 'music', 'to', 'fireworks', 'is', 'an', 'all', 'time', 'low'], ['@alyce', 'very', 'sweet', 'and', 'quiet', 'if', 'not', 'polished', 'bono', 'hansard', 'at', 'sgt', "shriver's", 'funeral', '2day:', 'http://youtube/bf14xbbcvzg', 'when', 'wa

### 构建词袋

In [4]:
wordsbag={}
for text in text2:
    text=set(text)
    for word in text:
        word=word.lower()
        if word not in wordsbag:
            wordsbag[word]=1
        else:
            wordsbag[word]+=1
L = sorted(wordsbag.items(),key = lambda x:x[1],reverse=True) #按升序排序
print(type(L))
print(L[:10])

<class 'list'>
[('the', 9661), ('to', 7701), ('in', 5867), ('of', 5587), ('a', 5456), ('for', 4358), ('on', 4179), ('and', 4157), ('rt', 3669), ('is', 3203)]


In [5]:
vocab=[]
for m in range(len(L)):
    vocab.append(L[m][0])
print(vocab[:10])
print(len(L))

['the', 'to', 'in', 'of', 'a', 'for', 'on', 'and', 'rt', 'is']
67947


### 找到每个词出现过的位置

In [6]:
%%time
#初始化所需的倒排索引字典
inverted_index={}
for i in vocab:
    inverted_index[i]=[]
    
for t in text2:
    line=set(t)
    for word in line:
        inverted_index[word].append(text2.index(t))   

Wall time: 2min 12s


In [7]:
print(inverted_index['win'])#找到任意一个指定词的索引

[99, 155, 193, 238, 1050, 1094, 1202, 1301, 1469, 1545, 1574, 1586, 1625, 1734, 2189, 2213, 2264, 2265, 3132, 3284, 3399, 3535, 3548, 3551, 3672, 3678, 3720, 3754, 3767, 3771, 3906, 4055, 4066, 4091, 4094, 4104, 4113, 4152, 4179, 4236, 4239, 4256, 4275, 4548, 4831, 5320, 5395, 5977, 6089, 6706, 6718, 6787, 6980, 7143, 7232, 7415, 7471, 7849, 9637, 9648, 10997, 11054, 11438, 11054, 11988, 14992, 15023, 15024, 15025, 15028, 15029, 15065, 15079, 15285, 15289, 15442, 15486, 15506, 15519, 15523, 15525, 15530, 15545, 15564, 15589, 15589, 15598, 15600, 15610, 15613, 15620, 15622, 15623, 15631, 15633, 15646, 15656, 15662, 15662, 15668, 15694, 15703, 15708, 15718, 15742, 15759, 15764, 15787, 15790, 15799, 15816, 15822, 15830, 15864, 15872, 15874, 15875, 15879, 15925, 15926, 15931, 16002, 16043, 16049, 16056, 16082, 16102, 16104, 16112, 16314, 16314, 16320, 16320, 16324, 16338, 16338, 16340, 16340, 16342, 16342, 16348, 16348, 16359, 16359, 16618, 16618, 16952, 17180, 17181, 17791, 18428, 19052, 


# 构建简单布尔查询

### not 非运算

In [8]:
def B_Not(key):
    result=[]
    tweet=[]
    for i in range(inverted_index[key][-1]):
        if i not in inverted_index[key]:
            result.append(i)
    for j in range(inverted_index[key][-1],len(text2)):
        result.append(j)
    for n in result:
        tweet.append(text1[n])
    if len(tweet)>50:
        for i in tweet[:50]:
            print(i,'\n')
        return result[:50]
    else:
        for i in tweet:
            print(i,'\n')
        return result


### and 交集运算

In [9]:
def B_And(key1,key2):
    result=[]
    tweet=[]
    i=0
    j=0
    while(i<len(inverted_index[key1])and j<len(inverted_index[key2])):
        if inverted_index[key1][i]==inverted_index[key2][j]:
            result.append(inverted_index[key1][i])
            i+=1
            j+=1
        else:
            if inverted_index[key1][i]<inverted_index[key2][j]:
                i=i+1
            else:
                j=j+1
    for n in result:
        tweet.append(text1[n])
    if len(tweet)>50:
        for i in tweet[:50]:
            print(i,'\n')
        return result[:50]
    else:
        for i in tweet:
            print(i,'\n')
        return result

### or 并集运算

In [10]:
def B_Or(key1,key2):
    result=[]
    tweet=[]
    result.extend(inverted_index[key1][:])
    result.extend(inverted_index[key2][:])
    result=set(result)
    result=list(result)
    result.sort(reverse=False)
    for n in result:
        tweet.append(text1[n])
    if len(tweet)>50:
        for i in tweet[:50]:
            print(i,'\n')
        return result[:50]
    else:
        for i in tweet:
            print(i,'\n')
        return result

### 构建识别查询语句的函数
#### 这里将查询语句拆分，对识别到的关键查询词分别进行对应的操作

In [11]:
def search(Q):
    s=Q.split()
    for i in s:
        if 'and' in i:
             return B_And(s[0],s[2])
        elif 'or' in i:
             return B_Or(s[0],s[2])
        elif 'not' in i:
             return B_Not(s[1])

### 输入语句测试

In [12]:
search('city and appeal')

New York City mayor says confident will win appeal on soda ban http://t.co/FHM20PdMuu | Reuters #news 

City To Appeal Legal Block Of Ban On Large Sugary Drinks: A judge has blocked the Bloomberg administration's e... http://t.co/DIR21BdqJ8 

Judge stops NYC ban on large sugary drinks, city plans appeal - http://t.co/Jczg08UCbp http://t.co/kUL3z1UXsa 



[20200, 20212, 20451]

In [13]:
search('login or slept')

Piranha Fish Attack Photos: Leave a comment. Anonymous 2leep user. Login. Password. Don't have an account? Creat... http://bit.ly/eaZPo1 

Slept way too late for McDonalds breakfast 

Slept in, now watching kim and kourtney take ny and keeping up with the kardashians marathon. Thanks @KimKardashian for a gooood morning! 

@_FloridaMan  
Huge sinkhole swallows Florida man as he slept
http://t.co/PH2i5Yi9ql 

Evernote Resets Passwords After Attackers Steal Login Data - PC Magazine: Economic TimesEvernote R... http://t.co/fzaChsXK7H #Tech #News 

#Tech Evernote Resets Passwords After Attackers Steal Login Data - PC Magazine http://t.co/EigErMV7Du 



[253, 259, 1228, 17021, 17605, 17608]

In [14]:
search('not the')

Mourners recall Sarge Shriver's charity, idealism 
    (AP): AP - R. Sargent Shriver was always an optimist, pio... http://bit.ly/gqMcdG 

Bass Fishing Techniques: 2 Fantastic Tips To Improve Your Casting Skills 

#Financial Aid | Proper Method Of Getting Financial Aid For Education http://ping.fm/BK0R3 #applying-for-financial-aid financial-aid-essay # 

Supreme Court: NASA's intrusive background checks OK http://bit.ly/h2jgy9 

@alyce Very sweet and quiet, if not polished - Bono & Hansard at Sgt Shriver's funeral 2day: http://youtu.be/Bf14XBbcVZg (when was ...cont'd 

Hawaii Gov Waffles on Obama’s Birth Certificate – Patriot Update http://t.co/1UxYa0r via @AddThis 

Ive never retweeted myself but wanted to pass on to @atu2 RT @tommymcgregor: I Want Bono To Sing At My Funeral! http://bit.ly/i0KdEn 

Iran nuclear talks end with no agreement; US officials say six powers aligned - Washington Post: Fox News (blog)... http://bit.ly/e78uRg 

Are Jobs Really Obama's Focus?: More job losses an

[1,
 2,
 3,
 4,
 6,
 8,
 9,
 11,
 12,
 14,
 19,
 21,
 22,
 23,
 24,
 25,
 27,
 28,
 29,
 30,
 31,
 32,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 45,
 46,
 47,
 48,
 49,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 63,
 64,
 66,
 68,
 69]

#### 由于做的只是基本的布尔查询，所以虽然显示结果是准确的，但却无法完成附加更多条件的更精确的搜索，尤其是not运算，只是单单not一个词的话得到的满足条件的推特过多。在此无论何种查询最多只排出前50条，就实际情况的查询而言不具备太大的价值。这是本次实验最大的一个不足。