# 布尔查询与倒排索引
### 流程
- 数据预处理
- 建立到排索引
- 实现query的AND，NOT，OR逻辑
- 查询操作返回topK结果

In [13]:
import json
import nltk

### 数据预处理
- 解析json格式数据得到dict_list
- 去非关键性标点，换行符等，split
- 建立vocab（Tokenize）
- 保存结果数据
###### 注意这里的 text texts的顺序相同 vbocab 一个文档只记录一次

In [14]:
f = open('./data/tweets.txt','r+')
lines = f.readlines()
text = []
for l in lines:
    text.append(json.loads(l)['text'])

#### 去除特殊符号
- 保留完整网址

In [15]:
texts = []
symbol = [',',':','_','!','\"','*','>','<','@','~','-','(',')','%','=','\\','^'
          ,'&','|','#','$','[',']','+',':','#','|'] 
for l in text:
    for s in symbol:
        line = l.replace(s,'')
    line = line.split()
    texts.append(line)

#### 建立vocab

In [16]:
vocab = {}
for line in texts:
    line_set = set(line) ##去重
    for word in line_set:       
            if vocab.get(word) == None:
                vocab[word] = 1
            else :
                vocab[word] += 1
v_tuple = sorted(vocab.items(),key = lambda x:x[1],reverse=True) ##从大到小排序
vocab = {}
for t in v_tuple:
    vocab[t[0]] = t[1]

#### 保存

In [17]:
f = open('./data/vocab.txt','w+')
line_str = json.dumps(vocab)
f.write(line_str)
f.close()

### 建立到倒排索引

In [20]:
word2inverted_index = {} ##一个字典 存frequency 一个字典存 每个词出现的docID
keys = list(vocab.keys())
for k in keys:
    word2inverted_index[k] = []##初始化

for i,line in enumerate(texts):
    line_set = set(line) ##去重
    for word in line_set:
        word2inverted_index[word].append(i)

### 实现query的AND，NOT，OR逻辑

In [28]:
def AND_op(l1,l2):
    ##两个有序链表 返回 共同的 docID（交集）有序
    result = []
    i = 0
    j = 0
    while(i<len(l1) and j < len(l2)):
        if l1[i] == l2[j]:
            result.append(l1[i])
            i += 1
            j += 1
        else:
            if l1[i] < l2[j]:
                i += 1
            else:
                j += 1
    return result

In [36]:
def OR_op(l1,l2):
    ##两个有序链表 返回并集 docID 并且去重 有序
    result = []
    i = 0
    j = 0
    while(i<len(l1) and j < len(l2)):
        if l1[i] == l2[j]:
            result.append(l1[i])
            
            i += 1
            j += 1
        else:
            if l1[i] < l2[j]:
                result.append(l1[i])
                i += 1
            else:
                result.append(l2[j])
                j += 1
    result.extend(l1[i:])
    result.extend(l2[j:])
    ##注意这里注意剩余
    return result

In [56]:
def NOT_op(l):
    ##一个有序链表 返回补集 有序
    result = []
    j = 0
    for i in range(l[-1]):
        if l[j] == i:
            j += 1
        else:
            result.append(i)
    ##补全
    for i in range(l[j]+1,len(texts)):
        result.append(i)
    return result

### 布尔查询

#### 后缀表达式需要的堆栈

In [69]:
class Stack:

    def __init__(self):

        self.items = []

    def isEmpty(self):

        return self.items == []

    def push(self, item):

        self.items.append(item)

    def pop(self):

        return self.items.pop()

    def peek(self):
        if len(self.items) >0:
            return self.items[len(self.items)-1]

    def size(self):

        return len(self.items)    

In [89]:
k = 50 ##只取topk个返回
query = "Appel AND pen"

def boolean_search(Q):
    ##输入特定的query进行解析
    s_line = Q.split()
    l = []
    op = []
    n = 0##not 优先级最高
    a = 1##and 优先级次之
    o = 2##or 优先级最低
    ##数字和list
    for s in s_line:
        if s == 'NOT':
            op.append(n)
        elif s == 'AND':
            op.append(a)
        elif s== 'OR':
            op.append(o)
        else:
            op.append(s)
    
    stack_back= Stack() ##后缀表达式栈
    stack_op = Stack() ##符号栈
    for o in op:
        if isinstance(o,str):
            stack_back.push(o)
        else:
            top_op = stack_op.peek()
            if top_op==None or o > top_op:
                ##优先级高于栈顶或者符号栈为空
                stack_op.push(o)
            else:
                while top_op!=None and stack_op.size()!= 0 and o <= top_op:
                    stack_back.push(stack_op.pop())##不断加入后缀表达式栈
                stack_op.push(o)
    
    while stack_op.size() != 0:
        print(stack_op.size())
        stack_back.push(stack_op.pop())##剩余全部加入后缀
        
    print(stack_back.items)           
    print(stack_op.items)        
    return l
def 

In [92]:
boolean_search('APPLE AND NOT Apple OR APPLE')

2
1
['APPLE', 1, 'Apple', 'APPLE', 2, 0]
[]


[]