## 导入库以及库的初始化

In [2]:
import json  
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
import re
from tqdm import tqdm 
# Shared WordNet lemmatizer instance used by the text-normalisation helpers
# (deal_text and deal_word).
lemmatizer = WordNetLemmatizer()
# Vocabulary taken from the NLTK `words` corpus. In this file it is only
# referenced by the vocabulary filter that is commented out in deal_text.
legal_words=words.words()

## 读取数据  
- 读取数据至 `tweets`（以列表形式存储），并将每条推文的 tweetId 存入 `all_tweets_id`

In [3]:
# Load the tweet corpus: the file holds one JSON object per line.
# `tweets` keeps the parsed records in file order; `all_tweets_id` keeps the
# matching integer tweet ids in the same order.
tweets=[]
all_tweets_id=[]
# The context manager closes the file even when a line fails to parse.
with open('./data/tweets.txt','r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        tweet=json.loads(line)
        tweets.append(tweet)
        all_tweets_id.append(int(tweet['tweetId']))

## 对文本进行预处理的函数

In [4]:
_STOP_WORDS=None  # lazily-built cache of the English stop-word set


def _english_stop_words():
    """Return the English stop-word set, building it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS=frozenset(stopwords.words('english'))
    return _STOP_WORDS


def deal_text(text:str) -> list:
    """Normalise raw text into a list of index terms.

    Steps, in order: lower-case the text, drop every character that is not
    a-z or whitespace, tokenise, remove English stop words, lemmatise.

    Args:
        text: Raw text (a tweet body or a user name).

    Returns:
        The lemmatised tokens in their original order; duplicates are kept.
    """
    text=text.lower()
    # Keep letters and whitespace only (digits and punctuation are removed).
    text=re.sub(r'[^a-z\s]', '', text)
    tokens=word_tokenize(text)
    # The stop-word set is cached: rebuilding it on every call is
    # loop-invariant work when this runs once per tweet field.
    stop_words=_english_stop_words()
    filterd_tokens=[word for word in tokens if word not in stop_words]
    lemmatizer_tokens=[lemmatizer.lemmatize(word) for word in filterd_tokens]
    # NOTE: restricting tokens to `legal_words` is intentionally not applied;
    # the original author disabled it because it made indexing very slow.
    return lemmatizer_tokens

## 生成标记序列

In [5]:
# Build the token sequence: one (term, tweet id) pair for every term found in
# a tweet's user name followed by its text.
sign_sequence=[]
for tweet in tqdm(tweets):
    tweet_id=int(tweet['tweetId'])  # convert once per tweet, not once per term
    # Named `terms` (not `words`) so the `nltk.corpus.words` import is not
    # overwritten at module level.
    terms=deal_text(tweet['userName'])+deal_text(tweet['text'])
    sign_sequence.extend((term,tweet_id) for term in terms)

100%|██████████| 30364/30364 [00:18<00:00, 1611.20it/s]


In [6]:
# Preview the first ten (term, tweet id) pairs.
sign_sequence[:10]

[('mariah', 28965792812892160),
 ('people', 28965792812892160),
 ('house', 28965792812892160),
 ('may', 28965792812892160),
 ('kill', 28965792812892160),
 ('arizonastyle', 28965792812892160),
 ('immigration', 28965792812892160),
 ('bill', 28965792812892160),
 ('rep', 28965792812892160),
 ('rick', 28965792812892160)]

## 排序  
- 按术语排列，然后按照文档ID排序  

In [7]:
# Drop duplicate (term, tweet id) pairs, then order them by term and, within a
# term, by tweet id -- which is the natural ordering of the tuples.
unique_sign_sequence=sorted(set(sign_sequence))

In [8]:
# Preview the first ten pairs after de-duplication and sorting.
unique_sign_sequence[:10]

[('aa', 32858841427214336),
 ('aa', 299786729190219776),
 ('aa', 302033886265892864),
 ('aa', 302047903625654273),
 ('aa', 302101154517643264),
 ('aa', 302187951440404482),
 ('aa', 302403580630011906),
 ('aa', 303810647982997504),
 ('aa', 306788066926944256),
 ('aa', 307509260751892480)]

## 字典和倒排索引

In [9]:
# Same preview again: the sorted pairs are the input to the index built next.
unique_sign_sequence[:10]

[('aa', 32858841427214336),
 ('aa', 299786729190219776),
 ('aa', 302033886265892864),
 ('aa', 302047903625654273),
 ('aa', 302101154517643264),
 ('aa', 302187951440404482),
 ('aa', 302403580630011906),
 ('aa', 303810647982997504),
 ('aa', 306788066926944256),
 ('aa', 307509260751892480)]

In [10]:
# Inverted index: term -> list of tweet ids (the postings list).
# Pairs are consumed in the order of `unique_sign_sequence`, so each postings
# list keeps that order (ascending tweet id when the sequence is sorted).
keyword_dict={}
for term,tweet_id in unique_sign_sequence:
    keyword_dict.setdefault(term,[]).append(tweet_id)

In [11]:
# Preview the first ten tweet ids in the postings list for 'dog'.
keyword_dict['dog'][:10]

[29255276477554689,
 29902841330012160,
 30022191554764802,
 31655260195921921,
 31811330562334720,
 32547752923635712,
 32579490752241664,
 33962846387707904,
 34397865757384705,
 34530272498163713]

## 布尔查询实现

### and

In [12]:
def op_and(keyword1,keyword2):
    """Return the tweet ids indexed under both keywords (intersection).

    Walks the two postings lists from ``keyword_dict`` in lockstep, which
    relies on both lists being sorted in ascending order.

    Args:
        keyword1: First (already normalised) term.
        keyword2: Second (already normalised) term.

    Returns:
        A new list of the common tweet ids in ascending order; empty when
        either keyword is not in the index.
    """
    if not (keyword1 in keyword_dict and keyword2 in keyword_dict):
        return []
    left=keyword_dict[keyword1]
    right=keyword_dict[keyword2]
    matches=[]
    i=j=0
    while i<len(left) and j<len(right):
        a,b=left[i],right[j]
        if a==b:
            matches.append(a)
            i+=1
            j+=1
        elif a<b:
            i+=1  # advance whichever side holds the smaller id
        else:
            j+=1
    return matches

In [13]:
#验证
# Sanity check for op_and: every returned id must appear in both postings
# lists. Prints 'False' once per offending id and nothing when all is well.
test=op_and('dog','cat')
dog=keyword_dict['dog']
cat=keyword_dict['cat']
# The loop variable is `tweet_id` rather than `id`, so the builtin `id` is not
# rebound at module level.
for tweet_id in test:
    if tweet_id not in cat or tweet_id not in dog:
        print('False')

### or

In [14]:
def op_or(keyword1,keyword2):
    """Return the tweet ids indexed under either keyword (union).

    Merges the two postings lists from ``keyword_dict``, which relies on both
    being sorted in ascending order. A keyword that is not in the index
    contributes an empty postings list instead of raising ``KeyError``.

    Args:
        keyword1: First (already normalised) term.
        keyword2: Second (already normalised) term.

    Returns:
        A new ascending list of tweet ids; an id found in both postings lists
        appears only once.
    """
    list1=keyword_dict.get(keyword1,[])
    list2=keyword_dict.get(keyword2,[])
    result=[]
    p1=0
    p2=0
    while p1<len(list1) and p2<len(list2):
        id1=list1[p1]
        id2=list2[p2]
        if id1==id2:
            # Same tweet on both sides: emit it once and advance both
            # pointers, otherwise the id would be appended twice.
            result.append(id1)
            p1+=1
            p2+=1
        elif id1<id2:
            result.append(id1)
            p1+=1
        else:
            result.append(id2)
            p2+=1
    # At most one of these tails is non-empty.
    result.extend(list1[p1:])
    result.extend(list2[p2:])
    return result

In [15]:
#验证
# Sanity check for op_or: the result must contain exactly the ids of both
# postings lists. The comparison is made on sets, so repeated ids inside
# `test` would not be detected by this check.
test=op_or('dog','cat')
dog=keyword_dict['dog']
cat=keyword_dict['cat']
set(test)==set(dog)|set(cat)

True

### not

In [16]:
def op_not(keyword):
    """Return the ids of all tweets that are not indexed under ``keyword``.

    Args:
        keyword: The (already normalised) term to exclude.

    Returns:
        A new list of tweet ids in the order of ``all_tweets_id``. When the
        keyword is not in the index this is a copy of ``all_tweets_id``, so
        callers can never mutate the global list through the result.
    """
    # A set makes each membership test O(1) instead of scanning the postings
    # list once per tweet.
    excluded=set(keyword_dict.get(keyword,()))
    return [tweet_id for tweet_id in all_tweets_id if tweet_id not in excluded]

In [17]:
#验证
# Sanity check for op_not: no returned id may appear in the 'dog' postings
# list. Prints 'False' once per offending id and nothing when all is well.
test=op_not('dog')
dog=keyword_dict['dog']
violations=[tweet_id for tweet_id in test if tweet_id in dog]
for _ in violations:
    print('False')

### and not

In [18]:
def op_and_not(keyword1,keyword2):
    """Return the tweet ids indexed under ``keyword1`` but not ``keyword2``.

    Scans both postings lists from ``keyword_dict`` in a single pass, which
    relies on both being sorted in ascending order.

    Args:
        keyword1: Term that must be present (already normalised).
        keyword2: Term that must be absent (already normalised).

    Returns:
        A new ascending list of tweet ids. Empty when ``keyword1`` is not in
        the index. When ``keyword2`` is not in the index nothing is excluded,
        so every id of ``keyword1`` is returned instead of raising
        ``KeyError``.
    """
    if keyword1 not in keyword_dict:
        return []
    list1=keyword_dict[keyword1]
    list2=keyword_dict.get(keyword2,[])
    result=[]
    p2=0
    for tweet_id in list1:
        # Skip excluded ids that are smaller than the current candidate.
        while p2<len(list2) and list2[p2]<tweet_id:
            p2+=1
        # Keep the candidate unless the exclusion list holds the same id.
        if p2==len(list2) or list2[p2]!=tweet_id:
            result.append(tweet_id)
    return result

In [19]:
#验证
# Sanity check for op_and_not: adding the 'cat' ids back to the result must
# give the same id set as 'dog' plus 'cat'. This does not by itself prove that
# `test` contains no 'cat' ids.
test=op_and_not('dog','cat')
dog=keyword_dict['dog']
cat=keyword_dict['cat']
set(test+cat)==set(dog+cat)

True

## 主程序

In [None]:
def deal_word(word:str):
    """Normalise a single query term.

    Lower-cases the term, removes every character outside a-z and returns the
    lemmatised result.
    """
    letters_only=re.sub(r'[^a-z]','',word.lower())
    return lemmatizer.lemmatize(letters_only)

In [21]:
def _evaluate_query(query):
    """Evaluate one boolean query string and return the matching tweet ids.

    Supported forms (operators are upper-case, terms are normalised with
    deal_word):
        NOT a
        a AND NOT b
        a AND b
        a OR b

    Returns:
        The list produced by the matching operator, or None when the query
        does not have one of the supported forms.
    """
    tokens=query.split()  # split() also copes with repeated spaces
    if len(tokens)>=2 and tokens[0]=='NOT':
        return op_not(deal_word(tokens[1]))
    # 'AND NOT' has to be tested before plain 'AND', otherwise it can never
    # match; the excluded term is the fourth token (index 3).
    if len(tokens)>=4 and tokens[1]=='AND' and tokens[2]=='NOT':
        return op_and_not(deal_word(tokens[0]),deal_word(tokens[3]))
    if len(tokens)>=3 and tokens[1]=='AND':
        return op_and(deal_word(tokens[0]),deal_word(tokens[2]))
    if len(tokens)>=3 and tokens[1]=='OR':
        return op_or(deal_word(tokens[0]),deal_word(tokens[2]))
    return None


# Interactive prompt: read queries until an empty line or end of input.
while True:
    try:
        query=input('>>')
    except EOFError:
        break
    if not query.strip():
        break  # an empty line ends the session instead of raising IndexError
    result=_evaluate_query(query)
    if result is None:
        print('Unsupported query. Use: NOT a | a AND b | a OR b | a AND NOT b')
    else:
        print(result)

['deal', 'fuck']
