In [1]:
a = 10
print(a)

10


In [31]:
pip install janome

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 3.5 MB/s eta 0:00:011
[?25hInstalling collected packages: janome
Successfully installed janome-0.4.1
Note: you may need to restart the kernel to use updated packages.


# Charfilter

In [26]:
class CharacterFilter:
    @classmethod
    def filter(cls, text:str):
        raise NotImplementedError
class HtmlStripFilter(CharacterFilter):
    @classmethod
    def filter(cls,text:str):
        html_pattern = re.compile(r"<[^>]*?>")
        return html_pattern.sub("",text)
class LowercaseFilter(CharacterFilter):
    @classmethod
    def filter(cls,text:str):
        return text.lower()

 # Tokennizer

In [32]:
from janome.tokenizer import Tokenizer

tokennizer = Tokenizer()

class BaseTokenizer:
    @classmethod
    def tokenize(cls,text):
        raise NotImplementedError
class JanomeTokenizer(BaseTokenizer):
    @classmethod 
    def tokenize(cls,text):
        return (t for t in cls.tokenizer.tokenize(text))
class WhitespaceTokenizer(BaseTokenizer):
    @classmethod
    def tokenize(cls,text):
        return (t[0]for t in re.finditer(r"[^\t\r\n]+",text))

# TokenFilter

In [38]:
STOPWORDS = ("is","was","to","the")

def is_token_instance(token):
    return isinstance(token,Token)

class TokenFilter:
    @classmethod
    def filter(cls,token):
        """
        in: sting or janome.tokenizer.Token
        """
        raise NotImplementedError

class StopWordFilter(TokenFilter):
    @classmethod
    def filter(cls,token):
        if isinstance(token,Token):
            if token.surface in STOPWORDS:
                return None
        if token in STOPWORDS:
            return None
        return token

In [36]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [39]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

class Stemmer(TokenFilter):
    @classmethod
    def filter(cls,token: str):
        if token:
            return ps.stem(token)

In [40]:
class POSFilter(TokenFilter):
    """
    日本語の助詞/記号を除くフィルター
    """
    @classmethod
    def filter(cls,token):
        """
        in: janome token
        """
        stop_pos_list = ("助詞","副詞","記号")
        if any([token.part_of_speech.startswith(pos) for pos in stop_pos_list]):
            return None
        return token

# Analyzer

In [41]:
class Analyzer:
    tokenizer = None
    char_filters = []
    token_filters = []
    
    @classmethod
    def analyze(cls,text:str):
        text = cls._char_filter(text)
        tokens = cls.tokenizer.tokenize(text)
        filtered_token = (cls._token_filter(token) for token in tokens)
        return [parse_token(t) for t in filtered_token if t]
    
    @classmethod
    def _char_filter(cls,text):
        for char_filter in cls.char_filters:
            text = char_filter.filter(text)
        return text
    
    @classmethod
    def _token_filter(cls,token):
        for token_filter in cls.token_filters:
            token = token_filter.filter(token)
        return token

In [42]:
class JapaneseAnalyzer(Analyzer):
    tokenizer = JanomeTokenizer
    char_filters = [HtmlStripFilter,LowercaseFilter]
    token_filters = [StopWordFilter,POSFilter,Stemmer]
    
class EnglishAnalyzer(Analyzer):
    tokenizer = WhitespaceTokenizer
    char_filters = [HtmlStripFilter,LowercaseFilter]
    token_filters = [StopWordFilter,POSFilter,Stemmer]

In [46]:
def analyzed_query(parsed_query):
    return_val = []
    for q in parsed_query:
        if q in OPRS:
            return_val.append(q)
        else:
            analyzed_q = JapaneseAnalyzer.analyze(q)
            if analyzed_q:
                tmp = " OR ".join(analyzed_q)
                return_val += tmp.split(" ")
    return return_val

# indexer

In [43]:
class InvertedIndex:
    def __init__(
        self,token_id: int, token:str,postings_list=[],docs_count=0)->None:
        self.token_id = token_id
        self.token = token
        self.postings_list = []
        self.__hash_handle = {}
        self.docs_count = 0
        
def add_document(doc:str):
    """
    ドキュメントをデータベースに追加し転置インデックスを構築する
    """
    if not doc:
        return 
    # # 文書IDと文書内容を基にミニ転置インデックス作成
    text_to_postings_lists(doc)
    
    if len(TEMP_INVERT_INDEX) >=LIMIT:
        for inverted_index in TEMP_INVERT_INDEX.values():
            save_index(inverted_index)
            
def text_to_postings_lists(text)->list:
    tokens = JapaneseAnalyzer.analyze(text)
    token_count = len(tokens)
    document_id = save_document(text,token_count)
    
    cnt = Counter(tokens)
    for token, c in cnt.most_common():
        token_to_posting_list(token,document_id,c)

def text_to_posting_list(token:str,document:int,token_count:int):
    token_id = get_token_id(token)
    index = TEMP_INVERT_INDEX.get(token_id)
    if not index:
        index = InvertedIndex(token_id,token)
    
    posting = "{}:{}".format(str(document_id),str(token_count))
    index.add_posting(posting)
    
    TEMP_INVERT_INDEX[token_id] = index


# Searcher

In [44]:
def search_by_query(query):
    if not query:
        return []
    
    #parse
    parsed_query = tokenize(query)
    parsed_query = analyzed_query(parsed_queqry)
    rpn_tokens = parse_rpn(parsed_query)
    
    #merge
    doc_ids,query_postings = merge(rpn_tokens)
    print(doc_ids,query_postings)
    
    #fetch
    docs = [fetch_doc(doc_id) for doc_id in doc_ids]
    
    #sort
    sorted_docs = sort(docs,query_postings)
    return [_parse_doc(doc) for doc, _ in sorted_docs]

# Parser

In [47]:
import re

REGEX_PATTERN = r"\s*(\d+|\w+|.)"
SPLITTER = re.compile(REGEX_PATTERN)

LEFT = True
RIGHT = False

OPERATER = {"AND":(3,LEFT),"OR":(2,LEFT),"NOT":(1,RIGHT)}

def tokennize(text):
    return SPLITTER.findall(text)

def parse_rpn(tokens:list):
    

SyntaxError: unexpected EOF while parsing (153687985.py, line 15)

# Merge

In [48]:
def merge(tokens:list):
    target_posting = {}
    
    stack = []
    for token in tokens:
        if tokne not in OPRS:
            token_id = get_token_id(token)
            postings_list = fetch_postings_list(token_id)
            
            target_posting[token] = postings_list
            
            doc_ids = set([p[0] for p in postings_list])
            stack.append(doc_ids)
        
        else:
            if not stack:
                raise
            
            if len(stack) == 1:
                
                if token ==  "NOT":
                    
                    return not_doc_ids,{}
                else:
                    raise
            
            doc_ids1 = stack.pop()
            doc_ids2 = stack.pop()
            stack.append(merge_posting(token,doc_ids1,doc_ids2))
            
def sort(doc_ids,query_postings):
    docs = []
    all_docs = count_all_docs()
    for doc_id in doc_ids:
        doc = fetch_doc(doc_id)
        doc_tfidf = 0
        for token,postings_list in query_postings.items():
            idf = math.log10(all_docs/len(postings_list))+1
            posting = [p for p in postings_list if p[0] ==doc.id]
            if posting:
                tf = round(posting[0][1]/doc.token_count,2)
            else:
                tf = 0
                token_tfidf = tf*idf
                doc_tfidf += token_tfidf
            docs.append((doc,doc_tfidf))
        return sorted(docs,key = lambda x: x[1], reverse=True)