In [67]:
import CaboCha
import re

In [47]:
def parse():
    '''
    neko.txt.cabpchaを作る
    '''
    with open("./neko.txt") as f:
        with open("./neko.txt.cabocha","w") as outFile:
            cabocha = CaboCha.Parser()
            for line in f:
                outFile.write(cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE))
            

In [18]:
class Morph:
    '''
    形態素を表すクラス
    surface:表層系
    base:基本形
    pos:品詞
    po1:品詞再分類
    '''
    def __init__(self,surface,base,pos,pos1):
        self.surface=surface
        self.base=base
        self.pos=pos
        self.pos1=pos1
    
    def __str__(self):
        '''オブジェクトの文字列表現'''
        return 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'\
            .format(self.surface, self.base, self.pos, self.pos1)

    


# 40

In [42]:
def neco_lines():
    '''
    係り受け解析結果のジェネレータ
    '''
    with open("./neko.txt.cabocha") as f:
        morphs=[]
        for line in f:
            if line=="EOS\n":
                yield morphs
                morphs=[]
            else:
                if line[0]=='*':
                    continue
                else:
                    cols = line.split('\t')
                    res_cols=cols[1].split(',')
                    newM=Morph(cols[0],res_cols[6],res_cols[0],res_cols[1])
                    print("NewEntry")
                    print(newM)
                    morphs.append(
                       newM
                    )
    raise StopIteration

In [49]:
parse()

In [52]:
for i,morphs in enumerate(neco_lines(),1):
    print("i->",i)
    if i==10:
        for morph in morphs:
            print(morph)
        break

NewEntry
surface[一]	base[一]	pos[名詞]	pos1[数]
i-> 1
i-> 2
NewEntry
surface[　]	base[　]	pos[記号]	pos1[空白]
NewEntry
surface[吾輩]	base[吾輩]	pos[名詞]	pos1[代名詞]
NewEntry
surface[は]	base[は]	pos[助詞]	pos1[係助詞]
NewEntry
surface[猫]	base[猫]	pos[名詞]	pos1[一般]
NewEntry
surface[で]	base[だ]	pos[助動詞]	pos1[*]
NewEntry
surface[ある]	base[ある]	pos[助動詞]	pos1[*]
NewEntry
surface[。]	base[。]	pos[記号]	pos1[句点]
i-> 3
NewEntry
surface[名前]	base[名前]	pos[名詞]	pos1[一般]
NewEntry
surface[は]	base[は]	pos[助詞]	pos1[係助詞]
NewEntry
surface[まだ]	base[まだ]	pos[副詞]	pos1[助詞類接続]
NewEntry
surface[無い]	base[無い]	pos[形容詞]	pos1[自立]
NewEntry
surface[。]	base[。]	pos[記号]	pos1[句点]
i-> 4
i-> 5
NewEntry
surface[　]	base[　]	pos[記号]	pos1[空白]
NewEntry
surface[どこ]	base[どこ]	pos[名詞]	pos1[代名詞]
NewEntry
surface[で]	base[で]	pos[助詞]	pos1[格助詞]
NewEntry
surface[生れ]	base[生れる]	pos[動詞]	pos1[自立]
NewEntry
surface[た]	base[た]	pos[助動詞]	pos1[*]
NewEntry
surface[か]	base[か]	pos[助詞]	pos1[副助詞／並立助詞／終助詞]
NewEntry
surface[とんと]	base[とんと]	pos[副詞]	pos1[一般]
NewEntry
surface[見当]	base[見当]	pos

## 41

In [62]:
class Chunk:
    '''
    文節を表す
    morphs:Morphのkisut
    dst: かかり先文節のインデックス番号
    srcs:かかり元文節のインデックス番号のlist
    '''
    def __init__(self):
        self.morphs=[]
        self.srcs=[]
        self.dst=-1
    
    def __str__(self):
        
        surface = ''
        for morph in self.morphs:
            surface += morph.surface
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)


In [69]:
def chunk_lines():
    with open("./neko.txt.cabocha") as file:
        chunks=dict()
        idx=-1
        
        for line in file:
            if line=="EOS\n":
                if len(chunks)>0:
                    sorted_tuple = sorted(chunks.items(),key=lambda x:x[0])
                    yield list(zip(*sorted_tuple))[1]
                    chunks.clear()
                else:
                    yield []
                
            elif line[0]=="*":
                cols=line.split(' ')
                idx=int(cols[1])
                dst = int(re.search(r'(.*?)D',cols[2]).group(1))
                
                if idx not in chunks:
                    chunks[idx]=Chunk()
                chunks[idx].dst=dst
                
                if dst != -1:
                    if dst not in chunks:
                        chunks[dst]=Chunk()
                    chunks[dst].srcs.append(idx)
            else:
                cols = line.split("\t")
                res_cols=cols[1].split(",")
                
                chunks[idx].morphs.append(
                    Morph(
                        cols[0],
                        res_cols[6],
                        res_cols[0],
                        res_cols[1])
                )

In [65]:
parse()

In [71]:
for i,chunks in enumerate(chunk_lines(),1):
    if i==8:
        for j,chunk in enumerate(chunks):
            print("[{}]{}".format(j,chunk))
        break

[0]吾輩は	srcs[]	dst[5]
[1]ここで	srcs[]	dst[2]
[2]始めて	srcs[1]	dst[3]
[3]人間という	srcs[2]	dst[4]
[4]ものを	srcs[3]	dst[5]
[5]見た。	srcs[0, 4]	dst[-1]
