# 5章 係り受け解析

## 40.係り受け解析結果の読み込み（形態素）

In [None]:
import CaboCha
import re
# Input text and the file that receives CaboCha's lattice-format output.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

# Run the dependency parser over the text one line at a time.
with open(fname) as data_file:
    with open(fname_parsed, 'w') as out_file:
        cabocha = CaboCha.Parser()
        for line in data_file:
            tree = cabocha.parse(line)
            out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

In [None]:
class Morph:
    """A single morpheme.

    Stores, exactly as passed in, the surface form, the base form, the part
    of speech (pos) and the part-of-speech subcategory 1 (pos1).
    """

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)

def neco_lines():
    """Yield the morphemes of the parsed text one sentence at a time.

    Reads the file named by the module-level ``fname_parsed``.  Every
    ``EOS`` line ends a sentence and yields the list of Morph objects
    collected since the previous ``EOS``.  Lines starting with ``*``
    (chunk headers) are skipped.

    Yields:
        list of Morph, one list per sentence (possibly empty).
    """
    with open(fname_parsed) as file_parsed:
        morphs = []
        for line in file_parsed:
            if line == 'EOS\n':
                yield morphs
                morphs = []
                continue
            if line[0] == '*':
                continue

            # The surface form precedes the tab; the comma-separated
            # features after it supply pos [0], pos1 [1] and base [6].
            cols = line.split('\t')
            res_cols = cols[1].split(',')
            morphs.append(Morph(
                cols[0], res_cols[6], res_cols[0], res_cols[1]))
    # Falling off the end finishes the generator.  An explicit
    # ``raise StopIteration`` here is converted to RuntimeError by PEP 479
    # (Python 3.7+), which broke any loop that consumed the whole file.

In [None]:
# Print the morphemes of the third sentence only, then stop reading.
for i, morphs in enumerate(neco_lines(), start=1):
    if i != 3:
        continue
    for morph in morphs:
        print(morph)
    break

## 41.係り受け解析結果の読み込み（文節・係り受け）

In [None]:
class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs, self.srcs = [], []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')
    
def neco_lines():
    """Yield the dependency-parse result one sentence at a time.

    Reads the file named by the module-level ``fname_parsed``.  Every
    ``EOS`` line ends a sentence: the Chunk objects collected so far are
    yielded as a tuple ordered by chunk index, or an empty list is yielded
    when no chunk was collected.

    Yields:
        tuple of Chunk (or ``[]`` for a sentence without chunks).
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1

        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            if line[0] == '*':
                # Chunk header: the second space-separated field is the
                # chunk index, the third is the destination followed by 'D'.
                cols = line.split(' ')
                idx = int(cols[1])
                dst = int(re.search(r'(.*?)D', cols[2]).group(1))

                if idx not in chunks:
                    chunks[idx] = Chunk()
                chunks[idx].dst = dst

                # The destination may not have been seen yet; create it
                # now so the back reference can be recorded.
                if dst != -1:
                    if dst not in chunks:
                        chunks[dst] = Chunk()
                    chunks[dst].srcs.append(idx)
                continue

            # Morpheme line: surface before the tab, features after it.
            cols = line.split('\t')
            res_cols = cols[1].split(',')
            chunks[idx].morphs.append(Morph(
                cols[0], res_cols[6], res_cols[0], res_cols[1]))
    # Falling off the end finishes the generator.  An explicit
    # ``raise StopIteration`` here is converted to RuntimeError by PEP 479
    # (Python 3.7+), which broke any loop that consumed the whole file.

In [None]:
# Print every chunk of the eighth sentence with its index, then stop.
for i, chunks in enumerate(neco_lines(), start=1):
    if i != 8:
        continue
    for j, chunk in enumerate(chunks):
        print('[{}]{}'.format(j, chunk))
    break

## 42.係り元と係り先の文節の表示

In [None]:
# Print "source<TAB>destination" for every chunk that has a destination,
# skipping pairs where either side is empty once symbols are removed.
for chunks in neco_lines():
    for chunk in chunks:
        if chunk.dst == -1:
            continue
        src = chunk.normalized_surface()
        dst = chunks[chunk.dst].normalized_surface()
        if src and dst:
            print('{}\t{}'.format(src, dst))

## 43.名詞を含む文節が動詞を含む文節に係るものを抽出

In [None]:
def check_V(morphs,hinsi):
    """Return True when any morpheme in morphs has part of speech hinsi."""
    return any(morph.pos == hinsi for morph in morphs)
    


# Print "source<TAB>destination" for chunks containing a noun that depend
# on a chunk containing a verb.
for chunks in neco_lines():
    for chunk in chunks:
        if chunk.dst == -1:
            continue
        target = chunks[chunk.dst]
        if not (check_V(chunk.morphs, '名詞') and check_V(target.morphs, '動詞')):
            continue
        src = chunk.normalized_surface()
        dst = target.normalized_surface()
        if src and dst:
            print('{}\t{}'.format(src, dst))

## 44.係り受け解析の可視化

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Scratch files for this cell: the sentence typed by the user and the
# CaboCha parse result produced from it.
fname = 'neko.txt.tmp'
fname_parsed = 'neko.txt.cabocha.tmp'


def parse_neko():
    '''Run CaboCha dependency parsing over the input text.

    Reads the file named by the module-level ``fname`` line by line and
    writes each line's lattice-format parse result to ``fname_parsed``.
    '''
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))


class Morph:
    '''
    Morpheme class.
    Holds the surface form (surface), base form (base), part of speech
    (pos) and part-of-speech subcategory 1 (pos1) as attributes.
    '''
    def __init__(self, surface, base, pos, pos1):
        '''Store the four fields as given.'''
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        '''Return the four fields as a labelled, tab-separated string.'''
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    '''
    Chunk (bunsetsu) class.
    Holds the list of morphemes (morphs, Morph objects), the index of the
    chunk this one depends on (dst) and the list of indexes of the chunks
    that depend on this one (srcs).
    '''

    def __init__(self):
        '''Start with no morphemes, no sources and dst = -1.'''
        self.morphs, self.srcs = [], []
        self.dst = -1

    def __str__(self):
        '''Return the surface text followed by srcs and dst.'''
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        '''Return the surface text without morphemes whose pos is '記号'.'''
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        '''Check whether the chunk contains the given part of speech.

        Returns:
        True when at least one morpheme has part of speech pos.
        '''
        return any(morph.pos == pos for morph in self.morphs)


def neco_lines():
    '''Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.

    Yields:
    The Chunk objects of one sentence.
    '''
    with open(fname_parsed) as file_parsed:

        chunks = {}     # chunk index -> Chunk
        idx = -1

        for line in file_parsed:

            # End of a sentence: hand back what was collected.
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            # Chunk header: read this chunk's index and its destination.
            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))

            # Create the chunk if needed and record where it points.
            chunks.setdefault(idx, Chunk()).dst = dst

            # Create the destination if needed and record the back link.
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)


def graph_from_edges_ex(edge_list, directed=False):
    '''Build a pydot graph from edges whose nodes have separate ids and labels.

    pydot_ng.graph_from_edges() uses one value as both the node identifier
    and the displayed label, so distinct nodes sharing a label cannot be
    told apart.  When drawing the dependencies of a sentence, for example,
    a word occurring several times would be merged into a single node.

    This function accepts tuples in the following format and treats nodes
    with different identifiers as different even if their labels match:

    edge_list = [((id1, label1), (id2, label2)), ...]

    The identifier only distinguishes nodes and is not displayed; the
    label is what is displayed.

    The node_prefix option of the original function is not implemented.

    Returns:
    The pydot.Dot object.
    '''
    graph = pydot.Dot(graph_type='digraph' if directed else 'graph')

    for edge in edge_list:
        head, tail = edge[0], edge[1]
        id1 = str(head[0])
        label1 = str(head[1])
        id2 = str(tail[0])
        label2 = str(tail[1])

        # Register both end points, then connect them.
        for node_id, label in ((id1, label1), (id2, label2)):
            graph.add_node(pydot.Node(node_id, label=label))
        graph.add_edge(pydot.Edge(id1, id2))

    return graph


# Ask for the sentence to analyse and store it verbatim in fname.
with open(fname, mode='w') as out_file:
    text = input('文字列を入力してください--> ')
    out_file.write(text)

# Dependency parsing.
parse_neko()

# Build the edge list sentence by sentence.
for chunks in neco_lines():

    edges = []
    for i, chunk in enumerate(chunks):
        # Only chunks that depend on another chunk produce an edge.
        if chunk.dst == -1:
            continue

        # Drop edges where either end is empty once symbols are removed.
        src = chunk.normalized_surface()
        dst = chunks[chunk.dst].normalized_surface()
        if src and dst:
            edges.append(((i, src), (chunk.dst, dst)))

    # Draw.
    if edges:
        graph = graph_from_edges_ex(edges, directed=True)
        graph.write_png('result.png')

## 45.動詞の格パターンの抽出

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Source text and the file that holds its CaboCha lattice-format parse.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

def parse_neko():
    """Parse ``fname`` with CaboCha and write the lattice output to ``fname_parsed``.

    Both file names are module-level variables; the input is processed one
    line at a time.
    """
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

class Morph:
    """A single morpheme: surface form, base form, pos and pos1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.srcs = []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        """Return True when any morpheme has part of speech pos."""
        return any(morph.pos == pos for morph in self.morphs)

    def get_morphs_by_pos(self, pos, pos1=''):
        """Return the morphemes matching pos and, when given, pos1.

        Args:
            pos: part of speech to match.
            pos1: subcategory to match as well; empty means "any".

        Returns:
            list of matching morphemes in their original order.
        """
        if pos1:
            # The subcategory must be compared with pos1; comparing pos
            # with both values could never match and always returned [].
            return [res for res in self.morphs
                    if res.pos == pos and res.pos1 == pos1]
        return [res for res in self.morphs if res.pos == pos]
    
def neco_lines():
    """Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1
        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))
            # Create the chunk if needed and record where it points.
            chunks.setdefault(idx, Chunk()).dst = dst
            # Create the destination if needed and record the back link.
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)

# Regenerate fname_parsed from fname; neco_lines() reads that file.
parse_neko()

In [None]:
# For every chunk containing a verb, print the base form of its first verb
# followed by the sorted particles of the chunks that depend on it.
for chunks in neco_lines():
    for chunk in chunks:
        verbs = chunk.get_morphs_by_pos('動詞')
        if not verbs:
            continue
        prts = []
        for src in chunk.srcs:
            prts_in_chunk = chunks[src].get_morphs_by_pos('助詞')
            if len(prts_in_chunk) > 1:
                # Prefer case particles when there are several particles,
                # but keep the full list when none of them is a case
                # particle; otherwise the chunk's particle would be lost.
                kaku_prts = chunks[src].get_morphs_by_pos('助詞', '格助詞')
                if kaku_prts:
                    prts_in_chunk = kaku_prts
            if prts_in_chunk:
                prts.append(prts_in_chunk[-1])
        if not prts:
            continue
        print(verbs[0].base, ' '.join(sorted(prt.surface for prt in prts)))

## 46.動詞の格フレーム情報の抽出

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Source text and the file that holds its CaboCha lattice-format parse.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

def parse_neko():
    """Parse ``fname`` with CaboCha and write the lattice output to ``fname_parsed``.

    Both file names are module-level variables; the input is processed one
    line at a time.
    """
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

class Morph:
    """A single morpheme: surface form, base form, pos and pos1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.srcs = []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        """Return True when any morpheme has part of speech pos."""
        return any(morph.pos == pos for morph in self.morphs)

    def get_morphs_by_pos(self, pos, pos1=''):
        """Return the morphemes matching pos and, when given, pos1.

        Args:
            pos: part of speech to match.
            pos1: subcategory to match as well; empty means "any".

        Returns:
            list of matching morphemes in their original order.
        """
        if pos1:
            # The subcategory must be compared with pos1; comparing pos
            # with both values could never match and always returned [].
            return [res for res in self.morphs
                    if res.pos == pos and res.pos1 == pos1]
        return [res for res in self.morphs if res.pos == pos]
    
def neco_lines():
    """Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1
        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            # Chunk header: this chunk's index and its destination.
            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))
            chunks.setdefault(idx, Chunk()).dst = dst
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)

# Regenerate fname_parsed from fname; neco_lines() reads that file.
parse_neko()

In [None]:
# For every chunk containing a verb, print the base form of its first verb
# with the sorted particles of its source chunks, then the surface text of
# those source chunks.
for chunks in neco_lines():
    for chunk in chunks:
        verbs = chunk.get_morphs_by_pos('動詞')
        if not verbs:
            continue
        prts = []
        frame = []
        for src in chunk.srcs:
            prts_in_chunk = chunks[src].get_morphs_by_pos('助詞')
            if len(prts_in_chunk) > 1:
                # Prefer case particles when there are several particles,
                # but keep the full list when none of them is a case
                # particle; otherwise the chunk would be dropped entirely.
                kaku_prts = chunks[src].get_morphs_by_pos('助詞', '格助詞')
                if kaku_prts:
                    prts_in_chunk = kaku_prts
            if prts_in_chunk:
                prts.append(prts_in_chunk[-1])
                frame.append(chunks[src].normalized_surface())
        if not prts:
            continue
        print(verbs[0].base, ' '.join(sorted(prt.surface for prt in prts)))
        print(":::", frame)

## 47.機能動詞構文のマイニング

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Source text and the file that holds its CaboCha lattice-format parse.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

def parse_neko():
    """Parse ``fname`` with CaboCha and write the lattice output to ``fname_parsed``.

    Both file names are module-level variables; the input is processed one
    line at a time.
    """
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

class Morph:
    """A single morpheme: surface form, base form, pos and pos1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.srcs = []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        """Return True when any morpheme has part of speech pos."""
        return any(morph.pos == pos for morph in self.morphs)

    def get_morphs_by_pos(self, pos, pos1=''):
        """Return the morphemes matching pos and, when given, pos1.

        Args:
            pos: part of speech to match.
            pos1: subcategory to match as well; empty means "any".

        Returns:
            list of matching morphemes in their original order.
        """
        if pos1:
            # The subcategory must be compared with pos1; comparing pos
            # with both values could never match and always returned [].
            return [res for res in self.morphs
                    if res.pos == pos and res.pos1 == pos1]
        return [res for res in self.morphs if res.pos == pos]

    def get_kaku_prt(self):
        """Return the surface of the chunk's particle, or '' if it has none.

        With several particles the last case particle ('格助詞') is
        preferred; when there is no case particle the last particle is used.
        """
        prts = self.get_morphs_by_pos('助詞')
        if len(prts) > 1:
            kaku_prts = self.get_morphs_by_pos('助詞', '格助詞')
            if kaku_prts:
                # Narrow the candidates to case particles.  The previous
                # code evaluated an expression here and discarded it.
                prts = kaku_prts
        return prts[-1].surface if prts else ''

    def get_sahen_wo(self):
        """Return the first "sahen-connection noun + を" pair, or ''.

        Looks for a morpheme with pos '名詞' and pos1 'サ変接続' directly
        followed by the particle 'を' and returns both surfaces joined.
        """
        for morph, following in zip(self.morphs, self.morphs[1:]):
            if (morph.pos == '名詞' and morph.pos1 == 'サ変接続'
                    and following.pos == "助詞" and following.surface == 'を'):
                return morph.surface + following.surface
        return ''
    
def neco_lines():
    """Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1
        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            # Chunk header: this chunk's index and its destination.
            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))
            chunks.setdefault(idx, Chunk()).dst = dst
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)

# Regenerate fname_parsed from fname; neco_lines() reads that file.
parse_neko()

In [None]:
# For every chunk holding "sahen-connection noun + を" that depends on a
# chunk with a verb, print the predicate, the particles of the verb's other
# source chunks, and the surface text of those chunks.
for chunks in neco_lines():
    for chunk in chunks:
        tmpret = chunk.get_sahen_wo()
        # A chunk without a destination has dst == -1; indexing with it
        # would silently select the last chunk, so skip it.
        if not tmpret or chunk.dst == -1:
            continue
        target = chunks[chunk.dst]
        verbs = target.get_morphs_by_pos('動詞')
        if not verbs:
            continue

        ret = tmpret + verbs[0].base
        ret2 = ''
        ret3 = ''
        for lis in target.srcs:
            tmp = chunks[lis].get_kaku_prt()
            surface = chunks[lis].normalized_surface()
            # Leave out chunks whose text is just the noun + を itself.
            if len(tmp) > 0 and tmpret != surface:
                ret2 += tmp + " "
                ret3 += surface + " "
        print(ret)
        print(ret2)
        print(ret3)
        print("--------------")

## 48.名詞から根へのパスの抽出

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Source text and the file that holds its CaboCha lattice-format parse.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

def parse_neko():
    """Parse ``fname`` with CaboCha and write the lattice output to ``fname_parsed``.

    Both file names are module-level variables; the input is processed one
    line at a time.
    """
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

class Morph:
    """A single morpheme: surface form, base form, pos and pos1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.srcs = []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        """Return True when any morpheme has part of speech pos."""
        return any(morph.pos == pos for morph in self.morphs)

    def get_morphs_by_pos(self, pos, pos1=''):
        """Return the morphemes matching pos and, when given, pos1.

        Args:
            pos: part of speech to match.
            pos1: subcategory to match as well; empty means "any".

        Returns:
            list of matching morphemes in their original order.
        """
        if pos1:
            # The subcategory must be compared with pos1; comparing pos
            # with both values could never match and always returned [].
            return [res for res in self.morphs
                    if res.pos == pos and res.pos1 == pos1]
        return [res for res in self.morphs if res.pos == pos]

    def get_kaku_prt(self):
        """Return the surface of the chunk's particle, or '' if it has none.

        With several particles the last case particle ('格助詞') is
        preferred; when there is no case particle the last particle is used.
        """
        prts = self.get_morphs_by_pos('助詞')
        if len(prts) > 1:
            kaku_prts = self.get_morphs_by_pos('助詞', '格助詞')
            if kaku_prts:
                # Narrow the candidates to case particles.  The previous
                # code evaluated an expression here and discarded it.
                prts = kaku_prts
        return prts[-1].surface if prts else ''

    def get_sahen_wo(self):
        """Return the first "sahen-connection noun + を" pair, or ''.

        Looks for a morpheme with pos '名詞' and pos1 'サ変接続' directly
        followed by the particle 'を' and returns both surfaces joined.
        """
        for morph, following in zip(self.morphs, self.morphs[1:]):
            if (morph.pos == '名詞' and morph.pos1 == 'サ変接続'
                    and following.pos == "助詞" and following.surface == 'を'):
                return morph.surface + following.surface
        return ''
    
def neco_lines():
    """Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1
        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            # Chunk header: this chunk's index and its destination.
            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))
            chunks.setdefault(idx, Chunk()).dst = dst
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)

# Regenerate fname_parsed from fname; neco_lines() reads that file.
parse_neko()

In [None]:
def meishi_pass(chunk,chunks,neko):
    """Print the dependency path from chunk to the root, joined by "->".

    Args:
        chunk: starting chunk.
        chunks: all chunks of the sentence, indexed by chunk number.
        neko: text placed in front of the path.

    Nothing is printed when a dst value does not name a chunk in chunks.
    """
    current = chunk
    path = neko
    while current.dst != -1:
        if not 0 <= current.dst < len(chunks):
            return
        path += current.normalized_surface() + "->"
        current = chunks[current.dst]
    print(path + current.normalized_surface())
        

# Print the path to the root for every chunk that contains a noun.
for chunks in neco_lines():
    noun_chunks = (chunk for chunk in chunks if chunk.chk_pos('名詞'))
    for chunk in noun_chunks:
        meishi_pass(chunk, chunks, "")

## 49.名詞間の係り受けパスの抽出

In [None]:
import CaboCha
import re
import pydot_ng as pydot

# Source text and the file that holds its CaboCha lattice-format parse.
fname = 'neko.txt'
fname_parsed = 'neko.txt.cabocha'

def parse_neko():
    """Parse ``fname`` with CaboCha and write the lattice output to ``fname_parsed``.

    Both file names are module-level variables; the input is processed one
    line at a time.
    """
    with open(fname) as data_file:
        with open(fname_parsed, mode='w') as out_file:
            cabocha = CaboCha.Parser()
            for line in data_file:
                tree = cabocha.parse(line)
                out_file.write(tree.toString(CaboCha.FORMAT_LATTICE))

class Morph:
    """A single morpheme: surface form, base form, pos and pos1."""

    def __init__(self, surface, base, pos, pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        """Return the four fields as a labelled, tab-separated string."""
        template = 'surface[{}]\tbase[{}]\tpos[{}]\tpos1[{}]'
        fields = (self.surface, self.base, self.pos, self.pos1)
        return template.format(*fields)


class Chunk:
    """A chunk (bunsetsu) with its dependency links.

    morphs is the list of Morph objects, dst the index of the chunk this
    one depends on (-1 until set) and srcs the indexes of the chunks that
    depend on this one.
    """

    def __init__(self):
        self.morphs = []
        self.srcs = []
        self.dst = -1

    def __str__(self):
        """Return the surface text followed by srcs and dst."""
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        """Return the surface text without morphemes whose pos is '記号'."""
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号')

    def chk_pos(self, pos):
        """Return True when any morpheme has part of speech pos."""
        return any(morph.pos == pos for morph in self.morphs)

    def get_morphs_by_pos(self, pos, pos1=''):
        """Return the morphemes matching pos and, when given, pos1.

        Args:
            pos: part of speech to match.
            pos1: subcategory to match as well; empty means "any".

        Returns:
            list of matching morphemes in their original order.
        """
        if pos1:
            # The subcategory must be compared with pos1; comparing pos
            # with both values could never match and always returned [].
            return [res for res in self.morphs
                    if res.pos == pos and res.pos1 == pos1]
        return [res for res in self.morphs if res.pos == pos]

    def get_kaku_prt(self):
        """Return the surface of the chunk's particle, or '' if it has none.

        With several particles the last case particle ('格助詞') is
        preferred; when there is no case particle the last particle is used.
        """
        prts = self.get_morphs_by_pos('助詞')
        if len(prts) > 1:
            kaku_prts = self.get_morphs_by_pos('助詞', '格助詞')
            if kaku_prts:
                # Narrow the candidates to case particles.  The previous
                # code evaluated an expression here and discarded it.
                prts = kaku_prts
        return prts[-1].surface if prts else ''

    def get_sahen_wo(self):
        """Return the first "sahen-connection noun + を" pair, or ''.

        Looks for a morpheme with pos '名詞' and pos1 'サ変接続' directly
        followed by the particle 'を' and returns both surfaces joined.
        """
        for morph, following in zip(self.morphs, self.morphs[1:]):
            if (morph.pos == '名詞' and morph.pos1 == 'サ変接続'
                    and following.pos == "助詞" and following.surface == 'を'):
                return morph.surface + following.surface
        return ''

    def noun_masked_surface(self, mask, dst=False):
        """Return the surface text with its nouns replaced by mask.

        Morphemes whose pos is '記号' are skipped.  The first noun is
        replaced by mask and any later noun by the empty string.

        Args:
            mask: replacement text for the first noun.
            dst: when true, stop at the first noun and return the text
                built so far, ending with mask.

        Returns:
            The masked surface text.
        """
        result = ''
        for morph in self.morphs:
            if morph.pos == '記号':
                continue
            if morph.pos != '名詞':
                result += morph.surface
                continue
            result += mask
            if dst:
                return result
            # Only the first noun shows the mask.
            mask = ''
        return result
    
def neco_lines():
    """Generate the dependency-parse result one sentence at a time.

    Reads the lattice-format file named by the module-level
    ``fname_parsed``.  Every ``EOS`` line ends a sentence: the Chunk
    objects collected so far are yielded as a tuple ordered by chunk
    index, or an empty list is yielded when no chunk was collected.
    """
    with open(fname_parsed) as file_parsed:
        chunks = {}     # chunk index -> Chunk
        idx = -1
        for line in file_parsed:
            if line == 'EOS\n':
                if not chunks:
                    yield []
                    continue
                yield tuple(chunks[key] for key in sorted(chunks))
                chunks.clear()
                continue

            # Any line not starting with '*' is a morpheme of the current
            # chunk: surface before the tab, comma-separated features after.
            if line[0] != '*':
                cols = line.split('\t')
                features = cols[1].split(',')
                chunks[idx].morphs.append(
                    Morph(cols[0], features[6], features[0], features[1]))
                continue

            # Chunk header: this chunk's index and its destination.
            cols = line.split(' ')
            idx = int(cols[1])
            dst = int(re.search(r'(.*?)D', cols[2]).group(1))
            chunks.setdefault(idx, Chunk()).dst = dst
            if dst != -1:
                chunks.setdefault(dst, Chunk()).srcs.append(idx)

# Regenerate fname_parsed from fname; neco_lines() reads that file.
parse_neko()

In [None]:
# Write the dependency path between every pair of noun chunks to ruu.txt.
with open("ruu.txt", mode='w') as out_file:
    for chunks in neco_lines():
        # Indexes of the chunks containing at least one noun.
        index_noun = [i for i, chunk in enumerate(chunks)
                      if chunk.get_morphs_by_pos('名詞')]
        if len(index_noun) < 2:
            continue

        # Every pair (X, Y) of noun chunks with X before Y.
        for i, index_x in enumerate(index_noun[:-1]):
            for index_y in index_noun[i + 1:]:
                meet_y = False      # does X's path reach Y?
                index_dup = -1      # chunk where the two paths join
                routes_x = set()    # chunk indexes on X's path

                # Follow X's destinations, stopping when Y is reached.
                # (This test was inverted, which flagged every pair whose
                # first hop was NOT Y and mis-routed the pairs where it was.)
                dst = chunks[index_x].dst
                while dst != -1:
                    if dst == index_y:
                        meet_y = True
                        break
                    routes_x.add(dst)
                    dst = chunks[dst].dst

                # Otherwise follow Y's destinations until X's path is hit.
                if not meet_y:
                    dst = chunks[index_y].dst
                    while dst != -1:
                        if dst in routes_x:
                            index_dup = dst
                            break
                        dst = chunks[dst].dst

                if index_dup == -1:
                    # Y lies on X's path: write X's path up to Y.
                    # NOTE(review): this branch joins with '->' while the
                    # branch below uses ' -> '; confirm which is intended.
                    out_file.write(chunks[index_x].noun_masked_surface('X'))
                    dst = chunks[index_x].dst
                    while dst != -1:
                        if dst == index_y:
                            out_file.write(
                                '->' + chunks[dst].noun_masked_surface('Y', True))
                            break
                        out_file.write('->' + chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write('\n')
                else:
                    # The paths join at index_dup: write X's path up to
                    # just before the join.
                    out_file.write(chunks[index_x].noun_masked_surface('X'))
                    dst = chunks[index_x].dst
                    while dst != index_dup:
                        out_file.write(' -> ' + chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write(' | ')
                    # Then Y's path up to just before the join.
                    out_file.write(chunks[index_y].noun_masked_surface('Y'))
                    dst = chunks[index_y].dst
                    while dst != index_dup:
                        out_file.write(' -> ' + chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write(' | ')
                    # Finally the chunk where they join.
                    out_file.write(chunks[index_dup].normalized_surface())
                    out_file.write('\n')