In [1]:
import CaboCha
import re
import pydot_ng as pydot
import graphviz

In [2]:
def parse():
    '''
    Run CaboCha over ./neko.txt and save the result as ./neko.txt.cabocha.

    Every input line is parsed on its own and written out in lattice format.
    '''
    with open("./neko.txt") as f, open("./neko.txt.cabocha","w") as outFile:
        cabocha = CaboCha.Parser()
        # Lazily parse line by line; writelines consumes the generator in order.
        outFile.writelines(
            cabocha.parse(line).toString(CaboCha.FORMAT_LATTICE) for line in f
        )
            

In [3]:
class Morph:
    '''
    A single morpheme.

    surface: surface form
    base: base form
    pos: part of speech
    pos1: part-of-speech subcategory
    '''
    def __init__(self,surface,base,pos,pos1):
        self.surface, self.base = surface, base
        self.pos, self.pos1 = pos, pos1

    def __str__(self):
        '''String form: tab-separated name[value] pairs for the four attributes.'''
        pairs = (
            ('surface', self.surface),
            ('base', self.base),
            ('pos', self.pos),
            ('pos1', self.pos1),
        )
        return '\t'.join('{}[{}]'.format(label, value) for label, value in pairs)

    


# 40

In [None]:
def neco_lines():
    '''
    Generator over the parse result stored in ./neko.txt.cabocha.

    Yields one list of Morph objects per sentence; a line consisting of "EOS"
    closes the current sentence (an empty list is yielded for an empty one).
    Chunk header lines (those starting with '*') are ignored here.
    '''
    with open("./neko.txt.cabocha") as f:
        morphs=[]
        for line in f:
            if line=="EOS\n":
                yield morphs
                morphs=[]
            elif line[0]=='*':
                # Dependency header line: carries no morpheme.
                continue
            else:
                # "<surface>\t<comma separated features>"
                cols = line.split('\t')
                res_cols=cols[1].split(',')
                # Feature 6 is used as the base form, features 0 and 1 as pos / pos1.
                morphs.append(Morph(cols[0],res_cols[6],res_cols[0],res_cols[1]))
    # The generator simply ends here. Raising StopIteration explicitly inside a
    # generator is turned into RuntimeError on Python 3.7+ (PEP 479).

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# Walk the sentences (counting from 1), logging each index, and dump the
# morphemes of the 10th sentence before stopping.
for i,morphs in enumerate(neco_lines(),1):
    print("i->",i)
    if i!=10:
        continue
    for morph in morphs:
        print(morph)
    break

## 41

In [None]:
class Chunk:
    '''
    A phrase chunk (bunsetsu) of a dependency parse.

    morphs: list of Morph objects that make up the chunk
    dst: index of the chunk this one depends on (-1 when there is none)
    srcs: list of indices of the chunks that depend on this one
    '''
    def __init__(self):
        self.morphs=[]
        self.srcs=[]
        self.dst=-1

    def __str__(self):
        '''Concatenated surface followed by the srcs list and the dst index.'''
        surface = ''.join(morph.surface for morph in self.morphs)
        return '{}\tsrcs{}\tdst[{}]'.format(surface, self.srcs, self.dst)

    def normalized_surface(self):
        '''Return the chunk surface with symbol (記号) morphemes removed.'''
        return ''.join(
            morph.surface for morph in self.morphs if morph.pos != '記号'
        )

    def contain_pos(self,searchPos):
        '''
        Tell whether the chunk contains a morpheme whose pos equals searchPos.
        ret:bool
        '''
        return any(morph.pos==searchPos for morph in self.morphs)

    def get_pos_in_morphs(self,pos,pos1=""):
        '''
        Return the morphemes whose pos matches, in chunk order.
        When pos1 is a non-empty string the morpheme's pos1 must match as well.
        '''
        if len(pos1)>0:
            return [res for res in self.morphs if res.pos==pos and res.pos1==pos1]
        return [res for res in self.morphs if res.pos==pos]

    def get_kaku_prt(self):
        '''
        Return the surface of the chunk's particle, or '' when it has none.

        With several particles, case particles (格助詞) are preferred when any
        exist; the last candidate is returned.
        '''
        prts = self.get_pos_in_morphs('助詞')
        if len(prts)>1:
            kaku_prts=self.get_pos_in_morphs('助詞','格助詞')
            if len(kaku_prts)>0:
                prts = kaku_prts
        if len(prts)>0:
            return prts[-1].surface
        return ''

    def get_sahen_wo(self):
        '''
        Return "<sahen noun>を" when the chunk contains a サ変接続 noun directly
        followed by the particle を, otherwise ''.
        '''
        for i,morph in enumerate(self.morphs[0:-1]):
            following = self.morphs[i+1]
            if morph.pos=="名詞" \
                and morph.pos1=="サ変接続" \
                and following.pos =="助詞" \
                and following.surface=="を":
                    # Only the leftmost match is wanted, so return right away.
                    return morph.surface+following.surface
        return ''

    def noun_masked_surface(self,mask,dst=False):
        '''
        Return the surface with the nouns replaced by mask.

        The first noun becomes mask and any further nouns are removed, so a
        compound noun collapses into a single mask. Symbols are dropped and all
        other morphemes keep their surface. With dst=True the text is cut right
        after the first noun's mask.
        '''
        result =''
        for morph in self.morphs:
            if morph.pos == '記号':
                continue
            if morph.pos=="名詞":
                result+=mask
                if dst:
                    return result
                # Later nouns contribute nothing.
                mask=''
            else:
                result += morph.surface
        return result
        
        

In [None]:
def chunk_lines():
    '''
    Generator over ./neko.txt.cabocha yielding the chunks of one sentence at a time.

    For a sentence with chunks, a tuple of Chunk objects ordered by chunk index
    is yielded; for an empty sentence an empty list is yielded.
    '''
    with open("./neko.txt.cabocha") as file:
        chunks={}
        idx=-1

        for line in file:
            if line=="EOS\n":
                # End of sentence: hand over what was collected and start afresh.
                if chunks:
                    yield tuple(chunks[key] for key in sorted(chunks))
                    chunks.clear()
                else:
                    yield []
                continue

            if line[0]=="*":
                # Header line: field 1 is the chunk index, field 2 is "<dst>D".
                header=line.split(' ')
                idx=int(header[1])
                dst=int(re.search(r'(.*?)D',header[2]).group(1))

                # The chunk may already exist if an earlier chunk pointed at it.
                chunks.setdefault(idx,Chunk()).dst=dst

                if dst != -1:
                    # Register this chunk as a source of its destination.
                    chunks.setdefault(dst,Chunk()).srcs.append(idx)
                continue

            # Morpheme line: "<surface>\t<comma separated features>"
            fields=line.split("\t")
            features=fields[1].split(",")
            chunks[idx].morphs.append(
                Morph(fields[0],features[6],features[0],features[1])
            )

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# Print every chunk of the 8th sentence (counting from 1), prefixed with its
# position, then stop reading.
for i,chunks in enumerate(chunk_lines(),1):
    if i!=8:
        continue
    for j,chunk in enumerate(chunks):
        print("[{}]{}".format(j,chunk))
    break

## 42

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# Print "source<TAB>destination" for every dependency, symbols stripped.
for chunks in chunk_lines():
    for chunk in chunks:
        if chunk.dst == -1:
            # This chunk does not depend on anything.
            continue
        src = chunk.normalized_surface()
        dst = chunks[chunk.dst].normalized_surface()
        if src == '' or dst == '':
            # Either side consisted of symbols only.
            continue
        print("{}\t{}".format(src,dst))

# 43

In [None]:
# Print the dependencies that go from a chunk containing a noun to a chunk
# containing a verb.
for chunks in chunk_lines():
    for chunk in chunks:
        if chunk.dst == -1:
            continue
        src = chunk
        dst = chunks[chunk.dst]
        if not (src.contain_pos("名詞") and dst.contain_pos("動詞")):
            continue
        src_str=src.normalized_surface()
        dst_str=dst.normalized_surface()
        if src_str:
            print(src_str+"\t"+dst_str)
            

# 44

In [None]:
def createGraph(edges,directed=False):
    '''
    Build a pydot graph of a sentence's dependency tree.

    edges: iterable of pairs (a, b); a and b are each indexable as
           [node id, node label] and every pair becomes one edge a -> b.
    directed: build a "digraph" when true, a plain "graph" otherwise.
    Returns the pydot.Dot object.
    '''
    graph = pydot.Dot(graph_type="digraph" if directed else "graph")

    for src,dst in edges:
        src_id,dst_id = str(src[0]),str(dst[0])
        src_label,dst_label = str(src[1]),str(dst[1])

        # Nodes are keyed by id and displayed with their label.
        for node_id,node_label in ((src_id,src_label),(dst_id,dst_label)):
            graph.add_node(pydot.Node(node_id,label=node_label))

        graph.add_edge(pydot.Edge(src_id,dst_id))

    return graph
        
    

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
edges=[]
def createEdges(min_edges=8):
    '''
    Return the dependency edges of the first sentence that has more than
    min_edges of them.

    Each edge is [[src index, src surface], [dst index, dst surface]] with
    symbols stripped from the surfaces; edges with an empty side are skipped.
    Returns an empty list (rather than None) when no sentence qualifies, so
    callers can always take len() of the result.
    '''
    for chunks in chunk_lines():
        edges=[]
        for i,chunk in enumerate(chunks):
            if chunk.dst == -1:
                continue
            src = chunk.normalized_surface()
            dst = chunks[chunk.dst].normalized_surface()
            if src!='' and dst!= '':
                edges.append([[i,src],[chunk.dst,dst]])
        if len(edges)>min_edges:
            return edges
    return []
        
    

In [None]:
# Collect the edges of the sentence chosen by createEdges() and echo them as
# the cell output.
edges=createEdges()
edges

In [None]:
# Render the dependency tree to ./result.png. A plain truthiness test is used
# so that a missing (None) or empty edge list takes the fallback branch instead
# of failing inside len().
if edges:
    graph = createGraph(edges,directed=True)
    graph.write_png("./result.png")
else:
    print("a")

## 45

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# For every chunk containing a verb, write "<base form of its first verb>\t
# <sorted, space separated particles of the chunks depending on it>".
with open("res_45.txt","w") as outFile:
    for chunks in chunk_lines():
        for chunk in chunks:
            verbs = chunk.get_pos_in_morphs("動詞")
            if len(verbs)==0:
                # No verb in this chunk.
                continue
            # Collect one particle per source chunk. get_kaku_prt() prefers
            # case particles (格助詞) when a chunk holds several particles; the
            # previous inline copy of that logic compared against the
            # misspelled "各助詞" and therefore never found any.
            prts=[]
            for src in chunk.srcs:
                prt = chunks[src].get_kaku_prt()
                if prt:
                    prts.append(prt)
            if len(prts)<1:
                continue
            outFile.write("{}\t{}\n".format(verbs[0].base,' '.join(sorted(prts))))
                
            

In [None]:
! head res_45.txt

## Unix　確認

In [None]:
! ls 

In [None]:
! sort res_45.txt | uniq -c | sort --numeric-sort --reverse

In [None]:
! grep "^する\s" res_45.txt | sort | uniq -c | sort --numeric-sort --reverse

# 46

In [None]:
# For every chunk containing a verb, write the verb's base form, the particles
# of its source chunks and the surfaces of those chunks (tab separated, the two
# lists ordered by particle).
with open("res_46.txt","w") as outFile:
    for chunks in chunk_lines():
        for chunk in chunks:
            verbs = chunk.get_pos_in_morphs("動詞")
            if len(verbs)==0:
                # No verb in this chunk.
                continue
            # Keep only the source chunks that actually carry a particle.
            chunks_include_prt=[]
            for src in chunk.srcs:
                if len(chunks[src].get_kaku_prt())>0:
                    chunks_include_prt.append(chunks[src])
            # (was a NameError: the list was referred to as chunk_include_prt)
            if len(chunks_include_prt)<1:
                continue
            chunks_include_prt = sorted(chunks_include_prt,key=lambda x:x.get_kaku_prt())

            outFile.write('{}\t{}\t{}\n'.format(
                verbs[0].base,
                ' '.join([src_chunk.get_kaku_prt() for src_chunk in chunks_include_prt]),
                ' '.join([src_chunk.normalized_surface() for src_chunk in chunks_include_prt])
            ))
            

In [None]:
! head res_46.txt

# 47

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# For verbs that take a "<sahen noun>を" source chunk, write the combined
# predicate followed by the particles and surfaces of the remaining sources.
with open("res_47.txt","w") as out_file:
    for chunks in chunk_lines():
        for chunk in chunks:
            verbs = chunk.get_pos_in_morphs("動詞")
            if not verbs:
                continue

            # Source chunks that carry a particle.
            chunks_include_prt = [
                chunks[src] for src in chunk.srcs
                if chunks[src].get_kaku_prt() != ''
            ]
            if not chunks_include_prt:
                continue

            # Find the first source chunk holding "<sahen noun>を".
            sahen_wo=''
            for chunk_src in chunks_include_prt:
                sahen_wo = chunk_src.get_sahen_wo()
                if sahen_wo != '':
                    chunk_remove = chunk_src
                    break
            if sahen_wo == '':
                continue

            # That chunk becomes part of the predicate, so it is not listed
            # among the arguments.
            chunks_include_prt.remove(chunk_remove)
            chunks_include_prt.sort(key=lambda x:x.get_kaku_prt())

            particles = ' '.join([c.get_kaku_prt() for c in chunks_include_prt])
            surfaces = ' '.join([c.normalized_surface() for c in chunks_include_prt])
            out_file.write("{}\t{}\t{}\n".format(
                sahen_wo+verbs[0].base, particles, surfaces))
                

In [None]:
! head res_47.txt

# 48

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# For every chunk containing a noun, write the chain of chunks reached by
# following dst links until a chunk with dst == -1, one path per line.
with open("res_48.txt","w") as out_file:
    for chunks in chunk_lines():
        for chunk in chunks:
            if not chunk.get_pos_in_morphs('名詞'):
                continue
            path = [chunk.normalized_surface()]
            dst = chunk.dst
            while dst != -1:
                path.append(chunks[dst].normalized_surface())
                dst = chunks[dst].dst
            out_file.write(' ->'.join(path) + "\n")

In [None]:
! head res_48.txt

# 49

In [None]:
# Regenerate ./neko.txt.cabocha from ./neko.txt before the next cell reads it.
parse()

In [None]:
# For every pair of noun chunks (x before y) in a sentence, write the
# dependency path between them with the nouns masked as X and Y.
#   - y lies on x's path towards the root:  X ... ->Y
#   - otherwise, with k the first chunk where both paths meet:
#       <x up to k> | <y up to k> | <k>
with open("res_49.txt","w") as out_file:
    for chunks in chunk_lines():
        # Indices of the chunks that contain at least one noun.
        indexs_noun = [i for i in range(len(chunks)) if len(chunks[i].get_pos_in_morphs('名詞'))>0]

        if len(indexs_noun)<2:
            continue

        for i,index_x in enumerate(indexs_noun[:-1]):
            for index_y in indexs_noun[i+1:]:
                meet_y=False
                index_dup=-1
                routes_x=set()

                # Follow x towards the root, remembering every chunk passed,
                # and stop early if y itself is reached.
                dst = chunks[index_x].dst
                while dst!=-1:
                    if dst==index_y:
                        meet_y=True
                        break
                    routes_x.add(dst)
                    dst = chunks[dst].dst

                # y is not on x's path: find the first chunk on y's path that
                # x also passed through.
                if not meet_y:
                    dst = chunks[index_y].dst
                    while dst != -1:
                        if dst in routes_x:
                            index_dup = dst
                            break
                        else:
                            dst = chunks[dst].dst

                # Both output shapes start with the masked x chunk.
                out_file.write(chunks[index_x].noun_masked_surface("X"))

                if index_dup==-1:
                    # No meeting chunk was recorded: write x's path, ending at
                    # y when y is on it.
                    dst = chunks[index_x].dst
                    while dst != -1:
                        if dst == index_y:
                            out_file.write(" ->" + chunks[dst].noun_masked_surface('Y',True))
                            break
                        else:
                            out_file.write(" ->" +chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write("\n")

                else:
                    # x's path up to (excluding) the meeting chunk.
                    dst = chunks[index_x].dst
                    while dst != index_dup:
                        out_file.write(" ->" + chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write(" | ")

                    # y's path up to (excluding) the meeting chunk. The arrow
                    # is " ->" like every other arrow written by this notebook
                    # (it used to be " -> " on this one line only).
                    out_file.write(chunks[index_y].noun_masked_surface('Y'))
                    dst = chunks[index_y].dst
                    while dst != index_dup:
                        out_file.write(" ->"+chunks[dst].normalized_surface())
                        dst = chunks[dst].dst
                    out_file.write(" | ")

                    # The meeting chunk itself.
                    out_file.write(chunks[index_dup].normalized_surface())
                    out_file.write("\n")
                    
        

In [None]:
! head res_49.txt