# TSV 형식 뉴스 기사 형태소 분석

In [1]:
# 형태소 분석 함수
import re
import ujson
from konlpy.tag import Komoran


def split_sentences(text):
    """Split text into a flat list of sentences.

    The text is stripped and broken into lines, blank (empty or
    whitespace-only) lines are discarded, and every remaining line is split
    after each '.', '?' or '!' that is followed by a single space.

    Args:
        text: The text to split; it may span several lines.

    Returns:
        A list of sentence strings in their original order. The sentence-ending
        punctuation stays attached to the sentence it terminates.
    """
    all_sentences = []

    for line in text.strip().splitlines():
        # strip() must be *called*: the bare method object is always truthy,
        # which would let blank lines through as empty "sentences".
        if not line.strip():
            continue
        # The lookbehind keeps the punctuation mark with its sentence and
        # consumes only the separating space.
        all_sentences.extend(re.split(r"(?<=[.?!]) ", line))

    return all_sentences


def get_morph_anal(analyzer, text):
    """Run part-of-speech analysis on every sentence of the given text.

    Args:
        analyzer: Object exposing a ``pos(sentence)`` method (``main`` passes a
            Komoran instance).
        text: Text that ``split_sentences`` divides into sentences.

    Returns:
        A list with one ``analyzer.pos`` result per sentence, in sentence
        order.
    """
    return [analyzer.pos(sentence) for sentence in split_sentences(text)]


def parse_line(line):
    """Split one TSV line into its six article fields.

    Leading and trailing whitespace (including the line terminator) is
    stripped from the line before it is split on tab characters.

    Args:
        line: A single line holding six tab-separated fields.

    Returns:
        The tuple ``(subject, article_id, title, body, date_time, url)``.

    Raises:
        ValueError: If the stripped line does not contain exactly six fields.
    """
    fields = line.strip().split("\t")
    # Unpacking enforces the six-column layout before anything is returned.
    subject, article_id, title, body, date_time, url = fields
    return subject, article_id, title, body, date_time, url

def write_ma_article(output_file, subject, article_id, title, body,
                     date_time, url, title_ma, body_ma):
    """Write one article, with its analysis results, as a single JSON line.

    Args:
        output_file: File object the JSON line is printed to.
        subject, article_id, title, body, date_time, url: Article fields,
            stored under keys of the same names.
        title_ma: Analysis result for the title, stored under "title_ma".
        body_ma: Analysis result for the body, stored under "body_ma".
    """
    # Keyword order fixes the key order of the serialized object.
    record = dict(
        subject=subject,
        article_id=article_id,
        title=title,
        body=body,
        date_time=date_time,
        url=url,
        title_ma=title_ma,
        body_ma=body_ma,
    )
    # ensure_ascii=False keeps non-ASCII text readable instead of escaping it.
    print(ujson.dumps(record, ensure_ascii=False), file=output_file)

def main():
    """Morphologically analyze every news article in the input TSV file.

    Reads "../data/news/news.txt" line by line, analyzes the title and body
    of each article with Komoran, and writes one JSON line per article to
    "../data/news/news.ma.txt".
    """
    komoran = Komoran()
    input_file_name = "../data/news/news.txt"
    output_file_name = "../data/news/news.ma.txt"

    with open(input_file_name, "r", encoding="utf-8") as input_file:
        with open(output_file_name, "w", encoding="utf-8") as output_file:
            for line in input_file:
                # Field order: subject, article_id, title, body, date_time, url
                article = parse_line(line)
                title, body = article[2], article[3]

                title_ma = get_morph_anal(komoran, title)
                body_ma = get_morph_anal(komoran, body)

                write_ma_article(output_file, *article, title_ma, body_ma)
            
# Run the analysis immediately when this script / notebook cell is executed.
main()         
            