In [1]:
from collections import Counter
from datetime import timedelta, datetime
import glob
from itertools import chain
import json
import os
import re

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.plotting import register_matplotlib_converters
import seaborn as sns

In [3]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Plot styling shared by the rest of the notebook.
# NOTE(review): hardcoded absolute path to a NanumGothic TTF file; this cell
# depends on that file being present on the machine. Presumably chosen so that
# Korean text renders in figures -- confirm.
font_path = '/usr/share/fonts/NanumFont/NanumGothic.ttf'
# Only the font's name is taken from this FontProperties object; the plot font
# size is the size=12 passed to plt.rc below.
font_name = fm.FontProperties(fname=font_path, size=10).get_name()
# Use that font name and size 12 as the default for all text.
plt.rc('font', family=font_name, size=12)
# Default figure size (width, height).
plt.rcParams["figure.figsize"] = (20, 10)
# Imported from pandas.plotting above; called once here before any plotting.
register_matplotlib_converters()

In [4]:
# Root of the dataset resources, relative to the notebook's working directory.
# Later cells append 'contents/data.N' to it, so the trailing slash is required.
directory = './brunch-article-recommendation/res/'

In [5]:
def iterparse(file_obj):
    """Lazily yield JSON values from a line-oriented text stream.

    Lines are stripped and accumulated until the buffer decodes as a JSON
    value, so a value may span several lines. If a line completes a value and
    contains further complete values after it (e.g. '{"a": 1} {"b": 2}'),
    those are yielded too instead of being silently dropped.

    Parameters
    ----------
    file_obj : iterable of str
        Typically an open text file; any iterable of lines works.

    Yields
    ------
    object
        Each decoded JSON value, in stream order.

    Notes
    -----
    Parsing is best-effort: text left on a line after the last decodable value
    is discarded, and an incomplete value at end of input is dropped without
    raising.
    """
    decoder = json.JSONDecoder()
    buf = ""
    for line in file_obj:
        buf += line.strip()
        try:
            obj, end = decoder.raw_decode(buf)
        except ValueError:
            # Value not complete yet (or blank line): keep accumulating.
            continue
        # raw_decode does not skip leading whitespace, so strip the remainder.
        rest, buf = buf[end:].lstrip(), ""
        yield obj
        # Drain any additional complete values that shared the same line.
        while rest:
            try:
                obj, end = decoder.raw_decode(rest)
            except ValueError:
                # Undecodable trailing text is discarded.
                break
            rest = rest[end:].lstrip()
            yield obj

In [6]:
# Paths of the seven content shards: contents/data.0 ... contents/data.6
contents_dir = [directory + 'contents/data.' + str(i) for i in range(7)]

# Parse only the first shard into memory for interactive exploration.
data = []
with open(contents_dir[0], encoding='utf-8') as data_file:
    data.extend(iterparse(data_file))

In [7]:
# Number of objects parsed from the first shard (contents/data.0).
len(data)

100000

In [8]:
# Peek at the first 20 entries of the 'morphs' field of the third object.
# As the output shows, each entry is a list of 'id/TAG' strings.
data[2]['morphs'][:20]

[['180/NNG', '12/JKS'],
 ['483/VV', '37/EF', '14/SF'],
 ['484/MAG'],
 ['485/NNG', '486/NNG', '12/JKC'],
 ['17/VV', '400/EC'],
 ['92/MAG'],
 ['487/VA', '7/ETM'],
 ['180/NNG', '12/JKS'],
 ['483/VV', '37/EF', '14/SF'],
 ['303/NNG', '59/JX'],
 ['488/NNG'],
 ['485/NNG', '40/JKO'],
 ['489/VV', '65/EC'],
 ['98/NNG', '40/JKO'],
 ['490/VV', '5/ETM'],
 ['6/NNG'],
 ['180/NNG', '12/JKS'],
 ['491/MAG'],
 ['492/VV', '177/EP', '13/EF', '14/SF'],
 ['493/NNG', '35/JKS']]

In [12]:
# Khaiii part-of-speech tag set: https://github.com/kakao/khaiii/wiki/코퍼스#품사-집합
# /NNG: common noun    /NNP: proper noun    /VV: verb    /VA: adjective    /XR: root
# /EF: sentence-final ending     /SF: period, exclamation mark, question mark
def morphs_to_sents(morphs):
    """Merge a list of morpheme groups into sentences.

    Groups are concatenated in order. A sentence is closed after any group
    containing a token tagged '/EF' (sentence-final ending) or '/SF'
    (period, exclamation mark, question mark).

    Morphemes left over after the last terminator are returned as a final
    sentence instead of being dropped, so text without closing punctuation
    still produces output.

    Parameters
    ----------
    morphs : list of list of str
        Groups of 'id/TAG' tokens, in document order.

    Returns
    -------
    list of list of str
        One flat token list per sentence; empty if `morphs` has no tokens.
    """
    sents = []
    current = []
    for group in morphs:
        current.extend(group)
        # Split sentences on a sentence-final ending or terminal punctuation.
        if any('/EF' in w or '/SF' in w for w in group):
            sents.append(current)
            current = []

    # Flush the trailing, unterminated sentence (if any).
    if current:
        sents.append(current)

    return sents

def token_processing(sent):
    """Return the tokens of `sent` whose tag marks a content word.

    A token is kept when it contains one of '/NNG', '/NNP', '/XR', '/VV' or
    '/VA' (common noun, proper noun, root, verb, adjective). Order is kept.
    """
    kept_tags = ('/NNG', '/NNP', '/XR', '/VV', '/VA')
    return [morph for morph in sent if any(tag in morph for tag in kept_tags)]

# Demo on the third object of the loaded shard: split its morphemes into
# sentences, then filter the third sentence down to content-word tokens.
# `sents` is reused by the keyword-extraction cell that follows.
sents = morphs_to_sents(data[2]['morphs'])
tokens = token_processing(sents[2])
print(sents[2])
print(tokens)

['303/NNG', '59/JX', '488/NNG', '485/NNG', '40/JKO', '489/VV', '65/EC', '98/NNG', '40/JKO', '490/VV', '5/ETM', '6/NNG', '180/NNG', '12/JKS', '491/MAG', '492/VV', '177/EP', '13/EF', '14/SF']
['303/NNG', '488/NNG', '485/NNG', '489/VV', '98/NNG', '490/VV', '6/NNG', '180/NNG', '492/VV']


In [13]:
from textrank import KeywordSummarizer

# Build the keyword extractor from the `textrank` module imported above.
# Each sentence is tokenized with token_processing (content-word filter).
# NOTE(review): window = -1 presumably means co-occurrence is counted over the
# whole sentence rather than a sliding window -- confirm in the textrank module.
keyword_extractor = KeywordSummarizer(
    tokenize = token_processing,
    window = -1,
    verbose = False
)
# Extract up to 30 keywords from the demo sentences of the previous cell.
# NOTE(review): in the output below the last two entries repeat the first two,
# so the result appears to be padded to length topk -- confirm.
keywords = keyword_extractor.summarize(sents, topk=30)
print(keywords)

['523/VV', '485/NNG', '498/VA', '17/VV', '187/VV', '466/NNG', '522/NNG', '520/NNG', '519/NNG', '158/NNG', '495/NNG', '306/NNG', '180/NNG', '504/NNG', '526/NNG', '557/VV', '493/NNG', '549/NNG', '488/NNG', '508/VV', '6/NNG', '150/NNG', '516/VV', '521/NNG', '403/VA', '483/VV', '533/VV', '531/VV', '523/VV', '485/NNG']


## Save Contents' TextRank Keywords to JSON

In [6]:
def morphs_to_sents(morphs):
    """Merge a list of morpheme groups into sentences.

    NOTE: this name is also defined in an earlier cell; whichever definition
    was executed most recently is the one in effect.

    Groups are concatenated in order. A sentence is closed after any group
    containing a token tagged '/EF' (sentence-final ending) or '/SF'
    (period, exclamation mark, question mark).

    Morphemes left over after the last terminator are returned as a final
    sentence instead of being dropped, so text without closing punctuation
    still produces output.

    Parameters
    ----------
    morphs : list of list of str
        Groups of 'id/TAG' tokens, in document order.

    Returns
    -------
    list of list of str
        One flat token list per sentence; empty if `morphs` has no tokens.
    """
    sents = []
    current = []
    for group in morphs:
        current.extend(group)
        # Split sentences on a sentence-final ending or terminal punctuation.
        if any('/EF' in w or '/SF' in w for w in group):
            sents.append(current)
            current = []

    # Flush the trailing, unterminated sentence (if any).
    if current:
        sents.append(current)

    return sents

def token_processing(sent):
    """Return the tokens of `sent` whose tag marks a content word.

    A token is kept when it contains one of '/NNG', '/NNP', '/XR', '/VV' or
    '/VA' (common noun, proper noun, root, verb, adjective). Order is kept.
    """
    kept_tags = ('/NNG', '/NNP', '/XR', '/VV', '/VA')
    return [morph for morph in sent if any(tag in morph for tag in kept_tags)]

from textrank import KeywordSummarizer

# Keyword extractor used by the batch run below; sentences are tokenized with
# token_processing (content-word filter).
# NOTE(review): window = -1 presumably means co-occurrence is counted over the
# whole sentence rather than a sliding window -- confirm in the textrank module.
keyword_extractor = KeywordSummarizer(
    tokenize = token_processing,
    window = -1,
    verbose = False
)

In [14]:
import time
start_time = time.time()

# Maps each object's 'id' to the list of its top-15 TextRank keywords.
contents_data = {}
contents_dir = [directory + 'contents/data.' + str(i) for i in range(7)]

for i, shard_path in enumerate(contents_dir):
    print('data.' + str(i) + ' is processing...')
    # Parse the whole shard into memory before extracting keywords.
    data = []
    with open(shard_path, encoding='utf-8') as data_file:
        data.extend(iterparse(data_file))

    for doc in data:
        sents = morphs_to_sents(doc['morphs'])
        keywords = keyword_extractor.summarize(sents, topk=15)
        contents_data[doc['id']] = keywords

# The reported time covers parsing and keyword extraction, not the JSON dump.
print("--- %s seconds ---" % (time.time() - start_time))

with open('contents_textrank.json', 'w', encoding="utf-8") as make_file:
    json.dump(contents_data, make_file, ensure_ascii=False, indent="\t")

data.0 is processing...
data.1 is processing...
data.2 is processing...
data.3 is processing...
data.4 is processing...
data.5 is processing...
data.6 is processing...
--- 2463.3041944503784 seconds ---


In [15]:
# Number of ids for which keywords were stored.
print(len(contents_data))
# Spot-check two ids: one with few keywords and one with none. Both should
# still be lists of the same length as every other entry (15), as the output
# below shows (repeated entries / 'NaN' placeholders).
print(contents_data["@kohwang56_81"])
print(contents_data["@pencil-k_478"])

642190
['17/VV', '434/VV', '433/NNG', '424/VA', '415/NNG', '409/VA', '403/VA', '175/VV', '213/VV', '454/NNG', '230/NNG', '459/NNG', '457/NNP', '17/VV', '434/VV']
['NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN', 'NaN']
