In [1]:
########################################################################################################
# textrank

import pandas as pd
import re
import torch
import math
import numpy as np
from sklearn.preprocessing import normalize
from konlpy.tag import Komoran
from konlpy.tag import Okt
from konlpy.tag import Hannanum

def pagerank(x, df=0.85, max_iter=30):
    """Score the nodes of a weighted graph with damped power iteration.

    Args:
        x: Square (n, n) weight matrix of the graph. It is passed to
            sklearn's `normalize`, which rescales each column to unit L1 norm.
        df: Damping factor; must lie strictly between 0 and 1.
        max_iter: Number of iterations to run. The loop always runs this many
            times; there is no convergence check.

    Returns:
        Array of shape (n, 1) holding one rank score per node.

    Raises:
        AssertionError: If `df` is not strictly between 0 and 1.
    """
    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    n_nodes = A.shape[0]
    R = np.ones((n_nodes, 1))
    bias = (1 - df) * np.ones((n_nodes, 1))

    # iteration
    for _ in range(max_iter):
        # `@` is a true matrix-vector product. With a dense ndarray, `A * R`
        # is an elementwise broadcast that turns R into an (n, n) matrix and
        # no longer propagates rank along the graph edges.
        R = df * (A @ R) + bias

    return R


def textrank(article, num_sentences=3):
    """Build an extractive summary from the highest-ranked sentences of `article`.

    The text is split into sentences, each sentence is reduced to the set of
    nouns returned by Komoran, and pairwise sentence similarity is computed as
    ``|nouns_i & nouns_j| / (log|nouns_i| + log|nouns_j|)``. The resulting
    weighted graph is scored with `pagerank`.

    Args:
        article: Text to summarise.
        num_sentences: How many top-ranked sentences to keep (default 3).

    Returns:
        The selected sentences in their original order, each preceded by a
        single space. Returns '' when there is nothing to summarise.
    """
    if not article or num_sentences <= 0:
        return ''

    text = re.sub(r"\n+", " ", article)
    # Raw string so that `\.` and `\s` reach the regex engine as written
    # instead of being treated as (invalid) Python string escapes.
    fragments = re.split(r"[\.?!]\s+", text)
    # Drop empty and whitespace-only fragments; they carry no content.
    sentences = [fragment for fragment in fragments if fragment.strip()]
    if not sentences:
        return ''

    komoran = Komoran()
    # Build each sentence's noun set once instead of once per comparison.
    noun_sets = [set(komoran.nouns(sentence)) for sentence in sentences]

    count = len(sentences)
    # The diagonal stays 0.0: a sentence is never linked to itself.
    weightedGraph = np.zeros((count, count))
    for i in range(count):
        for j in range(i + 1, count):
            size_i, size_j = len(noun_sets[i]), len(noun_sets[j])
            # log(0) is undefined: a sentence without nouns shares nothing.
            if size_i == 0 or size_j == 0:
                continue
            denominator = math.log(size_i) + math.log(size_j)
            # Two single-noun sentences give log(1) + log(1) == 0.
            if denominator == 0:
                continue
            similarity = len(noun_sets[i] & noun_sets[j]) / denominator
            # The measure is symmetric, so fill both halves at once.
            weightedGraph[i, j] = weightedGraph[j, i] = similarity

    R = pagerank(weightedGraph)
    # pagerank returns a 2-D array; summing over axis 1 gives one score per sentence.
    R = R.sum(axis=1)
    top_indices = R.argsort()[-num_sentences:]

    # Emit the chosen sentences in document order, each prefixed by a space.
    return ''.join(' ' + sentences[index] for index in sorted(top_indices))

In [2]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch
# Load the tokenizer and the BART conditional-generation model from the
# 'gogamza/kobart-summarization' checkpoint. Both become module-level globals.
# NOTE: loading through PreTrainedTokenizerFast emits the warning shown in this
# cell's output, because the checkpoint declares its tokenizer class as
# 'BartTokenizer'.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-summarization')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-summarization')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [3]:
def gogamza(article):
    """Generate an abstractive summary of `article` with the loaded KoBART model.

    Uses the module-level `tokenizer` and `model`.

    Args:
        article: Text to summarise.

    Returns:
        The decoded summary string with special tokens removed.
    """
    token_ids = tokenizer.encode(article)

    # The model can only embed a limited number of positions, so cap the
    # article tokens and leave room for the BOS and EOS tokens added below.
    # Longer inputs are truncated rather than failing inside the model.
    max_positions = getattr(model.config, 'max_position_embeddings', None)
    if max_positions is not None:
        token_ids = token_ids[:max(max_positions - 2, 0)]

    input_ids = torch.tensor(
        [[tokenizer.bos_token_id] + token_ids + [tokenizer.eos_token_id]]
    )

    summary_text_ids = model.generate(
        input_ids = input_ids,
        bos_token_id = model.config.bos_token_id,
        eos_token_id = model.config.eos_token_id,
        length_penalty=2.0, # penalty applied to sequence length
        max_length = 150, # maximum length of the summary
        min_length = 50, # minimum length of the summary
        num_beams = 5) # number of beams explored when choosing the next token

    return tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)

In [4]:
import telegram
from telegram.ext import Updater
from telegram.ext import MessageHandler, Filters
import telepot
from telepot.loop import MessageLoop # 봇 구동
from telepot.namedtuple import InlineKeyboardMarkup as MU # 마크업
from telepot.namedtuple import InlineKeyboardButton as BT # 버튼

# Fill these in before running; keep real credentials out of shared notebooks.
token = '' # Telegram bot API token
mc = '' # your own numeric Telegram user id

updater = Updater(token=token, use_context=True)

bot = telepot.Bot(token) # create the bot object

bot.sendMessage(chat_id=mc, text='시작')

# Seed `user_text` from the most recent pending update, if it is a text message.
# Fall back to '' when there are no updates, or when the last update has no
# message text, instead of raising IndexError / KeyError at startup.
_pending_updates = bot.getUpdates()
user_text = ''
if _pending_updates:
    user_text = _pending_updates[-1].get('message', {}).get('text', '')

def btn_show(msg):
    """Handle a chat message: remember its text and offer the summarizer choice.

    Args:
        msg: The chat message dict delivered by the message loop.
    """
    global user_text
    # Store the text that was just received, so the summarizer chosen from the
    # keyboard works on this message rather than on whatever `user_text` held
    # when the notebook cell first ran. Non-text messages leave it untouched.
    if 'text' in msg:
        user_text = msg['text']

    btn1 = BT(text = "1. textrank(긴 글 세줄요약)", callback_data = "1")
    btn2 = BT(text = "2. gogamza(짧은 글 한줄요약)", callback_data = "2")
    mu = MU(inline_keyboard = [[btn1, btn2]])
    bot.sendMessage(mc, "요약기를 선택하세요", reply_markup = mu)

def query_ans(msg):
    """Handle an inline-button press by sending the chosen summary of `user_text`.

    Args:
        msg: The callback-query dict delivered by the message loop. Its
            "data" field carries the `callback_data` of the pressed button.
    """
    # Acknowledge the press first, so the Telegram client stops showing its
    # progress indicator while the (possibly slow) summarizer runs.
    bot.answerCallbackQuery(msg["id"])

    query_data = msg["data"]
    if query_data == "1":
        bot.sendMessage(mc, text = textrank(user_text))
    elif query_data == "2":
        bot.sendMessage(mc, text = gogamza(user_text))

# Register the handlers and start the loop via run_as_thread(): 'chat' messages
# are routed to btn_show, 'callback_query' events (button presses) to query_ans.
MessageLoop(bot, {'chat': btn_show, "callback_query" : query_ans}).run_as_thread()# send replies