# 3문장 요약

In [1]:
# textrank
import pandas as pd
import re
import torch
import math
import numpy as np
from sklearn.preprocessing import normalize
from konlpy.tag import Komoran
from konlpy.tag import Okt
from konlpy.tag import Hannanum

def pagerank(x, df=0.85, max_iter=30):
    """Score the nodes of a weighted graph with damped power iteration.

    Args:
        x: Square weight matrix of the graph (dense array or SciPy sparse
            matrix). It is L1-normalised along axis 0 before iterating.
        df: Damping factor; must lie strictly between 0 and 1.
        max_iter: Number of update steps to run (no convergence check).

    Returns:
        Array of shape ``(n, 1)`` with one score per node.
    """
    assert 0 < df < 1

    # initialize
    A = normalize(x, axis=0, norm='l1')
    R = np.ones(A.shape[0]).reshape(-1, 1)
    bias = (1 - df) * np.ones(A.shape[0]).reshape(-1, 1)

    # iteration
    for _ in range(max_iter):
        # Use a real matrix-vector product. `A * R` is only a matrix product
        # for scipy.sparse matrices; on a dense ndarray it broadcasts
        # element-wise and silently turns R into an (n, n) array.
        R = df * (A @ R) + bias

    return R

article = """ """


def _sentence_similarity(tokens_i, tokens_j):
    """Return the overlap of two token sets divided by the sum of their log sizes."""
    if not tokens_i or not tokens_j:
        # math.log(0) raises ValueError; a sentence without tokens shares nothing.
        return 0.0
    denominator = math.log(len(tokens_i)) + math.log(len(tokens_j))
    if denominator == 0:
        # Both sets hold exactly one token: log(1) + log(1) == 0.
        return 0.0
    return len(tokens_i & tokens_j) / denominator


def textrank(update, context):
    """Reply with a three-sentence extractive summary of the command text.

    The command arguments are joined into one text (also stored in the
    module-level ``article``), split into sentences, and each sentence is
    reduced to the set of nouns Komoran extracts from it. Sentences are
    ranked with ``pagerank`` over their pairwise similarity, and the three
    best ones are sent back in their original order.

    Args:
        update: Incoming update; only ``update.effective_chat.id`` is read.
        context: Callback context; ``context.args`` supplies the text and
            ``context.bot`` sends the reply.
    """
    global article
    article = " ".join(context.args)
    chat_id = update.effective_chat.id

    text = re.sub(r"\n+", " ", article)
    sentences = [s for s in re.split(r"[.?!]\s+", text) if s.strip()]
    if not sentences:
        # Nothing to rank; an empty message cannot be sent, so show the usage hint.
        context.bot.send_message(chat_id=chat_id, text="/sum3 긴글\n이렇게 입력해주세요")
        return

    komoran = Komoran()
    token_sets = [set(komoran.nouns(sentence)) for sentence in sentences]

    # Pairwise similarity with a zero diagonal (no self-links).
    similarity_matrix = [
        [
            0.0 if i == j else _sentence_similarity(tokens_i, tokens_j)
            for j, tokens_j in enumerate(token_sets)
        ]
        for i, tokens_i in enumerate(token_sets)
    ]

    weighted_graph = np.array(similarity_matrix)
    R = pagerank(weighted_graph)
    R = R.sum(axis=1)
    # Keep the three highest-scoring sentences; slice with -5 for five.
    indexs = R.argsort()[-3:]

    # Emit the chosen sentences in document order, each prefixed by a space.
    summary = "".join(" " + sentences[index] for index in sorted(indexs))
    context.bot.send_message(chat_id=chat_id, text=summary)

# 1문장 요약

In [2]:
from transformers import PreTrainedTokenizerFast, BartForConditionalGeneration
import torch
# Load the tokenizer and the BART conditional-generation model from the
# 'gogamza/kobart-summarization' checkpoint; both are module-level objects.
# NOTE(review): loading emits a warning that the checkpoint's tokenizer class is
# 'BartTokenizer' while PreTrainedTokenizerFast is used here — confirm this is intended.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-summarization')
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-summarization')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [3]:
article = """ """
def gogamza(update, context):
    """Reply with an abstractive summary of the command text.

    The command arguments are joined into one text (also stored in the
    module-level ``article``), encoded with the module-level ``tokenizer``,
    summarised with ``model.generate`` and the decoded result is sent back.

    Args:
        update: Incoming update; only ``update.effective_chat.id`` is read.
        context: Callback context; ``context.args`` supplies the text and
            ``context.bot`` sends the reply.
    """
    global article
    article = " ".join(context.args)
    chat_id = update.effective_chat.id
    if not article:
        # No text after the command: show the usage hint instead of summarising nothing.
        context.bot.send_message(chat_id=chat_id, text="/sum1 짧은글\n이렇게 입력해주세요")
        return

    input_ids = tokenizer.encode(article)
    # Truncate so the sequence, including the BOS/EOS tokens added below,
    # stays within the model's configured number of positions; longer inputs
    # would index past the position-embedding table.
    max_positions = getattr(model.config, "max_position_embeddings", None)
    if max_positions:
        input_ids = input_ids[:max(max_positions - 2, 0)]
    input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
    input_ids = torch.tensor([input_ids])

    summary_text_ids = model.generate(
        input_ids=input_ids,
        bos_token_id=model.config.bos_token_id,
        eos_token_id=model.config.eos_token_id,
        length_penalty=2.0,  # penalty applied to sequence length
        max_length=150,  # maximum length of the summary
        min_length=50,  # minimum length of the summary
        num_beams=5)  # number of beams explored during generation

    summary = tokenizer.decode(summary_text_ids[0], skip_special_tokens=True)
    context.bot.send_message(chat_id=chat_id, text=summary)

# 단어 뜻

In [4]:
import json
import urllib.request

word = """ """
def naver_dict(update, context):
    """Reply with the link of the first Naver encyclopedia hit for a word.

    The command arguments are concatenated without separators (also stored
    in the module-level ``word``), sent to the Naver ``encyc`` search
    endpoint, and the ``link`` field of the first returned item is sent back.

    Args:
        update: Incoming update; only ``update.effective_chat.id`` is read.
        context: Callback context; ``context.args`` supplies the word and
            ``context.bot`` sends the reply.

    Raises:
        urllib.error.URLError: If the HTTP request fails (including
            ``HTTPError`` for non-success status codes).
    """
    # Import explicitly: the module only imports urllib.request, which makes
    # urllib.parse available merely as a side effect.
    from urllib.parse import quote

    global word
    word = "".join(context.args)
    chat_id = update.effective_chat.id
    if not word:
        # No word given: show the usage hint instead of sending an empty query.
        context.bot.send_message(chat_id=chat_id, text="/def 단어\n이렇게 입력해주세요")
        return

    client_id = ''  # Your client_id
    client_secret = ''  # Your client_secret
    url = 'https://openapi.naver.com/v1/search/encyc?query=' + quote(word)
    request = urllib.request.Request(url)
    request.add_header('X-Naver-Client-Id', client_id)
    request.add_header('X-Naver-Client-Secret', client_secret)

    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        response_json = json.loads(response.read().decode('utf-8'))

    items = response_json.get('items') or []
    if not items:
        # A search without hits returns no items; indexing [0] would raise.
        context.bot.send_message(chat_id=chat_id, text="'{}' 검색 결과가 없습니다".format(word))
        return

    context.bot.send_message(chat_id=chat_id, text=items[0]['link'])

# 영상

In [5]:
keyword = """ """
def video(update, context):
    """Reply with links to the first YouTube search results for a keyword.

    The command arguments are concatenated without separators (also stored
    in the module-level ``keyword``). A Chrome session opens the YouTube
    results page, the ``href`` of up to three ``#video-title`` elements is
    collected, and each link is sent as its own message.

    Args:
        update: Incoming update; only ``update.effective_chat.id`` is read.
        context: Callback context; ``context.args`` supplies the keyword and
            ``context.bot`` sends the replies.
    """
    import time
    import warnings
    from urllib.parse import quote

    # Chrome driver installer and Selenium crawling API
    import chromedriver_autoinstaller
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    global keyword
    keyword = "".join(context.args)
    chat_id = update.effective_chat.id
    if not keyword:
        # No keyword given: show the usage hint without launching a browser.
        context.bot.send_message(chat_id=chat_id, text="/video 단어\n이렇게 입력해주세요")
        return

    # Silence warnings
    warnings.filterwarnings('ignore')

    # Chrome options: use a large window, because crawling fails when the
    # window is too small.
    options = webdriver.ChromeOptions()
    options.add_argument("--window-size=1505,800")

    # Launch Chrome
    chrome_path = chromedriver_autoinstaller.install()
    driver = webdriver.Chrome(chrome_path, options=options)

    n = 3  # n: number of video links to return
    try:
        # Open the YouTube results page; the keyword is percent-encoded so
        # characters such as '&' or '#' cannot break the query string.
        driver.get("https://www.youtube.com/results?search_query={}".format(quote(keyword)))
        time.sleep(0.5)

        # Collect the video links, skipping elements without an href.
        anchors = driver.find_elements(By.CSS_SELECTOR, "#video-title")
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
        url_list = [href for href in hrefs if href][:n]
    finally:
        # Always close the browser; otherwise every call leaks a Chrome process.
        driver.quit()

    if not url_list:
        context.bot.send_message(chat_id=chat_id, text="'{}' 검색 결과가 없습니다".format(keyword))
        return

    for i, url in enumerate(url_list):
        context.bot.send_message(chat_id=chat_id, text="{}번째 링크: {}".format(i+1, url))

# 이미지

In [6]:
import os
from PIL import Image
from pytesseract import *
  # import modules
# Directory part of the notebook file name resolved to an absolute path.
dir_now = os.path.dirname(os.path.abspath('텔레그램_챗봇_종합(이미지).ipynb'))

def get_photo(update, context) :
    """Run Korean OCR on a received photo and send the recognised text back.

    The last entry of ``update.message.photo`` is downloaded to
    ``image01.png`` inside ``dir_now``, passed to ``image_to_string`` and the
    result is sent both as a reply and as a regular chat message.

    Args:
        update: Incoming update carrying the photo message.
        context: Callback context; ``context.bot`` fetches the file and
            sends the message.
    """
    file_path = os.path.join(dir_now, 'image01.png')
    # presumably the last entry is the largest available size — verify against the Telegram API
    photo_id = update.message.photo[-1].file_id
    photo_file = context.bot.getFile(photo_id)
    photo_file.download(file_path)
    # Open the file that was just written (not a cwd-relative name, which can
    # point elsewhere) and release the handle once OCR is done.
    with Image.open(file_path) as image01:
        # mind the lang option
        Print01 = image_to_string(image01, lang='kor', config='-c preserve_interword_spaces=1 --psm 4')
    update.message.reply_text(Print01)
    context.bot.send_message(chat_id=update.effective_chat.id, text=Print01)

# 텔레그램

In [7]:
import time
from telegram import ChatAction
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import Updater
from telegram.ext import CommandHandler, MessageHandler, Filters, CallbackQueryHandler
 
BOT_TOKEN='' # replace with your bot token

print('Start telegram chat bot')
# Create the updater for this token and keep a reference to its dispatcher,
# which the handlers further down are added to.
updater = Updater( token=BOT_TOKEN, use_context=True )
dispatcher = updater.dispatcher

# Directory part of the notebook file name resolved to an absolute path.
dir_now = os.path.dirname(os.path.abspath('텔레그램_챗봇_종합(이미지).ipynb'))

def get_photo(update, context) :
    """Run Korean OCR on a received photo and reply with the recognised text.

    The last entry of ``update.message.photo`` is downloaded to
    ``image01.png`` inside ``dir_now``, passed to ``image_to_string`` and the
    result is sent as a reply to the photo message.

    Args:
        update: Incoming update carrying the photo message.
        context: Callback context; ``context.bot`` fetches the file.
    """
    file_path = os.path.join(dir_now, 'image01.png')
    # presumably the last entry is the largest available size — verify against the Telegram API
    photo_id = update.message.photo[-1].file_id
    photo_file = context.bot.getFile(photo_id)
    photo_file.download(file_path)
    # Open the file that was just written (not a cwd-relative name, which can
    # point elsewhere) and release the handle once OCR is done.
    with Image.open(file_path) as image01:
        # mind the lang option
        Print01 = image_to_string(image01, lang='kor', config='-c preserve_interword_spaces=1 --psm 4')
    update.message.reply_text(Print01)

# Service selection command
def cmd_menu_buttons(update, context):
    """Send an inline keyboard listing the five services, one button per row.

    Args:
        update: Incoming update; ``update.message.chat_id`` is the target chat.
        context: Callback context; ``context.bot`` sends the message.
    """
    # (button label, callback data) pairs in display order
    menu_entries = (
        ('1. 긴 글 세줄요약', '1'),
        ('2. 짧은 글 한줄요약', '2'),
        ('3. 단어 뜻', '3'),
        ('4. 영상', '4'),
        ('5. 이미지', '5'),
    )
    keyboard_rows = [
        [InlineKeyboardButton(label, callback_data=code)]
        for label, code in menu_entries
    ]

    context.bot.send_message(
        chat_id=update.message.chat_id,
        text='서비스를 선택하세요',
        reply_markup=InlineKeyboardMarkup(keyboard_rows),
    )
    
# Result of the selected service
def cb_button(update, context):
    """Replace the menu message with the usage hint of the chosen service.

    Args:
        update: Incoming update carrying the callback query of the pressed
            inline button.
        context: Callback context; ``context.bot`` edits the menu message.
    """
    query = update.callback_query
    data = query.data
    chat_id = query.message.chat_id

    # Acknowledge the button press so the client stops showing its
    # loading indicator.
    query.answer()

    # Show the typing indicator in the chat that holds the menu; the pressing
    # user's id is a different chat whenever the menu lives in a group.
    context.bot.send_chat_action(chat_id=chat_id, action=ChatAction.TYPING)

    # callback data -> usage hint shown in place of the menu
    usage_by_choice = {
        '1': "/sum3 긴글\n이렇게 입력해주세요",
        '2': "/sum1 짧은글\n이렇게 입력해주세요",
        '3': "/def 단어\n이렇게 입력해주세요",
        '4': "/video 단어\n이렇게 입력해주세요",
        '5': "/image 입력후 이미지를 입력해주세요",
    }
    usage = usage_by_choice.get(data)
    if usage is None:
        # Unknown callback data: leave the menu message untouched.
        return

    context.bot.edit_message_text(text=usage,
                                  chat_id=chat_id,
                                  message_id=query.message.message_id)

# Create the handlers that map a command (or update type) to its callback
menu_buttons_handler = CommandHandler('menu', cmd_menu_buttons)
button_callback_handler = CallbackQueryHandler(cb_button)
sum3_handler = CommandHandler('sum3', textrank)
sum1_handler = CommandHandler('sum1', gogamza)
def_handler = CommandHandler('def', naver_dict)
video_handler = CommandHandler('video', video)
photo_handler = MessageHandler(Filters.photo, get_photo)

# Add the handlers to the dispatcher, in the same order as they were created
for handler in (
    menu_buttons_handler,
    button_callback_handler,
    sum3_handler,
    sum1_handler,
    def_handler,
    video_handler,
    photo_handler,
):
    dispatcher.add_handler(handler)

# Run the chatbot: start polling and block until the process is stopped
updater.start_polling()
updater.idle()

Start telegram chat bot
