<a href="https://colab.research.google.com/github/PPareun/DeepLearningTextBook/blob/main/Keyword_Extractor/Keyword_From_SUM/BertSUM_(ENG_KOR).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
!pip install trafilatura

In [5]:
import pickle
import transformers
import torch
import sentencepiece
from transformers import get_linear_schedule_with_warmup, BertTokenizer, BertModel
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset, random_split
import math
from torch.optim import Adam
from torch.nn import functional as F
from collections import Counter
import re
import torch.nn as nn
import pandas as pd

In [6]:
from kobert_tokenizer import KoBERTTokenizer

# One tokenizer per supported language; Embedding() picks between them via its `lang` argument.
kor_tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# The BertTokenizer option is spelled `do_lower_case`; the previous `do_lower` keyword
# is not a BertTokenizer argument, so the intended setting was never actually applied.
eng_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/371k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/432 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cpu


In [8]:
def Embedding(sentence, lang = 'korean', max_len = 512):
  """Pack a list of sentences into fixed-length inputs for the BERTSUM model.

  Each sentence is encoded separately with the tokenizer selected by `lang`
  (`kor_tokenizer` for 'korean', `eng_tokenizer` otherwise) and the encodings
  are concatenated. Packing stops at the first sentence that would bring the
  total to `max_len` tokens or more; that sentence and all later ones are
  dropped.

  Args:
    sentence: iterable of sentence strings. NOTE: passing a single string
      would iterate over its characters, so wrap it in a list.
    lang: 'korean' selects the Korean tokenizer; any other value selects the
      English one.
    max_len: length every returned row is padded to (default 512, the value
      that was previously hard-coded).

  Returns:
    Tuple of five tensors, each of shape (1, max_len):
      word     - concatenated token ids, padded with 0.
      seg      - segment ids alternating 0/1 per sentence, padded with 0.
      attn     - 1 for real tokens, 0 for padding.
      cls      - start offset of every packed sentence, padded with -1.
      cls_mask - True for the valid entries of `cls`, False for its padding.
  """
  tokenizer = kor_tokenizer if lang == 'korean' else eng_tokenizer
  word, seg, cls = [], [], []
  for sent in sentence:
    add = tokenizer.encode(sent)
    # Strict limit kept from the original implementation: the packed
    # sequence never reaches max_len itself.
    if len(word) + len(add) >= max_len:
      break
    # len(cls) is the index of the sentence being added -> alternating segment id.
    seg.extend([len(cls) % 2] * len(add))
    # The sentence starts at the current end of the packed sequence.
    cls.append(len(word))
    word.extend(add)

  length = len(word)
  n_sent = len(cls)
  attn = [1] * length + [0] * (max_len - length)
  # NOTE(review): padding id is hard-coded to 0; confirm it matches the
  # tokenizer's pad token id (padded positions carry attention mask 0).
  word = word + [0] * (max_len - length)
  seg = seg + [0] * (max_len - length)
  cls_mask = [True] * n_sent + [False] * (max_len - n_sent)
  # -1 is only a filler: those slots are flagged False in cls_mask.
  cls = cls + [-1] * (max_len - n_sent)
  return torch.tensor([word]), torch.tensor([seg]), torch.tensor([attn]), torch.tensor([cls]), torch.tensor([cls_mask])

In [9]:
class PositionalEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal position table.

    Even columns hold sin(pos / 10000^(2i / d_model)) and odd columns hold the
    matching cosine, i.e. the sinusoidal encoding of the Transformer paper
    ("Attention Is All You Need").

    Args:
        d_model: width of each position vector; 768 matches the BERT hidden
            size used by the rest of this notebook.
        max_len: number of positions precomputed.
        device: device the table is created on. None (default) picks 'cuda'
            when available and 'cpu' otherwise, so the class no longer needs
            a notebook-level `device` variable to exist when it is defined.
    """
    def __init__(self, d_model = 768, max_len = 512, device = None):
        super(PositionalEncoding, self).__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
        _2i = torch.arange(0, d_model, step=2, device=device).float()
        angle = pos / (10000 ** (_2i / d_model))

        encoding = torch.zeros(max_len, d_model, device=device)
        encoding[:, ::2] = torch.sin(angle)
        # For an odd d_model there is one cosine column fewer than sine columns.
        encoding[:, 1::2] = torch.cos(angle[:, :d_model // 2])
        # Registered as a buffer so Module.to()/cuda() move it with the model;
        # persistent=False keeps it out of state_dict, so saved weights keep
        # the same keys as before.
        self.register_buffer('encoding', encoding, persistent=False)

    def forward(self, x):
        """Return the first x.size(1) rows of the table, shape (x.size(1), d_model).

        Only the size of dimension 1 of `x` is used; the result is placed on
        the same device as `x` so it can be added to it directly.
        """
        seq_len = x.size(1)
        return self.encoding[:seq_len, :].to(x.device)

In [10]:
class BERTSUM(nn.Module):
  """BERT encoder followed by a small sentence-level scoring head.

  The vectors found at the positions listed in `clss` are picked out of the
  BERT output, masked, offset by a positional encoding, passed through a
  Transformer encoder layer twice and finally mapped to one sigmoid score per
  position.

  NOTE(review): `self.encoder` is a single nn.TransformerEncoderLayer that is
  applied twice, so both passes share the same weights.
  NOTE(review): the layer is built without `batch_first`, while the tensor it
  receives has the batch on dimension 0 - confirm this is intended before
  changing it, since saved checkpoints were produced with this behaviour.
  """
  def __init__(self, lang = 'korean'):
    super().__init__()
    checkpoint = 'skt/kobert-base-v1' if lang == 'korean' else "bert-base-uncased"
    self.bert = BertModel.from_pretrained(checkpoint)
    self.pos_emb = PositionalEncoding()
    self.encoder = nn.TransformerEncoderLayer(d_model=768, nhead=8)
    self.linear = nn.Linear(768, 1, bias = True)
    self.sigmoid = nn.Sigmoid()

  def forward(self, word_emb, seg_emb, attn_mask, clss, mask_cls):
    """Score every slot of `clss`.

    Args:
      word_emb: token ids, passed to BERT as `input_ids`.
      seg_emb: segment ids, passed to BERT as `token_type_ids`.
      attn_mask: passed to BERT as `attention_mask`.
      clss: per-row indices into the token axis of the BERT output.
      mask_cls: per-slot mask for `clss`; slots where it is False/0 are
        zeroed before the encoder.

    Returns:
      Sigmoid scores with a trailing dimension of size 1, one per slot of `clss`.
    """
    hidden = self.bert(input_ids = word_emb, attention_mask = attn_mask, token_type_ids = seg_emb)[0]
    # Pair every batch row with its own list of positions from `clss`.
    row_index = torch.arange(hidden.size(0)).unsqueeze(1)
    sent_vecs = hidden[row_index, clss]
    # Zero out the slots that do not correspond to a real sentence.
    sent_vecs = sent_vecs * mask_cls[:, :, None].float()
    sent_vecs = sent_vecs + self.pos_emb(sent_vecs)
    # Two passes through the encoder layer (the original comment cites the
    # paper's best-performing configuration).
    for _ in range(2):
      sent_vecs = self.encoder(sent_vecs)
    return self.sigmoid(self.linear(sent_vecs))

In [11]:
# Load the trained Korean and English BERTSUM checkpoints onto the selected device.
# NOTE: torch.load unpickles the file, which can execute arbitrary code - only load
# checkpoints that come from a trusted source.
BertSum_kor, BertSum_eng = (
  torch.load(checkpoint_path, map_location = torch.device(device))
  for checkpoint_path in (
    "/content/drive/MyDrive/BertSum_kor.pt",
    "/content/drive/MyDrive/BertSum_model.pt",
  )
)

In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [None]:
# Token-classification (NER) pipeline used by pred.keyword(); tokenizer and model
# are both loaded from the same pretrained checkpoint.
tokenizer_, model_ = (
  loader.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
  for loader in (AutoTokenizer, AutoModelForTokenClassification)
)
classifier = pipeline("ner", tokenizer=tokenizer_, model=model_)

In [30]:
class pred():
  """Score the sentences of a text with BERTSUM and derive summaries/keywords.

  On construction the text is split into sentences, packed with Embedding()
  and scored by the model for the requested language (`BertSum_kor` for
  'korean', `BertSum_eng` otherwise).

  Attributes:
    text: list of sentence strings obtained from the input text.
    model: the BERTSUM model used for scoring.
    value: 1-D tensor with one score per sentence that Embedding() packed.
    get_index: indices into `value` (and `text`) sorted by descending score.
  """
  def __init__(self, text, lang = 'korean'):
    self.text = self._split_sentences(text)
    self.model = BertSum_kor if lang == 'korean' else BertSum_eng
    # Inference only: switch the model to evaluation mode and disable gradients.
    self.model.eval()
    word_emb, segment_emb, attention_mask, clss, mask_cls = Embedding(self.text, lang = lang)
    mask_cls = mask_cls.to(device)
    with torch.no_grad():
      scores = self.model(word_emb = word_emb.to(device), seg_emb = segment_emb.to(device),
                          attn_mask = attention_mask.to(device), clss = clss.to(device),
                          mask_cls = mask_cls)
    # Keep only the slots that belong to real sentences.
    self.value = scores[mask_cls].squeeze(dim = -1)
    # Sort by criteria value
    self.get_index = torch.sort(self.value, descending = True).indices

  @staticmethod
  def _split_sentences(text):
    """Split `text` on '.'+whitespace or newline and drop blank pieces.

    The previous implementation discarded the last piece unconditionally,
    which lost the final sentence whenever the text did not end with a
    separator, and it kept empty strings produced by blank lines.
    """
    return [sent for sent in re.split(r'[.]\s+|\n', text) if sent.strip()]

  def _top_indices(self, n):
    """Return the indices (as ints) of the at most `n` highest-scoring sentences."""
    count = min(n, self.get_index.size(dim = 0))
    return [int(self.get_index[i]) for i in range(count)]

  def _sentence_keywords(self, sent):
    """Return the entity strings the NER `classifier` finds in `sent`.

    Consecutive NER tokens are merged into one span when they touch or are
    separated by exactly one space. Spans of a single character are dropped
    (previously this filter was applied only to the last span of a sentence).
    """
    spans = []
    start = end = None
    for ner in classifier(sent):
      # Skip tokens that are only the '▁' marker (presumably the bare
      # SentencePiece word-boundary piece - it carries no text).
      if ner['word'] == '▁':
        continue
      tok_start, tok_end = ner['start'], ner['end']
      joins_previous = end is not None and (
        tok_start == end or (tok_start - end == 1 and sent[end] == ' '))
      if joins_previous:
        end = tok_end
      else:
        if start is not None:
          spans.append(sent[start:end])
        start, end = tok_start, tok_end
    # Flush the span that was still open when the tokens ran out.
    if start is not None:
      spans.append(sent[start:end])
    return [word for word in spans if len(word) > 1]

  # Get value
  def get_value(self):
    """Return the per-sentence score tensor."""
    return self.value

  # Get n_summary sentences
  def n_sum(self, n = 5):
    """Return the top-`n` sentences (best first), each followed by a newline."""
    return "".join(self.text[i] + '\n' for i in self._top_indices(n))

  #Get [summary, prob] list
  def sum_prob(self, n = 5):
    """Return a list of [sentence, score] pairs for the top-`n` sentences."""
    return [[self.text[i], self.value[i]] for i in self._top_indices(n)]

  # Get Keyword from NER of n_summary
  def keyword(self, n = 5):
    """Print the top-`n` summary sentences and the set of keywords found in them."""
    keyword = []
    print("{:}-sentence summarization : ".format(n))
    print("")
    for sent, _ in self.sum_prob(n):
      print(sent)
      keyword.extend(self._sentence_keywords(sent))
    print("")
    print("Keyword from {:}-sentence : ".format(n))
    print("")
    print(set(keyword))

In [47]:
from trafilatura import fetch_url, extract

# Read an article URL, extract its main text, guess the language and print
# the summary sentences with their keywords.
URL = input().strip()
doc = fetch_url(URL)
# fetch_url yields None when the download fails; skip extraction in that case
# so the failure is reported by the else-branch below.
text = extract(doc) if doc else None
if text:
  # Hangul compatibility jamo and Hangul syllables; the language is guessed from
  # the first 50 characters only.
  hangul = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
  lang = 'korean' if hangul.search(text[:50]) else 'english'
  # Message: "The article you submitted is <lang>."
  print("등록하신 Article은 {}입니다.".format(lang))
  pred(text, lang).keyword()
else:
  # Message: "<URL> is not a valid URL."
  print("{} 는 유효하지 않은 URL입니다.".format(URL))

https://tech.hindustantimes.com/tech/news/dalle-2-to-stable-diffusion-generate-photos-freely-with-these-ai-tools-71674561212855.html
등록하신 Article은 english입니다.
5-sentence summarization : 

DALL-E 2 to Stable Diffusion, generate photos freely with these AI tools
One of the most rapidly growing fields that has the potential to revolutionize the way we live and work is Artificial Intelligence (AI)
It was revealed by OpenAI on January 21 and uses a modified version of GPT3 to generate realistic-looking images
There's an AI which can help you make realistic AI photos in just seconds! Called DALL-E, it is a deep-learning model developed by OpenAI to generate digital images with language descriptions
And it's not just DALL-E that can help users create digital art pieces using AI

Keyword from 5-sentence : 

{'OpenAI', 'DALL-E 2', 'GPT3', 'Stable Diffusion', 'Intelligence', 'AI', 'DALL-E'}
