# Model Creating

## Library import

In [140]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import *
import json
from tqdm import tqdm
import os
import logging
import unicodedata
from shutil import copyfile
from transformers import PreTrainedTokenizer
import tensorflow_addons as tfa # Rectified Adam 옵티마이저 사용
import requests
import time
from apscheduler.schedulers.blocking import BlockingScheduler


2.4.1
1.19.5
0.24.2
2.0.9
0.5.1.2
0.12.1
2.25.1
<class 'apscheduler.schedulers.blocking.BlockingScheduler'>


## Data import

In [2]:
# 네이버 영화 감성분석 데이터 다운로드
!git clone https://github.com/e9t/nsmc.git


fatal: destination path 'nsmc' already exists and is not an empty directory.


In [3]:
# 딥러닝 훈련에 사용 할 train 데이터와 test 데이터를 pandas dataframe 형식으로 불러옵니다.
train = pd.read_table("nsmc/"+"ratings_train.txt")
test = pd.read_table("nsmc/"+"ratings_test.txt")

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for KoBert model."""


logger = logging.getLogger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer_78b3253a26.model",
                     "vocab_txt": "vocab.txt"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model"
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt"
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "monologg/kobert": 512,
    "monologg/kobert-lm": 512,
    "monologg/distilkobert": 512
}

PRETRAINED_INIT_CONFIGURATION = {
    "monologg/kobert": {"do_lower_case": False},
    "monologg/kobert-lm": {"do_lower_case": False},
    "monologg/distilkobert": {"do_lower_case": False}
}

SPIECE_UNDERLINE = u'▁'


class KoBertTokenizer(PreTrainedTokenizer):
    """
        SentencePiece based tokenizer. Peculiarities:
            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
            self,
            vocab_file,
            vocab_txt,
            do_lower_case=False,
            remove_space=True,
            keep_accents=False,
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs):
        super().__init__(
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs
        )

        # Build vocab
        self.token2idx = dict()
        self.idx2token = []
        with open(vocab_txt, 'r', encoding='utf-8') as f:
            for idx, token in enumerate(f):
                token = token.strip()
                self.token2idx[token] = idx
                self.idx2token.append(token)

        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
        self.vocab_txt = vocab_txt

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return len(self.idx2token)

    def get_vocab(self):
        return dict(self.token2idx, **self.added_tokens_encoder)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def preprocess_text(self, inputs):
        if self.remove_space:
            outputs = " ".join(inputs.strip().split())
        else:
            outputs = inputs
        outputs = outputs.replace("``", '"').replace("''", '"')

        if not self.keep_accents:
            outputs = unicodedata.normalize('NFKD', outputs)
            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
        if self.do_lower_case:
            outputs = outputs.lower()

        return outputs

    def _tokenize(self, text, return_unicode=True, sample=False):
        """ Tokenize a string. """
        text = self.preprocess_text(text)

        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        new_pieces = []
        for piece in pieces:
            if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                    if len(cur_pieces[0]) == 1:
                        cur_pieces = cur_pieces[1:]
                    else:
                        cur_pieces[0] = cur_pieces[0][1:]
                cur_pieces.append(piece[-1])
                new_pieces.extend(cur_pieces)
            else:
                new_pieces.append(piece)

        return new_pieces

    def _convert_token_to_id(self, token):
        """ Converts a token (str/unicode) in an id using the vocab. """
        return self.token2idx.get(token, self.token2idx[self.unk_token])

    def _convert_id_to_token(self, index, return_unicode=True):
        """Converts an index (integer) in a token (string/unicode) using the vocab."""
        return self.idx2token[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
        by concatenating and adding special tokens.
        A KoBERT sequence has the following format:
            single sequence: [CLS] X [SEP]
            pair of sequences: [CLS] A [SEP] B [SEP]
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
        Args:
            token_ids_0: list of ids (must not contain special tokens)
            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
                for sequence pairs
            already_has_special_tokens: (default False) Set to True if the token list is already formated with
                special tokens for the model
        Returns:
            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
        """

        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
        A KoBERT sequence pair mask has the following format:
        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence
        if token_ids_1 is None, only returns the first portion of the mask (0's).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory):
        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
            to a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return

        # 1. Save sentencepiece model
        out_vocab_model = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_file"])

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
            copyfile(self.vocab_file, out_vocab_model)

        # 2. Save vocab.txt
        index = 0
        out_vocab_txt = os.path.join(save_directory, VOCAB_FILES_NAMES["vocab_txt"])
        with open(out_vocab_txt, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
 
        return out_vocab_model, out_vocab_txt


In [5]:
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

In [6]:
def convert_data(data_df):
    global tokenizer
    
    SEQ_LEN = 64 #SEQ_LEN : 버트에 들어갈 인풋의 길이
    
    tokens, masks, segments, targets = [], [], [], []
    
    for i in tqdm(range(len(data_df))):
        # token : 문장을 토큰화함
        token = tokenizer.encode(data_df[DATA_COLUMN][i], max_length=SEQ_LEN, pad_to_max_length=True)
       
        # 마스크는 토큰화한 문장에서 패딩이 아닌 부분은 1, 패딩인 부분은 0으로 통일
        num_zeros = token.count(0)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
        
        # 문장의 전후관계를 구분해주는 세그먼트는 문장이 1개밖에 없으므로 모두 0
        segment = [0]*SEQ_LEN
 
        # 버트 인풋으로 들어가는 token, mask, segment를 tokens, segments에 각각 저장
        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
        
        # 정답(긍정 : 1 부정 0)을 targets 변수에 저장해 줌
        targets.append(data_df[LABEL_COLUMN][i])
 
    # tokens, masks, segments, 정답 변수 targets를 numpy array로 지정    
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)
 
    return [tokens, masks, segments], targets
 
# 위에 정의한 convert_data 함수를 불러오는 함수를 정의
def load_data(pandas_dataframe):
    data_df = pandas_dataframe
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data_df[LABEL_COLUMN] = data_df[LABEL_COLUMN].astype(int)
    data_x, data_y = convert_data(data_df)
    return data_x, data_y
 
SEQ_LEN = 64
BATCH_SIZE = 32
# 긍부정 문장을 포함하고 있는 칼럼
DATA_COLUMN = "document"
# 긍정인지 부정인지를 (1=긍정,0=부정) 포함하고 있는 칼럼
LABEL_COLUMN = "label"
 
# train 데이터를 버트 인풋에 맞게 변환
train_x, train_y = load_data(train)


  0%|          | 0/150000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 150000/150000 [00:20<00:00, 7160.18it/s]


In [7]:
test_x, test_y = load_data(test)

100%|██████████| 50000/50000 [00:06<00:00, 7325.34it/s]


In [8]:
# 버트 훈련을 빠르게 하기 위해, TPU를 사용
model = TFBertModel.from_pretrained("monologg/kobert", from_pt=True)
# 토큰 인풋, 마스크 인풋, 세그먼트 인풋 정의
token_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_word_ids')
mask_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_masks')
segment_inputs = tf.keras.layers.Input((SEQ_LEN,), dtype=tf.int32, name='input_segment')
# 인풋이 [토큰, 마스크, 세그먼트]인 모델 정의
bert_outputs = model([token_inputs, mask_inputs, segment_inputs])

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7efc000712a0> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7efc000712a0> is not a module, class, method, function, traceback, frame, or code object



In [9]:
bert_outputs = bert_outputs[1]

In [10]:
# 총 batch size * 4 epoch = 2344 * 4
opt = tfa.optimizers.RectifiedAdam(lr=5.0e-5, total_steps = 2344*4, warmup_proportion=0.1, min_lr=1e-5, epsilon=1e-08, clipnorm=1.0)

In [11]:
sentiment_drop = tf.keras.layers.Dropout(0.5)(bert_outputs)
sentiment_first = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))(sentiment_drop)
sentiment_model = tf.keras.Model([token_inputs, mask_inputs, segment_inputs], sentiment_first)
sentiment_model.compile(optimizer=opt, loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

In [12]:
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_segment (InputLayer)      [(None, 64)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 92186880    input_word_ids[0][0]             
                                                                 input_masks[0][0]            

In [13]:
history = sentiment_model.fit(train_x, train_y, epochs=4, shuffle=True, batch_size=64, validation_data=(test_x, test_y))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 185/2344 [=>............................] - ETA: 1:40:24 - loss: 0.1075 - accuracy: 0.9630

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





In [14]:
evaluate_result = sentiment_model.evaluate(test_x,test_y)



In [15]:
print('Accuracy : ' ,evaluate_result[1])

Accuracy :  0.8981999754905701


# import Server to Analysis

In [16]:
def convert_serverdata(data_df):
    global tokenizer
    
    SEQ_LEN = 64 #SEQ_LEN : 버트에 들어갈 인풋의 길이
    
    tokens, masks, segments, targets = [], [], [], []
    
    for i in tqdm(range(len(data_df))):
        # token : 문장을 토큰화함
        token = tokenizer.encode(data_df[DATA_COLUMN][i], max_length=SEQ_LEN, pad_to_max_length=True)
       
        # 마스크는 토큰화한 문장에서 패딩이 아닌 부분은 1, 패딩인 부분은 0으로 통일
        num_zeros = token.count(0)
        mask = [1]*(SEQ_LEN-num_zeros) + [0]*num_zeros
        
        # 문장의 전후관계를 구분해주는 세그먼트는 문장이 1개밖에 없으므로 모두 0
        segment = [0]*SEQ_LEN
 
        # 버트 인풋으로 들어가는 token, mask, segment를 tokens, segments에 각각 저장
        tokens.append(token)
        masks.append(mask)
        segments.append(segment)
    # tokens, masks, segments 를 numpy array로 지정    
    tokens = np.array(tokens)
    masks = np.array(masks)
    segments = np.array(segments)
    targets = np.array(targets)
 
    return [tokens, masks, segments], targets
 
# 위에 정의한 convert_data 함수를 불러오는 함수를 정의
def load_serverdata(pandas_dataframe):
    data_df = pandas_dataframe
    data_df[DATA_COLUMN] = data_df[DATA_COLUMN].astype(str)
    data = convert_serverdata(data_df)
    return data

In [17]:
#상태확인
state_url = "http://todaynews.dothome.co.kr/Select_Data_State.php"
response = requests.get(state_url)
if (response.text == '{\n    "Article_List": []\n}'):
    print('null')

In [18]:
#data import
select_url = "http://todaynews.dothome.co.kr/Select_Article_KeywordList.php"
response = requests.get(select_url)
text = response.text
df_serverdata = json.loads(text)
df_serverdata = pd.DataFrame(df_serverdata['Article_List'])
keyword_url = list(df_serverdata['Keyword_URL'])
keyword_word = list(df_serverdata['Keyword_Word'])

In [19]:
for i in range(len(keyword_url)):
    keyword_url[i] = ("'"+keyword_url[i]+"'")
    keyword_word[i] = ("'"+keyword_word[i]+"'")

In [20]:
#Article_Content Converting
SEQ_LEN = 64
BATCH_SIZE = 32
DATA_COLUMN = "Article_Content"

# train 데이터를 버트 인풋에 맞게 변환
article_data = load_serverdata(df_serverdata)
len(article_data)

# Reshaping
article_data = list(article_data)
article_data = article_data[0]
np.expand_dims(article_data, axis=0)

100%|██████████| 344/344 [00:00<00:00, 1801.90it/s]


array([[[[   2, 2124, 1670, ..., 2542, 6896,    3],
         [   2, 2124, 1670, ..., 2542, 6896,    3],
         [   2, 2929, 5330, ..., 1312, 6493,    3],
         ...,
         [   2, 3721, 2627, ...,  268,  264,    3],
         [   2, 3721, 2627, ...,  268,  264,    3],
         [   2, 3721, 2627, ...,  268,  264,    3]],

        [[   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         ...,
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1]],

        [[   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         ...,
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]]]])

In [21]:
#Article_Keyword Converting
SEQ_LEN = 64
BATCH_SIZE = 32
DATA_COLUMN = "Keyword_Word"

# train 데이터를 버트 인풋에 맞게 변환
keyword_data = load_serverdata(df_serverdata)
keyword_data = list(keyword_data)
keyword_data = keyword_data[0]
np.expand_dims(keyword_data, axis=0)


100%|██████████| 344/344 [00:00<00:00, 11923.51it/s]


array([[[[   2, 1674,    3, ...,    1,    1,    1],
         [   2, 3417, 7395, ...,    1,    1,    1],
         [   2,  650,  202, ...,    1,    1,    1],
         ...,
         [   2, 5107,    3, ...,    1,    1,    1],
         [   2,  517, 7922, ...,    1,    1,    1],
         [   2,  517, 5468, ...,    1,    1,    1]],

        [[   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         ...,
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1],
         [   1,    1,    1, ...,    1,    1,    1]],

        [[   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         ...,
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0],
         [   0,    0,    0, ...,    0,    0,    0]]]])

In [22]:
article_predict = sentiment_model.predict(article_data)
keyword_predict = sentiment_model.predict(keyword_data)



In [111]:
for i in range(len(keyword_url)):
    url = keyword_url[i] #url 주소
    keyword_pred_data = keyword_predict[i] #키워드 분석
    keyword_update_data = {'Keyword_URL':url} #키워드 분석 데이터 dict화
    if(keyword_pred_data >= 0.7):
        keyword_update_url = ("http://todaynews.dothome.co.kr/Update_KeywordList_Positive_Emotion.php") #키워드 분석 데이터 삽입 URL 
    else:
        keyword_update_url = ("http://todaynews.dothome.co.kr/Update_KeywordList_Negative_Emotion.php") #키워드 분석 데이터 삽입 URL
    response = requests.get(keyword_update_url,params=keyword_update_data) #전송

    article_pred_data = article_predict[i] #기사 분석
    article_update_data = {'Article_URL':url} #기사 분석 데이터 dict화
    if(article_pred_data >= 0.7):
        article_update_url = ('http://todaynews.dothome.co.kr/Update_Article_Positive_Emotion.php') #기사 분석 데이터 삽입 URL
    else:
        article_update_url = ('http://todaynews.dothome.co.kr/Update_Article_Negative_Emotion.php') #기사 분석 데이터 삽입 URL
    response = requests.get(article_update_url,params=article_update_data) #전송
    


In [103]:
keyword_rank_url = "http://todaynews.dothome.co.kr/Select_KeywordRank.php"
response = requests.get(keyword_rank_url)
keyword_text = response.text
df_keyword_data = json.loads(keyword_text)
df_keyword_data = pd.DataFrame(df_keyword_data['Article_List'])
keyword_text = list(df_serverdata['Keyword_Word'])
for i in range(len(keyword_text)):
    keyword_url = ('http://todaynews.dothome.co.kr/Update_KeywordRank_Emotion.php')
    keyword_syncro = {'Keyword_Word':keyword_text[i]} 
    response = requests.get(keyword_url,keyword_syncro)

{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":true}
{"success":tru

# Module

In [96]:
#분석 함수
def analysis():
    #data import
    select_url = "http://todaynews.dothome.co.kr/Select_Article_KeywordList.php"
    response = requests.get(select_url)
    text = response.text
    df_serverdata = json.loads(text)
    df_serverdata = pd.DataFrame(df_serverdata['Article_List'])
    keyword_url = list(df_serverdata['Keyword_URL'])
    keyword_word = list(df_serverdata['Keyword_Word'])
    
    #data preprocessing(1)
    for i in range(len(keyword_url)):
        keyword_url[i] = ("'"+keyword_url[i]+"'")
        keyword_word[i] = ("'"+keyword_word[i]+"'")
    
    #Article_Content Converting
    SEQ_LEN = 64
    BATCH_SIZE = 32
    DATA_COLUMN = "Article_Content"

    # train 데이터를 버트 인풋에 맞게 변환
    article_data = load_serverdata(df_serverdata)
    
    # Reshaping
    article_data = list(article_data)
    article_data = article_data[0]
    np.expand_dims(article_data, axis=0)
    
   #Article_Keyword Converting
    SEQ_LEN = 64
    BATCH_SIZE = 32
    DATA_COLUMN = "Keyword_Word"

    # predict 데이터를 버트 인풋에 맞게 변환
    keyword_data = load_serverdata(df_serverdata)
    keyword_data = list(keyword_data)
    keyword_data = keyword_data[0]
    np.expand_dims(keyword_data, axis=0)
    
    #predict
    article_predict = sentiment_model.predict(article_data)
    keyword_predict = sentiment_model.predict(keyword_data)
    
    for i in range(len(keyword_url)):
        url = keyword_url[i] #url 주소
        keyword_pred_data = int(np.around(keyword_predict[i])) #키워드 분석
        keyword_update_data = {'Keyword_URL':url} #키워드 분석 데이터 dict화
        if(keyword_pred_data == 1):
            keyword_update_url = ("http://todaynews.dothome.co.kr/Update_KeywordList_Positive_Emotion.php") #키워드 분석 데이터 삽입 URL 
        else:
            keyword_update_url = ("http://todaynews.dothome.co.kr/Update_KeywordList_Negative_Emotion.php") #키워드 분석 데이터 삽입 URL
        response = requests.get(keyword_update_url,params=keyword_update_data) #전송
        print(response.text)
        
        article_pred_data = int(np.around(article_predict[i])) #기사 분석
        article_update_data = {'Article_URL':url} #기사 분석 데이터 dict화
        if(article_pred_data == 1):
            article_update_url = ('http://todaynews.dothome.co.kr/Update_Article_Positive_Emotion.php') #기사 분석 데이터 삽입 URL
        else:
            article_update_url = ('http://todaynews.dothome.co.kr/Update_Article_Negative_Emotion.php') #기사 분석 데이터 삽입 URL
        response = requests.get(article_update_url,params=article_update_data) #전송
        print(response.text)

In [86]:
def keyword_rank_syn():
    keyword_rank_url = "http://todaynews.dothome.co.kr/Select_KeywordRank.php"
    response = requests.get(keyword_rank_url)
    keyword_text = response.text
    df_keyword_data = json.loads(keyword_text)
    df_keyword_data = pd.DataFrame(df_keyword_data['Article_List'])
    keyword_text = list(df_serverdata['Keyword_Word'])
    for i in range(len(keyword_text)):
        keyword_url = ('http://todaynews.dothome.co.kr/Update_KeywordRank_Emotion.php')
        keyword_syncro = {'Keyword_Word':keyword_text[i]} 
        response = requests.get(keyword_url,keyword_syncro)
        print(response.text)

In [143]:
#실행 모듈
def job():
    analysis()
    keyword_rank_syn()
    print("updated!")

In [260]:
def state_check():
    state_url = "http://todaynews.dothome.co.kr/Select_Data_State.php"
    response = requests.get(state_url)
    state = response.text
    state = json.loads(response.text)
    state = state["Data_State"]
    state = state[0]['State_Code']
    if(state != '0'):
        return 1
    else :
        return 0

In [261]:
# 스케쥴링 모듈
def sched_module():
    state = state_check()
    if (state == 0):
        job()
        print('Newest Version Updated!')
    else:
        print('Newest Version Now')

# Run

In [273]:
sched = BlockingScheduler()

In [274]:
activate = sched.add_job(sched_module, 'cron', second='0', id="Analysis_Module")

In [None]:
sched.start()

In [None]:
sched.shutdown()