In [1]:
import pytz
import aiohttp
import asyncio
import datetime


async def datetime_to_iso_string(date: datetime.datetime) -> str:
    """Render ``date`` as an ISO-8601 string expressed in the Asia/Seoul zone.

    Microseconds are zeroed before the conversion, so the result has second
    precision. The function awaits nothing itself, but it is a coroutine, so
    callers must ``await`` it.

    Args:
        date: The datetime to convert.

    Returns:
        The ``isoformat()`` text of the converted datetime.
    """
    without_micros = date.replace(microsecond=0)
    seoul_tz = pytz.timezone('Asia/Seoul')
    in_seoul = without_micros.astimezone(seoul_tz)
    return in_seoul.isoformat()


async def request_sampling_data(target_keyword: list, size_per_channel: int, day_interval: int):
    """Request sample documents from the sample-data generation endpoint.

    POSTs a JSON body holding the keywords, the per-channel size and a date
    window that ends now and starts ``day_interval`` days earlier. Both ends
    of the window are rendered with ``datetime_to_iso_string``.

    Args:
        target_keyword: Keywords sent as ``target_keyword``.
        size_per_channel: Value sent as ``size_per_channel``.
        day_interval: Length of the date window, in days.

    Returns:
        The decoded JSON body of the response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, instead of handing an error body to the caller.
    """
    url = 'http://k8s.mysterico.com:31464/analyzer/niz_sample_data/generate'
    today = datetime.datetime.now()
    interval = datetime.timedelta(days=day_interval)
    payload = {
        "size_per_channel": size_per_channel,
        "target_keyword": target_keyword,
        "start_date": await datetime_to_iso_string(today - interval),
        "end_date": await datetime_to_iso_string(today),
        "include": ["channel"]
    }
    async with aiohttp.ClientSession() as session:
        # The context manager releases the connection even if decoding fails.
        async with session.post(url=url, json=payload) as response:
            response.raise_for_status()
            return await response.json()


async def random_sampling(target_keyword: list, size_per_channel: int, day_interval: int):
    """Fetch sample documents and return their plain-text contents.

    Calls ``request_sampling_data`` with the given arguments and extracts the
    ``contentPlainText`` field of every entry under ``documents``.

    Args:
        target_keyword: Keywords forwarded to ``request_sampling_data``.
        size_per_channel: Forwarded to ``request_sampling_data``.
        day_interval: Forwarded to ``request_sampling_data``.

    Returns:
        A list of the ``contentPlainText`` values. A response with a missing
        or null ``documents`` entry yields an empty list, and documents that
        carry no ``contentPlainText`` are skipped.
    """
    response = await request_sampling_data(target_keyword, size_per_channel, day_interval)
    # A missing or null "documents" entry is treated as "no documents".
    documents = response.get("documents") or []
    texts = (document.get("contentPlainText") for document in documents)
    # Drop documents without text; a None entry cannot be NFC-normalized later.
    return [text for text in texts if text is not None]




In [4]:
import unicodedata
from typing import List

async def request_ner(sentences: List[str]):
    """Send sentences to the local NER prediction endpoint.

    Each sentence is NFC-normalized before being sent, and the batch is
    POSTed as ``{"sentences": [...]}``.

    Args:
        sentences: The sentences to analyze.

    Returns:
        The decoded JSON body of the response. In the recorded notebook
        output this is a list with one dict per sentence, with the keys
        ``sentence``, ``merged_sentence``, ``entities`` and ``not_entities``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status.
    """
    url = 'http://127.0.0.1:8001/predict'
    normalized = [unicodedata.normalize('NFC', sentence) for sentence in sentences]
    async with aiohttp.ClientSession() as session:
        # The context manager releases the connection even if decoding fails.
        async with session.post(url=url, json={"sentences": normalized}) as response:
            response.raise_for_status()
            return await response.json()

In [5]:
# Smoke test: a single sentence mixing ZWJ emoji sequences with Korean words.
await request_ner(['''👨‍🔧👩‍🌾🧑‍🌾🧑‍🚀👩‍🚀👨‍🔬야경스타그램 일러스트그램 비건샴푸 성실한 🌚 '''])

[{'sentence': '👨\u200d🔧👩\u200d🌾🧑\u200d🌾🧑\u200d🚀👩\u200d🚀👨\u200d🔬야경스타그램 일러스트그램 비건샴푸 성실한 🌚 ',
  'merged_sentence': '                         일러스트그램 [비건샴푸 : PS] 성실한 🌚',
  'entities': [['비건샴푸', 'PS', [32, 36]]],
  'not_entities': [['일러스트그램', 'O', [25, 31]], ['성실한 🌚', 'O', [37, 42]]]}]

In [8]:
import time
import random

characters_per_line = 100
sampling_trial = 20
target_keyword = [
    "밀키트"
]

documents = await random_sampling(
    target_keyword=target_keyword,
    size_per_channel=1000,
    day_interval=2 ** 10
)
random_documents = [random.choice(documents) for _ in range(sampling_trial)]

print(f'total documents : {len(documents)}')
print(f'sampled documents : {len(random_documents)}\n')

def split_documents_by_index(document: str, index: int):
    """Wrap ``document`` by inserting a newline every ``index`` characters.

    Args:
        document: The text to wrap.
        index: Maximum number of characters per line.

    Returns:
        The consecutive ``index``-sized slices of ``document`` joined with
        newlines; an empty document gives an empty string.
    """
    chunks = []
    for offset in range(0, len(document), index):
        chunks.append(document[offset:offset + index])
    return '\n'.join(chunks)

for random_document in random_documents:
    try:
        start = time.time()
        result = await request_ner([random_document])
        for k, result in result[0].items():
            print(f'<<{k}>>')
            if k == 'merged_sentence':
                print(split_documents_by_index(result, characters_per_line), '\n')
                continue
            print(result, '\n')
        print(time.time() - start, 'sec')
        print('='*100, '\n')
    except Exception as e:
        print('\n', e)
        print(random_document, '\n')
        continue

total documents : 8233
sampled documents : 20

<<sentence>>
거제도 중식 1등 👍 탄탄면 전문 타이웨이 거제 🍜

거제도여행 필수코스!! 타이웨이 @taiwei_offical
.
.
.
#거제도맛집
#거제맛집
#거제도맛집베스트
#거제도가볼만한곳추천
#거제가볼맛한곳
#밀키트
#탄탄면
#탄탄면맛집
#타이웨이
#탕수대첩 

<<merged_sentence>>
[거제 : LOC]도 [중식 : FNB] 1등 👍 [탄탄면 : FNB] 전문 [타이웨이 : ORG] [거제    거제 : LOC]도여행 필수코스!! [타이웨이 : LOC] @tai
wei_offical . . . #[거제도 : LOC]맛집 #[거제 : LOC]맛집 #[거제도 : LOC]맛집베스트 #[거제 : LOC]도가볼만한곳추천 #[거제 : LOC]가볼맛한
곳 #[밀키트 : FNB] #[탄탄면 : FNB] #[탄탄면 : FNB]맛집 #[타이웨이 : LOC] #탕수대첩 

<<entities>>
[['거제', 'LOC', [0, 2]], ['중식', 'FNB', [4, 6]], ['탄탄면', 'FNB', [12, 15]], ['타이웨이', 'ORG', [19, 23]], ['거제    거제', 'LOC', [24, 32]], ['타이웨이', 'LOC', [43, 47]], ['거제도', 'LOC', [71, 74]], ['거제', 'LOC', [78, 80]], ['거제도', 'LOC', [84, 87]], ['거제', 'LOC', [94, 96]], ['거제', 'LOC', [106, 108]], ['밀키트', 'FNB', [115, 118]], ['탄탄면', 'FNB', [120, 123]], ['탄탄면', 'FNB', [125, 128]], ['타이웨이', 'LOC', [132, 136]]] 

<<not_entities>>
[['도', 'O', [2, 3]], ['1등 👍', 'O', [7, 11]], ['전문', 'O', [16, 18]], ['도여행 필수