In [1]:
import asyncio
from typing import List

import aiohttp
import pandas as pd
import wandb
from sklearn.metrics import f1_score
from tqdm import tqdm

In [19]:
url = 'http://k8s.mysterico.com:31464/analyzer/niz_sample_data/generate'

async with aiohttp.ClientSession() as sess:
    response = await sess.post(
        url=url,
        json={
            "size_per_channel": 5,
            "target_keyword": ["맥도날드"],
            "start_date": "2023-03-10T00:00:00.000Z",
            "end_date": "2023-03-17T00:00:00.000Z",
            "channel": ["twitter"],
            # "search_keyword": ["AI"],
            "spam_doc_type": [
                "profit",
                "press"
            ],
            "spam_category": [
                "defaultSpam",
                "trade",
                "cafeActivity",
                "economy"
            ],
            "remove_empty_content": True,
            "include": ["channel"]
        }
    )
    result = await response.json()
    texts = [
        r.get('contentPlainText').replace('\n', ' ').replace('\u200b', '') for r in result.get('documents')
    ]
    print(len(texts))
    for i, text in enumerate(texts):
        print(i, text)

5
0 맥도날드에선 리필 안됨
1 [맥도날드x뉴진스] 색다른 차원의 빠삭함, 맥크리스피(Full ver.) https://t.co/c5lS9CaUdA  Newjeansちゃんたち出てるから見てみたらクリスピーめっちゃ美味しそうで、今週末はマクドナルドに決定です https://t.co/qW6KI55Nlw
2 저녁을 먹으러 맥도날드에 오다.
3 뭐?어제 맥도날드를 가는바람에 KFC를 안갔다고?
4 [맥도날드x뉴진스] 색다른 차원의 빠삭함, 맥크리스피(Full ver.) https://t.co/goVgRTVUyo @YouTubeより


# Legacy category model

In [2]:
async def request_legacy_category(text: str):
    """Classify one text with the legacy category model.

    Args:
        text: The text to classify; sent as the ``text`` field of the JSON body.

    Returns:
        The decoded JSON body of the response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx status.
    """
    async with aiohttp.ClientSession() as sess:
        # Using the request as a context manager releases the connection
        # deterministically once the body has been read.
        async with sess.post(
            url='http://k8s.mysterico.com:31464/model-server/category-model/classify',
            json={
                "text": text
            }
        ) as response:
            # Surface HTTP failures here instead of as a confusing error in the caller.
            response.raise_for_status()
            return await response.json()

In [7]:
# Smoke-test the legacy endpoint with a single sample sentence and show the raw response.
await request_legacy_category('맥도날드 맥크리스피 가격 너무 비싼거아님?!')

{'model': 'CategoryClassifier',
 'result': [{'category': 'price', 'result': False, 'rank': 255},
  {'category': 'quality', 'result': False, 'rank': 255},
  {'category': 'design', 'result': False, 'rank': 255},
  {'category': 'service', 'result': False, 'rank': 255},
  {'category': 'etc', 'result': False, 'rank': 255}]}

In [2]:
# Evaluation CSVs, one per category; the commented-out entry is not loaded.
df_file_names = [
    'eval/df_가격_20000_test.csv',
    'eval/df_디자인_20000_test.csv',
    # 'eval/df_브랜드제품기타_20000_test.csv',
    'eval/df_서비스_20000_test.csv',
    'eval/df_품질_20000_test.csv'
]
# The frames are unpacked positionally, so the order of the list above matters.
df_price, df_design, df_service, df_quality = map(pd.read_csv, df_file_names)

In [14]:
text_price = df_price['text'].to_list()
label_price = df_price['가격'].to_list()

preds = list()
for text in tqdm(text_price):
    result = await request_legacy_category(text)
    for r in result.get('result'):
        if r.get('category') == 'price':
            label = 1.0 if r.get('result') == True else 0.0
            preds.append(label)
            
from sklearn.metrics import f1_score
f1_score(label_price, preds)

In [22]:
text_design = df_design['text'].to_list()
label_design = df_design['디자인'].to_list()

preds = list()
for text in tqdm(text_design):
    result = await request_legacy_category(text)
    for r in result.get('result'):
        if r.get('category') == 'design':
            label = 1.0 if r.get('result') == True else 0.0
            preds.append(label)
            
from sklearn.metrics import f1_score
f1_score(label_design, preds)

100%|██████████| 2706/2706 [04:28<00:00, 10.07it/s]


0.7631695440460382

In [4]:
text_service = df_service['text'].to_list()
label_service = df_service['서비스'].to_list()

preds = list()
for text in tqdm(text_service):
    result = await request_legacy_category(text)
    for r in result.get('result'):
        if r.get('category') == 'service':
            label = 1.0 if r.get('result') == True else 0.0
            preds.append(label)
            
from sklearn.metrics import f1_score
f1_score(label_service, preds)

100%|██████████| 4128/4128 [07:13<00:00,  9.52it/s]


0.8446000538647993

In [5]:
text_quality = df_quality['text'].to_list()
label_quality = df_quality['품질'].to_list()

preds = list()
for text in tqdm(text_quality):
    result = await request_legacy_category(text)
    for r in result.get('result'):
        if r.get('category') == 'quality':
            label = 1.0 if r.get('result') == True else 0.0
            preds.append(label)
            
from sklearn.metrics import f1_score
f1_score(label_quality, preds)

100%|██████████| 12992/12992 [25:18<00:00,  8.56it/s]  


0.8689300240723831

In [6]:
# Log in to Weights & Biases before any run is started in the cells below.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhsung951027[0m ([33mmysterico[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [7]:
df_file_names = {
    'eval/df_가격_20000_test.csv': ['가격', 'price'],
    'eval/df_디자인_20000_test.csv': ['디자인', 'design'],
    'eval/df_서비스_20000_test.csv': ['서비스', 'service'],
    'eval/df_품질_20000_test.csv': ['품질', 'quality']
}

for df_file_name, categories in df_file_names.items():
    ko_category, en_category = categories
    wandb.init(
        project="optimize-category-model",
        name=f"legacy-{en_category}"
    )
    df = pd.read_csv(df_file_name)
    texts = df['text'].to_list()
    labels = df[ko_category].to_list()
    preds = list()
    for text in texts:
        result = await request_legacy_category(text)
        for r in result.get('result'):
            if r.get('category') == en_category:
                pred = 1.0 if r.get('result') == True else 0.0
                preds.append(pred)
    f1 = f1_score(labels, preds)
    wandb.log(
        {"f1 score": f1}
    )
    
wandb.finish()

0,1
f1 score,▁

0,1
f1 score,0.88985


0,1
f1 score,▁

0,1
f1 score,0.7589


0,1
f1 score,▁

0,1
f1 score,0.84412


0,1
f1 score,▁

0,1
f1 score,0.86615


# Renew category model

In [3]:
# Log in to Weights & Biases before the renewed-model runs below are started.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhsung951027[0m ([33mmysterico[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [16]:
# Port -> category served on that port (keys are strings).
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    """POST one text to the ``category_clf_chunk`` endpoint on ``port``.

    Args:
        text: The text to classify; sent as a one-element ``text`` list.
        port: Port of the model server to call.

    Returns:
        The decoded JSON body of the response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx status.
    """
    async with aiohttp.ClientSession() as sess:
        # Using the request as a context manager releases the connection
        # once the body has been read.
        async with sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_chunk',
            json={
                "text": [text]
            }
        ) as response:
            response.raise_for_status()
            return await response.json()

async def request_renew_category(text: str):
    """Query every server listed in ``ports`` concurrently and print each answer.

    Prints one ``category text response`` line per port, in ``ports`` order.
    """
    responses = await asyncio.gather(
        # `ports` keys are strings; convert so `request` gets the int it declares.
        *(request(text=text, port=int(port)) for port in ports)
    )
    print('\n')
    # gather() preserves argument order, so responses line up with ports.values().
    for category, response in zip(ports.values(), responses):
        print(category, text, response)

In [4]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_chunk',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-price"
)

text_price = df_price['text'].to_list()
label_price = df_price['가격'].to_list()

preds = list()
for text in tqdm(text_price):
    result = await request(text, 9001)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'price':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_price, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

100%|██████████| 3040/3040 [06:35<00:00,  7.70it/s]


0,1
f1 score,▁

0,1
f1 score,0.95159


In [5]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_chunk',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-design"
)

text_design = df_design['text'].to_list()
label_design = df_design['디자인'].to_list()

preds = list()
for text in tqdm(text_design):
    result = await request(text, 9002)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'design':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_design, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

100%|██████████| 2706/2706 [05:12<00:00,  8.66it/s]


0,1
f1 score,▁

0,1
f1 score,0.85799


In [3]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_chunk',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-service"
)

text_service = df_service['text'].to_list()
label_service = df_service['서비스'].to_list()

preds = list()
for text in tqdm(text_service):
    result = await request(text, 9003)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'service':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_service, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhsung951027[0m ([33mmysterico[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 4128/4128 [07:48<00:00,  8.81it/s]


0,1
f1 score,▁

0,1
f1 score,0.88084


In [4]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_chunk',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-quality"
)

text_quality = df_quality['text'].to_list()
label_quality = df_quality['품질'].to_list()

preds = list()
for text in tqdm(text_quality):
    result = await request(text, 9000)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'quality':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_quality, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

100%|██████████| 12992/12992 [25:23<00:00,  8.53it/s]


0,1
f1 score,▁

0,1
f1 score,0.88342


# Renew category by sentence

In [3]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_sentence',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-price-sentence"
)

text_price = df_price['text'].to_list()
label_price = df_price['가격'].to_list()

preds = list()
for text in tqdm(text_price):
    result = await request(text, 9001)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'price':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_price, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhsung951027[0m ([33mmysterico[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 3040/3040 [06:52<00:00,  7.37it/s]


0,1
f1 score,▁

0,1
f1 score,0.94248


In [4]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_sentence',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-design-sentence"
)

text_design = df_design['text'].to_list()
label_design = df_design['디자인'].to_list()

preds = list()
for text in tqdm(text_design):
    result = await request(text, 9002)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'design':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_design, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

100%|██████████| 2706/2706 [05:53<00:00,  7.67it/s]


0,1
f1 score,▁

0,1
f1 score,0.84935


In [5]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_sentence',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-service-sentence"
)

text_service = df_service['text'].to_list()
label_service = df_service['서비스'].to_list()

preds = list()
for text in tqdm(text_service):
    result = await request(text, 9003)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'service':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_service, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

100%|██████████| 4128/4128 [08:33<00:00,  8.04it/s]


0,1
f1 score,▁

0,1
f1 score,0.86983


In [6]:
ports = {
    "9000": "quality",
    "9001": "price",
    "9002": "design",
    "9003": "service",
    # "9004": "brand"
}

async def request(text: str, port: int):
    async with aiohttp.ClientSession() as sess:
        response = await sess.post(
            url=f'http://192.168.0.93:{port}/category_clf_sentence',
            json={
                "text": [text]
            }
        )
        return await response.json()

wandb.init(
    project="optimize-category-model",
    name="renew-quality-sentence"
)

text_quality = df_quality['text'].to_list()
label_quality = df_quality['품질'].to_list()

preds = list()
for text in tqdm(text_quality):
    result = await request(text, 9000)
    category = result.get('category')
    r = result.get('result')[0]
    if category == 'quality':
        preds.append(r)
            
from sklearn.metrics import f1_score
f1 = f1_score(label_quality, preds)

wandb.log(
    {"f1 score": f1}
)

wandb.finish()

 15%|█▌        | 1976/12992 [04:02<27:59,  6.56it/s]