In [1]:
# Copyright 2020 HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3

import csv
import os

import datasets

# BibTeX entry for the KOTE user guide; passed to `datasets.DatasetInfo(citation=...)`.
_CITATION = """\
@article{jeon2022user,
    title={User Guide for KOTE: Korean Online Comments Emotions Dataset},
    author={Jeon, Duyoung and Lee, Junho and Kim, Cheongtag},
    journal={arXiv preprint arXiv:2205.05300},
    year={2022}
}
"""

# Short human-readable summary; passed to `datasets.DatasetInfo(description=...)`.
_DESCRIPTION = """\
50k Korean online comments labeled for 44 emotion categories.
"""

# Project home page reported in the dataset info.
_HOMEPAGE = "https://github.com/searle-j/KOTE"

# License string reported in the dataset info.
_LICENSE = "MIT License"

# Base URL of the raw data files; it ends with "/" so file names can be appended directly.
_BASE_URL = "https://raw.githubusercontent.com/searle-j/KOTE/main/"

# The 44 emotion label names handed to `datasets.ClassLabel(names=...)`.
# The data files store labels as integers, so the order of this list matters
# (presumably position == class id in the TSV files; verify against the data).
_LABELS = [
'불평/불만',
'환영/호의',
'감동/감탄',
'지긋지긋',
'고마움',
'슬픔',
'화남/분노',
'존경',
'기대감',
'우쭐댐/무시함',
'안타까움/실망',
'비장함',
'의심/불신',
'뿌듯함',
'편안/쾌적',
'신기함/관심',
'아껴주는',
'부끄러움',
'공포/무서움',
'절망',
'한심함',
'역겨움/징그러움',
'짜증',
'어이없음',
'없음',
'패배/자기혐오',
'귀찮음',
'힘듦/지침',
'즐거움/신남',
'깨달음',
'죄책감',
'증오/혐오',
'흐뭇함(귀여움/예쁨)',
'당황/난처',
'경악',
'부담/안_내킴',
'서러움',
'재미없음',
'불쌍함/연민',
'놀람',
'행복',
'불안/걱정',
'기쁨',
'안심/신뢰'
]

class KOTEConfig(datasets.BuilderConfig):
    """Builder configuration for KOTE that also exposes the feature schema."""

    @property
    def features(self):
        """Return the column-name -> feature-type mapping for this configuration.

        Only the "dichotomized" configuration defines a schema; for any other
        configuration name the property evaluates to ``None``.  The key order
        (ID, text, labels) is significant because the builder reuses it as the
        column order of the TSV files.
        """
        if self.name != "dichotomized":
            return None

        label_feature = datasets.ClassLabel(names=_LABELS)
        schema = {}
        schema["ID"] = datasets.Value("string")
        schema["text"] = datasets.Value("string")
        schema["labels"] = datasets.Sequence(label_feature)
        return schema

class KOTE(datasets.GeneratorBasedBuilder):
    """Dataset builder for KOTE (Korean Online Comments Emotions Dataset).

    Downloads the train/test/val TSV files from ``_BASE_URL`` and yields one
    example per row with the columns defined by ``KOTEConfig.features``.
    """

    BUILDER_CONFIGS = [KOTEConfig(name="dichotomized")]
    BUILDER_CONFIG_CLASS = KOTEConfig
    DEFAULT_CONFIG_NAME = "dichotomized"

    def _info(self):
        """Return the dataset metadata (description, schema, homepage, license, citation)."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(self.config.features),
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three TSV files and describe the train/test/validation splits.

        Args:
            dl_manager: the download manager supplied by ``datasets``.

        Returns:
            A list of ``datasets.SplitGenerator`` for the "dichotomized"
            configuration; ``None`` for any other configuration name.
        """
        if self.config.name == "dichotomized":
            # Build URLs by plain concatenation: os.path.join uses the host OS
            # path separator and is not meant for URLs. _BASE_URL ends with "/".
            urls = {stem: _BASE_URL + f"{stem}.tsv" for stem in ("train", "test", "val")}
            paths = dl_manager.download_and_extract(urls)
            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": [paths["train"]]}),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": [paths["test"]]}),
                datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": [paths["val"]]}),
            ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs read from the given TSV files.

        Args:
            filepaths: list of local paths to header-less, tab-separated files
                whose columns follow the order of ``self.config.features``.

        Yields:
            ``(key, row)`` where ``row`` maps column names to values and
            ``row["labels"]`` is a list of ints parsed from the comma-separated
            label column.
        """
        if self.config.name == "dichotomized":
            fieldnames = list(self.config.features.keys())
            # One running counter across all files keeps keys unique even when
            # a split is made of several files.
            key = 0
            for filepath in filepaths:
                # newline="" lets the csv module handle line endings itself.
                with open(filepath, mode="r", encoding="utf-8", newline="") as f:
                    reader = csv.DictReader(f, delimiter="\t", fieldnames=fieldnames)
                    for row in reader:
                        row["labels"] = [int(lab) for lab in row["labels"].split(",")]
                        yield key, row
                        key += 1

In [9]:
pip install datasets


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
from datasets import load_dataset

# Load the KOTE dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True executes the loading script shipped in the
# "searle-j/kote" repository on this machine - only keep it for repositories you trust.
dataset = load_dataset("searle-j/kote", trust_remote_code=True)

# Print a summary of the loaded dataset (splits, features, row counts).
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['ID', 'text', 'labels'],
        num_rows: 5000
    })
})


In [23]:
# Write each split to its own CSV file (index column omitted).
split_exports = (
    ("train", "kote_train.csv"),
    ("test", "kote_test.csv"),
    ("validation", "kote_validation.csv"),
)
for split_name, csv_name in split_exports:
    split_frame = dataset[split_name].to_pandas()
    split_frame.to_csv(csv_name, index=False)


In [1]:
import pytorch_lightning as pl
import torch.nn as nn
from transformers import ElectraModel, AutoTokenizer
import torch

# The 44 emotion label names. They are zipped position-by-position with the
# model's output scores further down, so the order must match the classifier head.
LABELS = ['불평/불만',
 '환영/호의',
 '감동/감탄',
 '지긋지긋',
 '고마움',
 '슬픔',
 '화남/분노',
 '존경',
 '기대감',
 '우쭐댐/무시함',
 '안타까움/실망',
 '비장함',
 '의심/불신',
 '뿌듯함',
 '편안/쾌적',
 '신기함/관심',
 '아껴주는',
 '부끄러움',
 '공포/무서움',
 '절망',
 '한심함',
 '역겨움/징그러움',
 '짜증',
 '어이없음',
 '없음',
 '패배/자기혐오',
 '귀찮음',
 '힘듦/지침',
 '즐거움/신남',
 '깨달음',
 '죄책감',
 '증오/혐오',
 '흐뭇함(귀여움/예쁨)',
 '당황/난처',
 '경악',
 '부담/안_내킴',
 '서러움',
 '재미없음',
 '불쌍함/연민',
 '놀람',
 '행복',
 '불안/걱정',
 '기쁨',
 '안심/신뢰']

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

class KOTEtagger(pl.LightningModule):
    """KcELECTRA encoder followed by a linear head that scores text against 44 emotion labels."""

    def __init__(self):
        """Load the pretrained encoder and tokenizer and create the 44-way linear head."""
        super().__init__()
        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", revision='v2021').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base", revision='v2021')
        self.classifier = nn.Linear(self.electra.config.hidden_size, 44).to(device)

    def forward(self, text:str):
        """Tokenize ``text`` and return sigmoid scores for every label.

        Args:
            text: the raw text to score.

        Returns:
            A tensor of sigmoid scores with a leading batch dimension; callers
            index ``[0]`` to get the 44 scores of a single text.
        """
        # Place the inputs on whatever device the weights live on instead of
        # depending on a module-level global at call time.
        target_device = self.classifier.weight.device
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        # truncation=True makes max_length a real upper bound; without it,
        # texts longer than 512 tokens are passed to the encoder at full length.
        encoding = self.tokenizer(
          text,
          add_special_tokens=True,
          max_length=512,
          truncation=True,
          return_token_type_ids=False,
          padding="max_length",
          return_attention_mask=True,
          return_tensors='pt',
        ).to(target_device)
        output = self.electra(encoding["input_ids"], attention_mask=encoding["attention_mask"])
        # Use the hidden state of the first token as the sequence representation.
        output = output.last_hidden_state[:,0,:]
        output = self.classifier(output)
        # Independent sigmoid per label (multi-label scoring).
        # The per-call torch.cuda.empty_cache() was dropped: it does not change
        # the result and only forces the allocator to re-request memory each call.
        return torch.sigmoid(output)

# Initialise the model.
trained_model = KOTEtagger()

# Load the fine-tuned weights. weights_only=True restricts unpickling to tensors
# and plain containers, which is what a state dict needs (and avoids the
# torch.load FutureWarning).
state_dict = torch.load(
    r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin",
    map_location=device,
    weights_only=True,
)
# strict=False tolerates key mismatches; report them so that a head or encoder
# left at its initial weights does not go unnoticed.
load_result = trained_model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"[load_state_dict] missing keys: {load_result.missing_keys}")
    print(f"[load_state_dict] unexpected keys: {load_result.unexpected_keys}")

# Inference only: switch to eval mode and skip gradient tracking.
trained_model.eval()
with torch.no_grad():
    preds = trained_model(
        """인셉션은 대단하다고 느꼈는데, 인터스텔라는 경이롭다고 느껴진다다"""
    )[0]

# Print every label whose score exceeds 0.4.
for label, score in zip(LABELS, preds):
    if score > 0.4:
        print(f"{label}: {score}")

  state_dict = torch.load(r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin", map_location=device)


감동/감탄: 0.9351656436920166
존경: 0.7400605082511902
신기함/관심: 0.5662599802017212
깨달음: 0.49894675612449646
놀람: 0.49727797508239746


In [None]:
import pytorch_lightning as pl
import torch.nn as nn
from transformers import ElectraModel, AutoTokenizer
import torch
import pandas as pd

# The 44 emotion label names. Their count sizes the classifier head and they are
# zipped position-by-position with the model's output scores, so order matters.
LABELS = [
    '불평/불만', '환영/호의', '감동/감탄', '지긋지긋', '고마움', '슬픔', '화남/분노', '존경', '기대감', '우쭐댐/무시함',
    '안타까움/실망', '비장함', '의심/불신', '뿌듯함', '편안/쾌적', '신기함/관심', '아껴주는', '부끄러움', '공포/무서움', '절망',
    '한심함', '역겨움/징그러움', '짜증', '어이없음', '없음', '패배/자기혐오', '귀찮음', '힘듦/지침', '즐거움/신남', '깨달음',
    '죄책감', '증오/혐오', '흐뭇함(귀여움/예쁨)', '당황/난처', '경악', '부담/안_내킴', '서러움', '재미없음', '불쌍함/연민',
    '놀람', '행복', '불안/걱정', '기쁨', '안심/신뢰'
]

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

class KOTEtagger(pl.LightningModule):
    """KcELECTRA encoder followed by a linear head with one output per entry of LABELS."""

    def __init__(self):
        """Load the pretrained encoder and tokenizer and create the linear head."""
        super().__init__()
        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", revision='v2021').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base", revision='v2021')
        self.classifier = nn.Linear(self.electra.config.hidden_size, len(LABELS)).to(device)

    def forward(self, text: str):
        """Tokenize ``text`` and return sigmoid scores for every label.

        Args:
            text: the raw text to score.

        Returns:
            A tensor of sigmoid scores with a leading batch dimension; callers
            index ``[0]`` to get the scores of a single text.
        """
        # Place the inputs on whatever device the weights live on instead of
        # depending on a module-level global at call time.
        target_device = self.classifier.weight.device
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        # truncation=True makes max_length a real upper bound; without it,
        # reviews longer than 512 tokens are passed to the encoder at full length.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        ).to(target_device)

        output = self.electra(encoding["input_ids"], attention_mask=encoding["attention_mask"])
        # Use the hidden state of the first token as the sequence representation.
        output = output.last_hidden_state[:, 0, :]
        output = self.classifier(output)
        # Independent sigmoid per label (multi-label scoring).
        return torch.sigmoid(output)

# Initialise the model.
trained_model = KOTEtagger()

# Load the fine-tuned weights. weights_only=True restricts unpickling to tensors
# and plain containers (and avoids the torch.load FutureWarning).
state_dict = torch.load(
    r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin",
    map_location=device,
    weights_only=True,
)
# strict=False tolerates key mismatches; report them instead of hiding them.
load_result = trained_model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"[load_state_dict] missing keys: {load_result.missing_keys}")
    print(f"[load_state_dict] unexpected keys: {load_result.unexpected_keys}")
trained_model.eval()  # inference only

# Load the Excel sheet with the reviews.
data_path = r"C:/Users/Playdata/Desktop/Final/model/calculation/1stcalculationtest.xlsx"  # input file path
df = pd.read_excel(data_path)

# The first column header is the movie title; its cells are the reviews.
movie_title = df.columns[0]
reviews = df[movie_title].dropna().tolist()[:150]  # at most 150 reviews

# Run the emotion analysis.
results = []
with torch.no_grad():  # no gradients are needed for scoring
    for review in reviews:
        # str() guards against non-text cells (e.g. numbers) coming from Excel.
        preds = trained_model(str(review))[0].cpu().numpy()
        result = {"리뷰": review}

        # Keep only labels scoring at least 0.4, then the 10 highest of those.
        filtered_emotions = [(label, score) for label, score in zip(LABELS, preds) if score >= 0.4]
        top_10_emotions = sorted(filtered_emotions, key=lambda x: x[1], reverse=True)[:10]

        for label, score in top_10_emotions:
            result[label] = score

        results.append(result)

# Collect the per-review results into a DataFrame and save it as an Excel file.
result_df = pd.DataFrame(results)
output_path = "C:/Users/Playdata/Desktop/Final/model/calculation/emotion_analysis.xlsx"
result_df.to_excel(output_path, index=False)

print(f"감정 분석 결과가 저장되었습니다: {output_path}")


  state_dict = torch.load(r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin", map_location=device)


감정 분석 결과가 저장되었습니다: C:/Users/Playdata/Desktop/Final/model/KOTE/emotion_analysis.xlsx


In [1]:
import pytorch_lightning as pl
import torch.nn as nn
from transformers import ElectraModel, AutoTokenizer
import torch
import pandas as pd
import numpy as np

# The 44 emotion label names; their count sizes the classifier head below.
LABELS = [
    '불평/불만', '환영/호의', '감동/감탄', '지긋지긋', '고마움', '슬픔', '화남/분노', '존경', '기대감', '우쭐댐/무시함',
    '안타까움/실망', '비장함', '의심/불신', '뿌듯함', '편안/쾌적', '신기함/관심', '아껴주는', '부끄러움', '공포/무서움', '절망',
    '한심함', '역겨움/징그러움', '짜증', '어이없음', '없음', '패배/자기혐오', '귀찮음', '힘듦/지침', '즐거움/신남', '깨달음',
    '죄책감', '증오/혐오', '흐뭇함(귀여움/예쁨)', '당황/난처', '경악', '부담/안_내킴', '서러움', '재미없음', '불쌍함/연민',
    '놀람', '행복', '불안/걱정', '기쁨', '안심/신뢰'
]

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

class KOTEtagger(pl.LightningModule):
    """KcELECTRA encoder followed by a linear head with one output per entry of LABELS."""

    def __init__(self):
        """Load the pretrained encoder and tokenizer and create the linear head."""
        super().__init__()
        self.electra = ElectraModel.from_pretrained("beomi/KcELECTRA-base", revision='v2021').to(device)
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base", revision='v2021')
        self.classifier = nn.Linear(self.electra.config.hidden_size, len(LABELS)).to(device)

    def forward(self, text: str):
        """Tokenize ``text`` and return sigmoid scores for every label.

        Args:
            text: the raw text to score.

        Returns:
            A tensor of sigmoid scores with a leading batch dimension; callers
            index ``[0]`` to get the scores of a single text.
        """
        # Place the inputs on whatever device the weights live on instead of
        # depending on a module-level global at call time.
        target_device = self.classifier.weight.device
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        # truncation=True makes max_length a real upper bound; without it,
        # reviews longer than 512 tokens are passed to the encoder at full length.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=512,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
        ).to(target_device)

        output = self.electra(encoding["input_ids"], attention_mask=encoding["attention_mask"])
        # Use the hidden state of the first token as the sequence representation.
        output = output.last_hidden_state[:, 0, :]
        output = self.classifier(output)
        # Independent sigmoid per label (multi-label scoring).
        return torch.sigmoid(output)

# Initialise the model and load the fine-tuned weights.
trained_model = KOTEtagger()
# weights_only=True restricts unpickling to tensors and plain containers
# (and avoids the torch.load FutureWarning).
state_dict = torch.load(
    r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin",
    map_location=device,
    weights_only=True,
)
# strict=False tolerates key mismatches; report them instead of hiding them.
load_result = trained_model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(f"[load_state_dict] missing keys: {load_result.missing_keys}")
    print(f"[load_state_dict] unexpected keys: {load_result.unexpected_keys}")
trained_model.eval()  # switch to evaluation mode

# Load the Excel sheet: each column header is a movie title and the cells
# below it hold that movie's reviews.
data_path = r"C:/Users/Playdata/Desktop/Final/model/calculation/1stcalculationtest.xlsx"
df = pd.read_excel(data_path)

emotion_threshold = 0.4


def _emotion_point(scores, threshold):
    """Collapse one review's label scores into a single emotion point.

    If exactly one score reaches ``threshold`` that score is returned as is;
    otherwise (two or more reach it, or none do) the result is the mean of the
    two highest scores overall.
    """
    top_two = np.sort(scores)[-2:]  # ascending, so the maximum is last
    if int((scores >= threshold).sum()) == 1:
        return top_two[-1]
    # The model emits one score per label (44), so two values always exist.
    return top_two.sum() / 2.0


# Add one "<movie>_emotion_point" column per movie column.
# Inference runs under torch.no_grad() because no gradients are needed.
with torch.no_grad():
    # Snapshot the column list: new columns are added to df inside the loop.
    for movie in list(df.columns):
        emotion_points = []
        for review in df[movie]:
            if pd.isna(review):
                # Keep missing reviews as NaN so rows stay aligned.
                emotion_points.append(np.nan)
                continue
            # str() guards against non-text cells (e.g. numbers) coming from Excel.
            preds = trained_model(str(review))[0].cpu().numpy()
            emotion_points.append(_emotion_point(preds, emotion_threshold))
        new_col_name = f"{movie}_emotion_point"
        df[new_col_name] = emotion_points

# Save the result to a new Excel file.
output_path = r"C:/Users/Playdata/Desktop/Final/model/calculation/emotion_analysis.xlsx"
df.to_excel(output_path, index=False)
print(f"감정 분석 결과가 저장되었습니다: {output_path}")


  state_dict = torch.load(r"C:/Users/Playdata/Desktop/Final/model/KOTE/kote_pytorch_lightning.bin", map_location=device)


감정 분석 결과가 저장되었습니다: C:/Users/Playdata/Desktop/Final/model/calculation/emotion_analysis.xlsx
