In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

# Hugging Face Hub checkpoint used for multi-label emotion scoring.
model_name = "searle-j/kote_for_easygoing_people"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Device handed to the pipeline; the string 'mps' is used here rather than a GPU index.
device = 'mps'

# return_all_scores=True -> every input yields one {'label', 'score'} dict per label.
# function_to_apply='sigmoid' -> activation requested for the reported scores.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
    function_to_apply='sigmoid',
    return_all_scores=True,
)




In [14]:
import pandas as pd

# Load the training CSV; the preview below shows the columns used later in this
# notebook (notably 'origin_text' and 'types').
df = pd.read_csv('텍스트_윤리검증_train.csv')
df.head()

Unnamed: 0,conversation_idx,sentence_idx,origin_text,types,is_immoral,intensity,speaker
0,0,0,부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네,['CENSURE'],True,2.0,A
1,0,1,맞아. 사람들이 진짜 개념이없다,['CENSURE'],True,2.0,B
2,0,2,저렇게는 살지 말아야지,['CENSURE'],True,2.0,A
3,0,3,그러게 게으르고 멍청한 사람들은 맞아야해,['CENSURE'],True,2.0,B
4,1,0,인방 보는 남자는 거르는게 맞다,['CENSURE'],True,1.0,A


In [3]:
from pprint import pprint

# Score the first sentence as a smoke test. For a single input the pipeline wraps the
# per-label dicts in an outer list, so element 0 is the list of label/score dicts.
first_sentence = df['origin_text'][0]
result = pipe(first_sentence)
result[0]

[{'label': '불평/불만', 'score': 0.9518845081329346},
 {'label': '환영/호의', 'score': 0.0028544499073177576},
 {'label': '감동/감탄', 'score': 0.003995489329099655},
 {'label': '지긋지긋', 'score': 0.11973022669553757},
 {'label': '고마움', 'score': 0.0010144844418391585},
 {'label': '슬픔', 'score': 0.05571465566754341},
 {'label': '화남/분노', 'score': 0.6774910688400269},
 {'label': '존경', 'score': 0.00042167434003204107},
 {'label': '기대감', 'score': 0.004941482562571764},
 {'label': '우쭐댐/무시함', 'score': 0.035073600709438324},
 {'label': '안타까움/실망', 'score': 0.8867160677909851},
 {'label': '비장함', 'score': 0.0018034239765256643},
 {'label': '의심/불신', 'score': 0.20897753536701202},
 {'label': '뿌듯함', 'score': 0.0011958081740885973},
 {'label': '편안/쾌적', 'score': 0.0005761024658568203},
 {'label': '신기함/관심', 'score': 0.01138535887002945},
 {'label': '아껴주는', 'score': 0.004767137113958597},
 {'label': '부끄러움', 'score': 0.04189775884151459},
 {'label': '공포/무서움', 'score': 0.0034784041345119476},
 {'label': '절망', 'score': 

In [4]:
# Collect the label names in the order the pipeline reported them for the sample sentence.
label_list = [entry['label'] for entry in result[0]]

In [5]:
# Display the collected label names for inspection.
label_list

['불평/불만',
 '환영/호의',
 '감동/감탄',
 '지긋지긋',
 '고마움',
 '슬픔',
 '화남/분노',
 '존경',
 '기대감',
 '우쭐댐/무시함',
 '안타까움/실망',
 '비장함',
 '의심/불신',
 '뿌듯함',
 '편안/쾌적',
 '신기함/관심',
 '아껴주는',
 '부끄러움',
 '공포/무서움',
 '절망',
 '한심함',
 '역겨움/징그러움',
 '짜증',
 '어이없음',
 '없음',
 '패배/자기혐오',
 '귀찮음',
 '힘듦/지침',
 '즐거움/신남',
 '깨달음',
 '죄책감',
 '증오/혐오',
 '흐뭇함(귀여움/예쁨)',
 '당황/난처',
 '경악',
 '부담/안_내킴',
 '서러움',
 '재미없음',
 '불쌍함/연민',
 '놀람',
 '행복',
 '불안/걱정',
 '기쁨',
 '안심/신뢰']

In [6]:
# Add a float column, initialised to 0.0, for every label the frame does not have yet.
new_labels = [name for name in label_list if name not in df.columns]
for name in new_labels:
    df[name] = 0.0

In [9]:
from tqdm import tqdm

# Score every sentence and store the per-label scores in the label columns.
# - Texts are iterated by value, so the loop does not depend on the frame's index labels.
# - Scores are matched to columns by label name rather than by list position.
# - Each column is written once at the end, which avoids one scalar df.loc write per
#   (row, label) pair.
score_rows = []
for text in tqdm(df['origin_text'], total=len(df)):
    result = pipe(text)  # single input -> [[{'label': ..., 'score': ...}, ...]]
    score_rows.append({entry['label']: entry['score'] for entry in result[0]})

for label in label_list:
    df[label] = [row[label] for row in score_rows]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 363154/363154 [1:37:54<00:00, 61.82it/s]  


In [10]:
# Persist the frame (original columns plus the emotion score columns) without the index.
df.to_csv('emotion_labeld_text.csv', index=False)

In [11]:
# Preview the frame after the emotion score columns have been filled.
df.head()

Unnamed: 0,conversation_idx,sentence_idx,origin_text,types,is_immoral,intensity,speaker,불평/불만,환영/호의,감동/감탄,...,경악,부담/안_내킴,서러움,재미없음,불쌍함/연민,놀람,행복,불안/걱정,기쁨,안심/신뢰
0,0,0,부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네,['CENSURE'],True,2.0,A,0.951885,0.002854,0.003995,...,0.147897,0.099539,0.044339,0.075104,0.056602,0.111031,0.001223,0.082044,0.003539,0.00173
1,0,1,맞아. 사람들이 진짜 개념이없다,['CENSURE'],True,2.0,B,0.962885,0.002928,0.006041,...,0.317529,0.246773,0.08468,0.110543,0.123408,0.068188,0.000862,0.281097,0.00155,0.003915
2,0,2,저렇게는 살지 말아야지,['CENSURE'],True,2.0,A,0.848221,0.001995,0.008851,...,0.647499,0.172025,0.288276,0.046891,0.5084,0.106782,0.002143,0.513087,0.002387,0.002772
3,0,3,그러게 게으르고 멍청한 사람들은 맞아야해,['CENSURE'],True,2.0,B,0.787709,0.00378,0.006107,...,0.170331,0.094054,0.026338,0.103005,0.314593,0.013974,0.001491,0.062749,0.002502,0.004004
4,1,0,인방 보는 남자는 거르는게 맞다,['CENSURE'],True,1.0,A,0.749683,0.014485,0.013222,...,0.319111,0.319428,0.022047,0.219026,0.183416,0.051595,0.00454,0.194536,0.009818,0.017963


In [15]:
# Load the validation CSV into its own frame.
valid = pd.read_csv("텍스트_윤리검증_valid.csv")

In [16]:
# Add a float column, initialised to 0.0, for every label the frame does not have yet.
for label in label_list:
    if label not in valid.columns:
        valid[label] = 0.0

# Score every sentence and store the per-label scores in the label columns.
# - Texts are iterated by value, so the loop does not depend on the frame's index labels.
# - Scores are matched to columns by label name rather than by list position.
# - Each column is written once at the end, which avoids one scalar valid.loc write per
#   (row, label) pair.
valid_score_rows = []
for text in tqdm(valid['origin_text'], total=len(valid)):
    result = pipe(text)  # single input -> [[{'label': ..., 'score': ...}, ...]]
    valid_score_rows.append({entry['label']: entry['score'] for entry in result[0]})

for label in label_list:
    valid[label] = [row[label] for row in valid_score_rows]

100%|██████████| 45215/45215 [14:17<00:00, 52.72it/s]


In [17]:
# Persist the scored validation frame without the index.
valid.to_csv('emotion_labeld_valid.csv', index=False)

In [11]:
import pandas as pd
# Reload the CSV written earlier in this notebook by df.to_csv('emotion_labeld_text.csv', ...).
# After the CSV round trip the 'types' column holds strings such as "['CENSURE']";
# a later cell parses them back into lists.
df = pd.read_csv('emotion_labeld_text.csv')

In [12]:
# Category names that each get their own indicator column in the cells below.
type_label = ["DISCRIMINATION","HATE","CENSURE","VIOLENCE","CRIME","SEXUAL","ABUSE"]

In [13]:
# Initialise one indicator column per type label, all rows starting at 0.0.
for column_name in type_label:
    df[column_name] = 0.0
    

In [14]:
import ast


def _parse_types(raw):
    """Turn one 'types' cell into a list of type-name strings.

    Accepts the string form read back from CSV (e.g. "['CENSURE', 'HATE']") and
    returns ['CENSURE', 'HATE']. A value that is already a list is returned unchanged,
    so re-running the cell is harmless. An empty list literal yields [] rather than
    [''], which would otherwise create a column named '' in the one-hot step.
    """
    if isinstance(raw, list):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. unquoted names): fall back to stripping brackets,
        # quotes and spaces, then splitting on commas.
        stripped = raw.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
        return [name for name in stripped.split(",") if name]
    if isinstance(parsed, str):
        # A lone quoted name rather than a list of names.
        parsed = [parsed]
    return [str(name).strip() for name in parsed]


df['types'] = df['types'].apply(_parse_types)

In [16]:
# One-hot encode the parsed type lists: every type name that occurs in df['types'] gets a
# column holding 1.0 on rows whose list contains it and 0.0 on all other rows.
# Names that have no pre-initialised column still get a column of their own here.
# Whole columns are built at once instead of one df.loc write per row, and the loop
# variable no longer shadows the builtin `type`.
observed_types = sorted({name for names in df['types'] for name in names})
for type_name in observed_types:
    df[type_name] = [1.0 if type_name in names else 0.0 for names in df['types']]

In [18]:
# Drop the parsed list column and, when present, the 'IMMORAL_NONE' indicator column.
# errors='ignore' keeps the cell re-runnable and tolerant of data in which that type
# never occurs (the original raised KeyError in both situations).
df = df.drop(columns=['types', 'IMMORAL_NONE'], errors='ignore')

In [22]:
# NOTE(review): this writes to the same path that this stage read its input from,
# replacing the version that still contained the 'types' column. Re-running the stage
# from the read_csv cell will then no longer find 'types'; consider a separate output name.
df.to_csv('emotion_labeld_text.csv', index=False)