In [1]:
# Colab only: mount Google Drive so the data and checkpoint paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [65]:
# Folder on the mounted Drive that holds train.csv and test.csv.
PATH =  '/content/drive/MyDrive/AIConnect/NLP_classificaiton/data'

# Load both splits as UTF-8 CSVs; `train` and `test` are reused (and mutated) by later cells.
train = pd.read_csv(os.path.join(PATH, 'train.csv'), encoding='utf-8')
test = pd.read_csv(os.path.join(PATH, 'test.csv'), encoding='utf-8')


# Modeling

In [5]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 36.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In [37]:
import os
import random
from tqdm import tqdm

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer

In [None]:
# Install the Weights & Biases client and log this runtime in so training metrics can be reported.
!pip install wandb
!wandb login

In [7]:
import wandb
from transformers import TrainingArguments, Trainer

# Start a Weights & Biases run under the given project and team entity.
wandb.init(project="4week_NLP", entity="team-5")

[34m[1mwandb[0m: Currently logged in as: [33mtjkim[0m (use `wandb login --relogin` to force relogin)


In [9]:
# Fix every RNG seed so that runs are repeatable.
def seed_everything(seed:int = 1004):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Value applied to `random`, `numpy.random`, `PYTHONHASHSEED`,
            and the PyTorch CPU / CUDA generators.

    Returns:
        None. Only global RNG state, one environment variable and two cuDNN
        backend flags are modified.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    # benchmark=True lets cuDNN auto-tune and possibly select different kernels
    # from run to run, which undermines the deterministic flag above.
    torch.backends.cudnn.benchmark = False  # type: ignore

# Seed all RNGs once for this notebook run.
seed_everything(2022)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


# Load Tokenizer, Model
Hugging Face Hub에 존재하는 Pretrained Tokenizer와 Model 및 Model Config 불러오기

이 때, Classification은 num_labels가 2로 Default되어있기 때문에 Model Config의 num_labels Parameter를 3(none, others, gender)으로 변경

In [15]:
bias_model = 'beomi/beep-KcELECTRA-base-bias' # The tokenizer normally uses the same id, so changing this one string swaps both model and tokenizer.
hate_model = 'beomi/beep-KcELECTRA-base-hate'

In [28]:
# Load the tokenizer, config and classification model for the bias task from the same hub id.
MODEL_NAME = bias_model

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 3 # three bias classes: none, others, gender (the mapping used by label_to_num)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Dump the architecture and the effective config for inspection (long output).
print(model)
print(config)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

# Tokenizing

In [29]:
# Original note: training did not work unless the target column was named "label", so bias is renamed to label.
# NOTE(review): this mutates `train` in place; the hate section later renames the columns again, so cell order matters.
train.rename(columns = {'bias':'label'},inplace=True)

In [30]:
train.head(3)

Unnamed: 0,title,comment,label,hate
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none
1,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만...",공효진 발연기나이질생각이읍던데 왜계속주연일까,none,hate
2,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다""""""",누구처럼 돈만 밝히는 저급인생은 살아가지마시길~~ 행복은 머니순이 아니니깐 작은거에...,others,hate


In [31]:
# Hold out 10% of the rows for validation, stratified on the label column.
# random_state pins the split explicitly (same value that is passed to seed_everything)
# instead of relying on the global NumPy RNG, whose state depends on which cells ran before.
train_dataset, eval_dataset = train_test_split(
    train,
    test_size=0.1,
    shuffle=True,
    stratify=train['label'],
    random_state=2022,
)

# Encode (title, comment) as a sentence pair; each call pads to its own longest
# example and truncates at max_length.
tokenized_train = tokenizer(
    list(train_dataset['title']),
    list(train_dataset['comment']),
    return_tensors="pt",
    max_length=256, # Author's note: observed max length is 138 and tokenizing shortens it, so 128 would probably also work.
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_eval = tokenizer(
    list(eval_dataset['title']),
    list(eval_dataset['comment']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# Sanity check: show the first encoded example and its decoded text.
print(tokenized_train['input_ids'][0])
print(tokenizer.decode(tokenized_train['input_ids'][0]))

tensor([    2,     6,    61, 18398,    63,    11, 14734,  4172,  4178,  8018,
           11,  8929, 38390,  4215,  4151, 33984,  4166,    16, 11464,  8934,
         4027,  4529, 27064,  4628,  4424,  4276, 14734,  4331,    11, 12401,
           11,     6,     3, 17870,  2434,  8622, 17979, 12055,    18,  8426,
        12140, 35915,  8082,    18,  8022, 23845,  8485,  8004,    18,    18,
           18,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])
[CLS] " [ 종합 ]'조카면족하다'결혼14년차 김원희, 출산 진심고백→홍석천 조카와'갈등'" [SEP] 몸이 안 따라준거였네. 마음고생 심했겠다. 그냥 솔직하게 얘기하지... [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [32]:

class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-tokenized encodings with per-example labels.

    Args:
        pair_dataset: Mapping from field name to an indexable of tensors,
            one row per example (its `.items()` is iterated on every lookup).
        label: Sequence of labels in the same order as the encodings; its
            length defines the dataset length.
    """

    def __init__(self, pair_dataset, label):
        self.label = label
        self.pair_dataset = pair_dataset

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        # Copy each field's row so the returned tensors do not alias the stored encodings.
        item = {}
        for field, rows in self.pair_dataset.items():
            item[field] = rows[idx].clone().detach()
        item['label'] = torch.tensor(self.label[idx])
        return item

In [33]:

def label_to_num(label):
    """Convert bias label strings to class ids.

    Mapping: "none" -> 0, "others" -> 1, "gender" -> 2.

    Args:
        label: Iterable of label strings.

    Returns:
        A new list of ints in the same order as the input.

    Raises:
        KeyError: If a label is not one of the three known strings.
    """
    label_dict = {"none": 0, "others": 1, "gender": 2}
    return [label_dict[name] for name in label]


# Numeric targets for both splits; at this point train_dataset / eval_dataset are still DataFrames.
train_label = label_to_num(train_dataset['label'].values)
eval_label = label_to_num(eval_dataset['label'].values)

In [34]:
# Wrap the tokenized tensors and numeric labels for the Trainer.
# NOTE(review): this rebinds train_dataset / eval_dataset from DataFrames to BERTDataset
# objects, so the label-conversion cell cannot be re-run after this one without re-splitting.
train_dataset = BERTDataset(tokenized_train, train_label)
eval_dataset = BERTDataset(tokenized_eval, eval_label)

# Inspect the size and one item; index 7529 is hard-coded (the last row when the size is 7530).
print(train_dataset.__len__())
print(train_dataset.__getitem__(7529))
print(tokenizer.decode(train_dataset.__getitem__(7529)['input_ids'])) # Just checking what calling the dunder method returns.

7530
{'input_ids': tensor([    2,     6,    11, 16003,  4878,  4008,  4075,    11, 10019,  4192,
           16, 34264,  4230, 44225,  3192, 16003,  4169,  9181, 28816,  4058,
           61, 18398,    63,     6,     3,   519,  5751, 39853, 11061,   519,
         5751,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [38]:
def compute_metrics(pred):
  """Validation metrics callback passed to the Trainer.

  Args:
    pred: Object exposing `label_ids` (gold class ids) and `predictions`
      (per-class scores); the predicted class is the argmax over the last axis.

  Returns:
    Dict with 'accuracy' and 'f1_macro' (macro-averaged F1).
  """
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)

  # Accuracy is reported for reference; per the original note it is not part of the leaderboard score.
  acc = accuracy_score(labels, preds)
  f1_macro = f1_score(labels, preds, average='macro')
  return {
      'accuracy': acc,
      'f1_macro': f1_macro,
  }

In [39]:
# Training configuration for the bias model; evaluation and checkpointing both happen once per epoch.
training_ars = TrainingArguments(

    # Change this for every experiment: each checkpoint is written under this directory.
    output_dir='/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04',
    num_train_epochs=20,
    per_device_train_batch_size=32,
    save_total_limit=5, # Caps how many checkpoints stay on disk (they are large). NOTE(review): the original note said "top 5 by performance", but per the transformers docs this prunes older checkpoints rather than ranking by metric — confirm.
    save_strategy = 'epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end = True, # TODO(author): the original note says these arguments were not fully understood; revisit them for tuning.
    metric_for_best_model= 'f1_macro', # key returned by compute_metrics
    greater_is_better= True,
    fp16 = True,

)

trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Presumably stops after 5 consecutive evaluations without improvement in the tracked metric — confirm against the callback docs.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

Using amp half precision backend


In [40]:
# Fine-tune, then persist whatever weights `model` holds afterwards.
# NOTE(review): with load_best_model_at_end=True this is expected to be the best checkpoint — confirm.
trainer.train()
model.save_pretrained('/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/best_model')

***** Running training *****
  Num examples = 7530
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4720
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.154766,0.968937,0.955543
2,No log,0.108461,0.978495,0.973487
3,0.092600,0.113813,0.982079,0.976133
4,0.092600,0.148388,0.970131,0.95956
5,0.029700,0.157224,0.9773,0.968762
6,0.029700,0.167681,0.976105,0.965828
7,0.009400,0.15444,0.9773,0.970229
8,0.009400,0.173,0.978495,0.969782


***** Running Evaluation *****
  Num examples = 837
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-236
Configuration saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-236/config.json
Model weights saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-236/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-236/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-236/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 837
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-472
Configuration saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-472/c

In [97]:
# Reload the tokenizer and a saved checkpoint for bias inference.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Tokenizer_NAME = bias_model
tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

# Each checkpoint is saved under the output_dir configured for training.
# NOTE(review): checkpoint-1652 is hard-coded. The training log shown saves every 236 steps
# (checkpoint-236, checkpoint-472), so this is epoch 7, while the best logged f1_macro is at
# epoch 3. Confirm this is the intended checkpoint, or load the saved best_model directory.
MODEL_NAME = '/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/checkpoint-1652'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(tokenizer.vocab_size) # Original note: with load_best_model_at_end=True the in-memory model could simply be used instead.
model.to(device)

print(tokenizer)

loading file https://huggingface.co/beomi/beep-KcELECTRA-base-bias/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/69fdbef31e3e81dc248c7cf959d7c4e20bc3541ac6991ec073938a51e64151ea.a59cda3abc7fe9224f5b3344b4ac76b515bb2d86124f7ab6cfd6f7be710361c3
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-bias/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/63868885a8c2d98e7fdc35996d9382cea91058406924a6ae8db658bb5ac9b263.4952cacdcbbd2176992883f3375706d756af20e3e4d3337a2884539239fdf20c
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-bias/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-bias/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/640ba04c8ce4ee7851648b0c532ef666015efacb011e5a285cd5601228f6e5f6.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/beomi/beep-K

PreTrainedTokenizerFast(name_or_path='beomi/beep-KcELECTRA-base-bias', vocab_size=50135, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [98]:
# Placeholder label so the test rows fit BERTDataset; the inference loop never reads it.
test['label'] = 'none'

In [99]:
test.tail(5)

Unnamed: 0,ID,title,comment,label
506,506,"[N이슈] 최율, 조재현 성추행 의혹 폭로… 소속사 ""상황 파악 중""",얜 그냥 봐도 아니다 ㅋ 고소당하면 어마어마한 금액 물어줘야할껄?,none
507,507,"해투4' 이서진, 한지민 '대본 리딩 격리설' 해명…""날씨가 좋아서"" [SC컷]",대박 게스트... 꼭 봐야징~ 컨셉이 바뀌니깐 재미지넹,none
508,508,"[SS인터뷰①]박민영 ""'김비서' 행복했다..열애설엔 당당..미소였으니까""",성형으로 다 뜯어고쳐놓고 예쁜척. 성형 전 니 얼굴 다 알고있다. 순자처럼 된장냄새...,none
509,509,"[POP이슈]""사실무근"" 'SKY캐슬' 측 '위올라이' 표절설 부인→여전히 '핫'(종합)",분위기는 비슷하다만 전혀다른 전개던데 무슨ㅋㅋㄱ 우리나라사람들은 분위기만 비슷하면 ...,none
510,510,"오창석♥' 이채은, 웨딩사진?...순백의 드레스 입고 '활짝'",입에 손가릭이 10개 있으니 징그럽다,none


In [100]:
# Placeholder labels ('none' -> 0) for the test set.
test_label = label_to_num(test['label'].values)

# Same sentence-pair encoding as used for training: (title, comment).
tokenized_test = tokenizer(
    list(test['title']),
    list(test['comment']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

test_dataset = BERTDataset(tokenized_test, test_label)

# Inspect the size, one raw item (index 510) and one decoded item (index 6); indices are hard-coded.
print(test_dataset.__len__())
print(test_dataset.__getitem__(510))
print(tokenizer.decode(test_dataset.__getitem__(6)['input_ids']))

511
{'input_ids': tensor([    2,  2571,  4473,  4424, 27048,    11, 42252,  4192,    16, 31895,
        12869,    33,    18,    18,    18,  2155,  4529,  4041, 29372, 12634,
           11, 31971,    11,     3, 11010,  2111,  4050,  5222,  4012,  8229,
         4010,  8842, 23704,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask'

In [101]:
# Batched inference over the test set; shuffle=False keeps predictions in row order.
dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
output_pred, output_prob = [], []

for batch in tqdm(dataloader):
    # Only the three encoder inputs are sent to the device; the placeholder label is ignored.
    inputs = {
        name: batch[name].to(device)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    with torch.no_grad():
        logits = model(**inputs)[0]

    # Keep both the class probabilities and the argmax class id for every row of the batch.
    output_prob.append(F.softmax(logits, dim=-1).detach().cpu().numpy())
    output_pred.append(np.argmax(logits.detach().cpu().numpy(), axis=-1))

# Flatten the per-batch arrays into plain Python lists.
pred_answer = np.concatenate(output_pred).tolist()
output_prob = np.concatenate(output_prob, axis=0).tolist()
print(pred_answer)

100%|██████████| 16/16 [00:01<00:00, 10.29it/s]

[0, 0, 0, 1, 1, 1, 1, 0, 2, 0, 0, 2, 0, 1, 0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1, 2, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 2, 0, 0, 2, 2, 0, 1, 0, 0, 2, 0, 0, 0, 2, 1, 2, 0, 2, 0, 0, 1, 2, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 2, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 2, 0, 0, 1, 1, 2, 0, 0, 0, 1, 2, 2, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 2, 0, 0, 1, 2, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 




In [102]:
def num_to_label(label):
    """Turn predicted bias class ids into [row_index, label_name] pairs.

    Mapping: 0 -> "none", 1 -> "others", 2 -> "gender".

    Args:
        label: Iterable of integer class ids.

    Returns:
        List of two-element lists `[position, name]`, where position is the
        0-based index of the id in the input.

    Raises:
        KeyError: If an id is not 0, 1 or 2.
    """
    label_dict = {0: "none", 1: "others", 2: "gender"}
    return [[position, label_dict[class_id]] for position, class_id in enumerate(label)]

# Convert the predicted ids to [row_index, label] pairs and print them all (long output).
answer = num_to_label(pred_answer)
print(answer)

[[0, 'none'], [1, 'none'], [2, 'none'], [3, 'others'], [4, 'others'], [5, 'others'], [6, 'others'], [7, 'none'], [8, 'gender'], [9, 'none'], [10, 'none'], [11, 'gender'], [12, 'none'], [13, 'others'], [14, 'none'], [15, 'gender'], [16, 'none'], [17, 'gender'], [18, 'others'], [19, 'none'], [20, 'gender'], [21, 'none'], [22, 'others'], [23, 'none'], [24, 'none'], [25, 'none'], [26, 'gender'], [27, 'none'], [28, 'none'], [29, 'none'], [30, 'others'], [31, 'gender'], [32, 'none'], [33, 'others'], [34, 'none'], [35, 'gender'], [36, 'none'], [37, 'none'], [38, 'none'], [39, 'none'], [40, 'others'], [41, 'none'], [42, 'others'], [43, 'none'], [44, 'none'], [45, 'none'], [46, 'none'], [47, 'none'], [48, 'none'], [49, 'gender'], [50, 'others'], [51, 'none'], [52, 'others'], [53, 'gender'], [54, 'none'], [55, 'gender'], [56, 'gender'], [57, 'none'], [58, 'none'], [59, 'none'], [60, 'others'], [61, 'gender'], [62, 'none'], [63, 'gender'], [64, 'none'], [65, 'none'], [66, 'none'], [67, 'none'], [

In [103]:
# Build the bias submission frame; ID is the positional index produced by num_to_label.
# NOTE(review): assumes test['ID'] equals the row position (true in the preview shown: 0..510) — confirm.
df = pd.DataFrame(answer, columns=['ID', 'bias'])

df.to_csv('/content/drive/MyDrive/AIConnect/NLP_classificaiton/seperated_submit01.csv', index=False) # Change the file name on every run so earlier submissions are not overwritten.

print(df)

      ID    bias
0      0    none
1      1    none
2      2    none
3      3  others
4      4  others
..   ...     ...
506  506    none
507  507    none
508  508  others
509  509  others
510  510  others

[511 rows x 2 columns]


# bias 끝

# 이제 hate 모델을 만들어 보자
bias 와 똑같이 진행하되 label column 을 hate 로 바꿔주자. (원래는 Number of classes 를 2로 바꾸려 했으나, 오류가 나서 아래 코드에서는 num_labels = 3 을 유지하고 라벨만 0/1 로 매핑한다.)

In [104]:
# Original note: the tokenizer loaded earlier could probably be reused; it is reloaded here from the hate model id.
MODEL_NAME = hate_model  # MODEL_NAME still pointed at a local bias checkpoint from the test step, so reset it to the hub id.

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

config = AutoConfig.from_pretrained(MODEL_NAME)
config.num_labels = 3 # Original note: kept at 3 because an error occurred otherwise; the pretrained labels were none / hate / offensive. hate_to_num only ever emits ids 0 and 1.

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

# Dump the architecture and the effective config for inspection (long output).
print(model)
print(config)

loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/013e359d5e8b3c38b2b1f7016ddc0c5a6e82edb1e42d78aee92f070a0e775326.a59cda3abc7fe9224f5b3344b4ac76b515bb2d86124f7ab6cfd6f7be710361c3
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/97b99140595f8ed0b06385019dacdded5dd5aa062c53686325d3d466bee4aac5.4952cacdcbbd2176992883f3375706d756af20e3e4d3337a2884539239fdf20c
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/22b31305287f4dd412eda7755af5342128d3c2bf168efc682db44f2027d127d2.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/beomi/beep-K

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [105]:
train.head(1)

Unnamed: 0,title,comment,bias,label
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none


In [79]:
# Switch the training target from bias to hate: label -> bias, hate -> label
# (alternatively, reload the train CSV from scratch).
# Guarded so the cell is idempotent: after the first run there is no 'hate' column any more,
# and repeating the rename would turn the hate labels (now in 'label') into a second
# 'bias' column and leave the frame without a 'label' column.
if 'hate' in train.columns:
    train.rename(columns = {'label':'bias', 'hate':'label'},inplace=True)

In [80]:
train.head(10)

Unnamed: 0,title,comment,bias,label
0,"""'미스터 션샤인' 변요한, 김태리와 같은 양복 입고 학당 방문! 이유는?""",김태리 정말 연기잘해 진짜,none,none
1,"""[SC현장]""""극사실주의 현실♥""""…'가장 보통의 연애' 김래원X공효진, 16년만...",공효진 발연기나이질생각이읍던데 왜계속주연일까,none,hate
2,"""손연재, 리듬체조 학원 선생님 """"하고 싶은 일 해서 행복하다""""""",누구처럼 돈만 밝히는 저급인생은 살아가지마시길~~ 행복은 머니순이 아니니깐 작은거에...,others,hate
3,"""'섹션TV' 김해숙 """"'허스토리' 촬영 후 우울증 얻었다""""""",일본 축구 져라,none,none
4,"""[단독] 임현주 아나운서 “‘노브라 챌린지’ 방송 덕에 낸 용기, 자연스런 논의의...",난 절대로 임현주 욕하는인간이랑은 안논다 @.@,none,none
5,"""샤이니 온유, 클럽 강제추행 '무혐의' 처분 받았다""",만진건 변하지 않는다 아이돌은 아이돌 좋아하는 여자들한테 이미지가 생명인데 얜 바닥...,others,hate
6,"""[POP이슈]'프듀2' 김사무엘 父, 멕시코서 숨진 채 발견→타살 의혹 제기→애도...",연예계에 외국인노동자 많네..,others,hate
7,"""슈, 동안미모+아찔한 수영복 자태로 시선강탈 """"밥을 먹을 수가 없네""""""",아이는 대부분 엄마가 원해서낳고 독박육아하고남편은 그냥 따라고는 수준!싫은 내색도 ...,gender,hate
8,"""AOA 지민, 앙상한 몸매·건강이상설 직접 해명 """"건강합니다"""" [공식입장]""",먼기사를 기대하고 사진을 올리는지....관종,none,hate
9,"""""""이 정도면 신드롬""""..'연예인들의 연예인' 양준일, 이지혜→김이나→신현준도 ...",개그맨 김경민 닮은다고 나만 느낌?,none,none


In [81]:
# Hold out 10% of the rows for validation, stratified on the (hate) label column.
# random_state pins the split explicitly (same value that is passed to seed_everything)
# instead of relying on the global NumPy RNG, whose state depends on which cells ran before.
train_dataset, eval_dataset = train_test_split(
    train,
    test_size=0.1,
    shuffle=True,
    stratify=train['label'],
    random_state=2022,
)

# Encode (title, comment) as a sentence pair; each call pads to its own longest
# example and truncates at max_length.
tokenized_train = tokenizer(
    list(train_dataset['title']),
    list(train_dataset['comment']),
    return_tensors="pt",
    max_length=256, # Author's note: observed max length is 138.
    padding=True,
    truncation=True,
    add_special_tokens=True
)

tokenized_eval = tokenizer(
    list(eval_dataset['title']),
    list(eval_dataset['comment']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

# Sanity check: show the first encoded example and its decoded text.
print(tokenized_train['input_ids'][0])
print(tokenizer.decode(tokenized_train['input_ids'][0]))

tensor([    2,    61, 12081,  4231,  4231,    63,    11,  2836,  4194,  4486,
         9464,    11,  2571,  4782,  8254,    33,    18,    18,    26,  4107,
        28877,  3775,  4230,  4379, 10657,     3,  9092,  2836,  4194,  4486,
         8045, 13738, 12466,  4268,     3,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])
[CLS] [ 스타톡톡 ]'정소영 남편'오협 누구?.. 6세 연상 학구파 배우 [SEP] 일단 정소영부터 누군지 모르겠음 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [82]:
# Label conversion for the hate task (author's note: there is probably a cleaner way).
def hate_to_num(label):
    """Binarize hate label strings.

    Mapping: "none" -> 0; "hate" and "offensive" both -> 1.

    Args:
        label: Iterable of label strings.

    Returns:
        A new list of ints in the same order as the input.

    Raises:
        KeyError: If a label is not one of the three known strings.
    """
    label_dict = {"none": 0, "hate": 1, 'offensive': 1}
    return [label_dict[name] for name in label]


# Binary hate targets for both splits; train_dataset / eval_dataset are still DataFrames here.
train_label = hate_to_num(train_dataset['label'].values)
eval_label = hate_to_num(eval_dataset['label'].values)

In [83]:
# Wrap the tokenized tensors and binary labels for the Trainer.
# NOTE(review): this rebinds train_dataset / eval_dataset from DataFrames to BERTDataset
# objects, so the label-conversion cell cannot be re-run after this one without re-splitting.
train_dataset = BERTDataset(tokenized_train, train_label)
eval_dataset = BERTDataset(tokenized_eval, eval_label)

# Inspect the size and one item; index 7529 is hard-coded (the last row when the size is 7530).
print(train_dataset.__len__())
print(train_dataset.__getitem__(7529))
print(tokenizer.decode(train_dataset.__getitem__(7529)['input_ids']))

7530
{'input_ids': tensor([    2,     6, 25048,  3481, 33381,  4499,  4591,    16, 28719, 12433,
        27404, 25049,    18,    18, 25046,  3389,  4562,  4265, 25047,   335,
         4156,  4376,    16, 19412, 11751,   820, 19744,    12, 18398,    13,
            6,     3,  3001,  4336, 10460,  8340,  2434,  4675,  4095,     3,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,

In [84]:
# Training configuration for the hate model; identical to the bias run except for output_dir.
training_ars = TrainingArguments(

    # Change this for every experiment: each checkpoint is written under this directory.
    output_dir='/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate',
    num_train_epochs=20,
    per_device_train_batch_size=32,
    save_total_limit=5, # Caps how many checkpoints stay on disk (they are large). NOTE(review): the original note said "top 5 by performance", but per the transformers docs this prunes older checkpoints rather than ranking by metric — confirm.
    save_strategy = 'epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end = True, # TODO(author): the original note says these arguments were not fully understood; revisit them for tuning.
    metric_for_best_model= 'f1_macro', # key returned by compute_metrics
    greater_is_better= True,
    fp16 = True,

)

trainer = Trainer(
    model=model,
    args=training_ars,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Presumably stops after 5 consecutive evaluations without improvement in the tracked metric — confirm against the callback docs.
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend


In [85]:
# Fine-tune, then persist whatever weights `model` holds afterwards.
# NOTE(review): the recorded run was interrupted (KeyboardInterrupt in the output), in which
# case the save line would not have executed — confirm best_model exists before relying on it.
trainer.train()
model.save_pretrained('/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/best_model')

***** Running training *****
  Num examples = 7530
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 4720
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.123256,0.962963,0.962212
2,No log,0.164667,0.95221,0.951639
3,0.140900,0.204515,0.947431,0.946591
4,0.140900,0.30245,0.943847,0.942823
5,0.038100,0.315342,0.941458,0.94089


***** Running Evaluation *****
  Num examples = 837
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236
Configuration saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236/config.json
Model weights saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 837
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-472
Configuration saved in /content/drive/MyDrive/AIConnect/NLP_classificaiton/resul

KeyboardInterrupt: ignored

In [106]:
# Reload the tokenizer and a saved checkpoint for hate inference.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

Tokenizer_NAME = hate_model
tokenizer = AutoTokenizer.from_pretrained(Tokenizer_NAME)

# Each checkpoint is saved under the output_dir configured for training.
# checkpoint-236 is the first-epoch checkpoint, which has the highest f1_macro in the log shown.
MODEL_NAME = '/content/drive/MyDrive/AIConnect/NLP_classificaiton/result/kcelectra04/hate/checkpoint-236'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(tokenizer.vocab_size)
model.to(device)

print(tokenizer)

loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/013e359d5e8b3c38b2b1f7016ddc0c5a6e82edb1e42d78aee92f070a0e775326.a59cda3abc7fe9224f5b3344b4ac76b515bb2d86124f7ab6cfd6f7be710361c3
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/97b99140595f8ed0b06385019dacdded5dd5aa062c53686325d3d466bee4aac5.4952cacdcbbd2176992883f3375706d756af20e3e4d3337a2884539239fdf20c
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/beomi/beep-KcELECTRA-base-hate/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/22b31305287f4dd412eda7755af5342128d3c2bf168efc682db44f2027d127d2.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/beomi/beep-K

PreTrainedTokenizerFast(name_or_path='beomi/beep-KcELECTRA-base-hate', vocab_size=50135, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [107]:
test['label'] = 'none' # Create the label column with an arbitrary placeholder value.

In [108]:
test.tail(3)

Unnamed: 0,ID,title,comment,label
508,508,"[SS인터뷰①]박민영 ""'김비서' 행복했다..열애설엔 당당..미소였으니까""",성형으로 다 뜯어고쳐놓고 예쁜척. 성형 전 니 얼굴 다 알고있다. 순자처럼 된장냄새...,none
509,509,"[POP이슈]""사실무근"" 'SKY캐슬' 측 '위올라이' 표절설 부인→여전히 '핫'(종합)",분위기는 비슷하다만 전혀다른 전개던데 무슨ㅋㅋㄱ 우리나라사람들은 분위기만 비슷하면 ...,none
510,510,"오창석♥' 이채은, 웨딩사진?...순백의 드레스 입고 '활짝'",입에 손가릭이 10개 있으니 징그럽다,none


In [109]:
# Placeholder labels for the test set. This reuses the bias mapping (label_to_num), which is
# harmless here because every value is 'none' and both mappings send 'none' to 0.
test_label = label_to_num(test['label'].values)

# Same sentence-pair encoding as used for training: (title, comment).
tokenized_test = tokenizer(
    list(test['title']),
    list(test['comment']),
    return_tensors="pt",
    max_length=256,
    padding=True,
    truncation=True,
    add_special_tokens=True
)

test_dataset = BERTDataset(tokenized_test, test_label)

# Inspect the size, one raw item (index 510) and one decoded item (index 6); indices are hard-coded.
print(test_dataset.__len__())
print(test_dataset.__getitem__(510))
print(tokenizer.decode(test_dataset.__getitem__(6)['input_ids']))

511
{'input_ids': tensor([    2,  2571,  4473,  4424, 27048,    11, 42252,  4192,    16, 31895,
        12869,    33,    18,    18,    18,  2155,  4529,  4041, 29372, 12634,
           11, 31971,    11,     3, 11010,  2111,  4050,  5222,  4012,  8229,
         4010,  8842, 23704,     3,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask'

In [110]:
# Batched inference over the test set; shuffle=False keeps predictions in row order.
dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
output_pred, output_prob = [], []

for batch in tqdm(dataloader):
    # Only the three encoder inputs are sent to the device; the placeholder label is ignored.
    inputs = {
        name: batch[name].to(device)
        for name in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    with torch.no_grad():
        logits = model(**inputs)[0]

    # Keep both the class probabilities and the argmax class id for every row of the batch.
    output_prob.append(F.softmax(logits, dim=-1).detach().cpu().numpy())
    output_pred.append(np.argmax(logits.detach().cpu().numpy(), axis=-1))

# Flatten the per-batch arrays into plain Python lists.
pred_answer = np.concatenate(output_pred).tolist()
output_prob = np.concatenate(output_prob, axis=0).tolist()
print(pred_answer)

100%|██████████| 16/16 [00:01<00:00, 10.30it/s]

[0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 




In [91]:
print(output_prob) # These are per-class probabilities, so applying a decision threshold looks feasible.

[[0.9995812773704529, 0.0003982054768130183, 2.048896931228228e-05], [0.9994189739227295, 0.0005688293604180217, 1.2244440767972264e-05], [0.0001859772455645725, 0.9998078942298889, 6.1965461100044195e-06], [4.6412831579800695e-05, 0.9999257326126099, 2.781898911052849e-05], [0.0001295139518333599, 0.9998600482940674, 1.037426136463182e-05], [0.000117712355859112, 0.9998735189437866, 8.707782399142161e-06], [9.49262102949433e-05, 0.9998925924301147, 1.2460885045584291e-05], [0.0005477132508531213, 0.9994468092918396, 5.474116733239498e-06], [9.783659334061667e-05, 0.9998915195465088, 1.0587494216451887e-05], [0.999524712562561, 0.0004592344048433006, 1.6083748050732538e-05], [0.000938570883590728, 0.9990563988685608, 5.090661943540908e-06], [8.981704013422132e-05, 0.9998996257781982, 1.049213096848689e-05], [0.9995023012161255, 0.00048382760724052787, 1.383178368996596e-05], [5.286551095196046e-05, 0.9999244213104248, 2.2693477149005048e-05], [0.9995619654655457, 0.00042054959340021014

In [111]:
def num_to_hate(label):
    """Turn numeric class ids into ``[position, name]`` pairs.

    Args:
        label: Iterable of class ids; each must be 0 ("none") or 1 ("hate").

    Returns:
        A list holding one ``[position, name]`` list per input element, where
        ``position`` is the element's 0-based position within ``label``.

    Raises:
        KeyError: If an id other than 0 or 1 is encountered.
    """
    id_to_name = {0: "none", 1: "hate"}
    return [
        [position, id_to_name[class_id]]
        for position, class_id in enumerate(label)
    ]

# Map the numeric predictions in pred_answer to [index, label-name] pairs and show them.
answer = num_to_hate(pred_answer)
print(answer)

[[0, 'none'], [1, 'none'], [2, 'hate'], [3, 'hate'], [4, 'hate'], [5, 'hate'], [6, 'hate'], [7, 'hate'], [8, 'hate'], [9, 'none'], [10, 'hate'], [11, 'hate'], [12, 'none'], [13, 'hate'], [14, 'none'], [15, 'hate'], [16, 'hate'], [17, 'hate'], [18, 'none'], [19, 'none'], [20, 'hate'], [21, 'hate'], [22, 'hate'], [23, 'hate'], [24, 'hate'], [25, 'hate'], [26, 'hate'], [27, 'none'], [28, 'none'], [29, 'none'], [30, 'hate'], [31, 'hate'], [32, 'hate'], [33, 'hate'], [34, 'none'], [35, 'hate'], [36, 'hate'], [37, 'none'], [38, 'none'], [39, 'none'], [40, 'hate'], [41, 'hate'], [42, 'hate'], [43, 'none'], [44, 'none'], [45, 'hate'], [46, 'hate'], [47, 'hate'], [48, 'none'], [49, 'hate'], [50, 'hate'], [51, 'none'], [52, 'hate'], [53, 'hate'], [54, 'hate'], [55, 'hate'], [56, 'hate'], [57, 'none'], [58, 'none'], [59, 'hate'], [60, 'hate'], [61, 'none'], [62, 'none'], [63, 'hate'], [64, 'hate'], [65, 'hate'], [66, 'hate'], [67, 'hate'], [68, 'none'], [69, 'hate'], [70, 'none'], [71, 'hate'], [

In [112]:
# Build a two-column frame (ID, hate) from the [index, label-name] pairs in answer.
df_hate = pd.DataFrame(answer, columns=['ID', 'hate'])
df_hate.tail(3)  # peek at the last three rows

Unnamed: 0,ID,hate
508,508,hate
509,509,hate
510,510,hate


In [113]:
# Read the first-stage submission CSV back from Drive.
df = pd.read_csv('/content/drive/MyDrive/AIConnect/NLP_classificaiton/seperated_submit01.csv') # load the bias CSV file saved earlier
df.tail(3)

Unnamed: 0,ID,bias
508,508,others
509,509,others
510,510,others


In [114]:
# NOTE(review): assigning a Series aligns on the row index, so this presumably relies on
# df and df_hate sharing the same index / row order — confirm.
df['hate'] = df_hate['hate']  # create a 'hate' column in df and fill it with the hate labels
df.tail(3)

Unnamed: 0,ID,bias,hate
508,508,others,hate
509,509,others,hate
510,510,others,hate


In [115]:
# Write the final CSV file containing both the bias and hate values.
df.to_csv('/content/drive/MyDrive/AIConnect/NLP_classificaiton/seperated_submit02.csv', index=False) # remember to change the file name on every run

print(df)

      ID    bias  hate
0      0    none  none
1      1    none  none
2      2    none  hate
3      3  others  hate
4      4  others  hate
..   ...     ...   ...
506  506    none  hate
507  507    none  none
508  508  others  hate
509  509  others  hate
510  510  others  hate

[511 rows x 3 columns]


In [None]:
# Now go submit the file.