<a href="https://colab.research.google.com/github/Taewon-Park/Dacon/blob/main/%EA%B0%90%EC%A0%95_%EC%9D%B8%EC%8B%9D_(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [6]:
# Change the working directory to the competition data folder so that the
# relative file names used later in the notebook resolve against it.
import os
os.chdir('/content/drive/MyDrive/dacon/발화자의 감정인식/data/open')

In [None]:
!gdown https://drive.google.com/uc?=1-QNs8sk5X3u_1rK-dv5ESgQIt0ZDQSwY
!unzip -qq "./data/databffhw4rntmp"

In [10]:
# Requirements
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import random
import os

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from tqdm import tqdm
from transformers import BertTokenizer, RobertaModel, AutoModel
from transformers import BertModel, RobertaTokenizer, AutoTokenizer
from torch.optim import Adam

import matplotlib as mpl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

In [25]:
# Train Data Load
# Read the training table from the current working directory into `data`.
data = pd.read_csv("train.csv")

In [57]:
# Preprocessing
# Use the first CUDA device when one is available, otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Turn off tokenizer-internal parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Global training hyper-parameters.
CFG = dict(
  EPOCHS=50,
  LEARNING_RATE=1e-5,
  BATCH_SIZE=2,
  SEED=42,
)


def seed_everything(seed):
  """Seed every random number generator used by the notebook.

  Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
  exports PYTHONHASHSEED, and configures cuDNN for reproducible runs.

  Args:
    seed: integer seed shared by all generators.
  """
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  # Also seed every other visible GPU, not only the current one.
  torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True
  # The cuDNN autotuner may pick different kernels from run to run, which
  # defeats the deterministic flag above, so it must stay disabled.
  torch.backends.cudnn.benchmark = False


def Encoding(data):
  """Label-encode the 'Target' column of `data` in place.

  Args:
    data: DataFrame with a 'Target' column.

  Returns:
    The encoded 'Target' column of `data`.
  """
  data['Target'] = LabelEncoder().fit_transform(data['Target'])
  return data['Target']

def Class_Weights(data):
  """Compute inverse-frequency class weights from `data['Target']`.

  Each weight is 1 / count, rescaled so that the smallest weight (the most
  frequent class) equals 1. Weights are ordered by ascending class label.

  Args:
    data: DataFrame with a 'Target' column.

  Returns:
    A float tensor of per-class weights placed on the global `device`.
  """
  counts_by_label = data['Target'].value_counts().sort_index()
  inverse_freq = 1. / counts_by_label
  normalised = inverse_freq / inverse_freq.min()
  return torch.FloatTensor(normalised.tolist()).to(device)

def Strat_Split(nsplit, data, valid_fold=1):
  """Assign stratified fold ids and split into train / validation frames.

  A 'fold' column is added to `data` in place, the annotated table is written
  to 'train_fold.csv', and the file is read back before splitting.

  Args:
    nsplit: number of stratified folds.
    data: DataFrame with a 'Target' column; modified in place.
    valid_fold: id of the fold held out for validation (default 1).

  Returns:
    (strat_train, strat_valid, le): the training rows, the validation rows
    (both with a fresh index) and the LabelEncoder fitted on 'Target'.
  """
  folds = StratifiedKFold(n_splits=nsplit, shuffle=True, random_state=CFG['SEED'])
  data['fold'] = -1
  fold_col = data.columns.get_loc('fold')
  # Iterate the splitter once; split() yields positional indices, so write
  # the fold id positionally instead of going through an ID lookup.
  for fold_id, (_, val_idx) in enumerate(folds.split(data, data['Target'])):
    data.iloc[val_idx, fold_col] = fold_id

  data.to_csv('train_fold.csv', index=False)
  fold = pd.read_csv("train_fold.csv")

  le = LabelEncoder()
  fold['Target'] = le.fit_transform(fold['Target'])

  is_valid = fold['fold'] == valid_fold
  strat_train = fold[~is_valid].reset_index(drop=True)
  strat_valid = fold[is_valid].reset_index(drop=True)

  return strat_train, strat_valid, le


def Tokenizer_Define():
  """Load the pretrained encoder and detach its pooling head.

  Despite the name, this loads the "tae898/emoberta-large" model (not a
  tokenizer), moves it to the global `device`, and replaces its `pooler`
  with an identity module.

  Returns:
    (bert, bert_pool): the encoder with an identity pooler, and the original
    pooler module that was removed from it.
  """
  encoder = AutoModel.from_pretrained("tae898/emoberta-large")
  encoder = encoder.to(device)

  # Keep a handle on the original pooler before swapping it out.
  original_pooler = encoder.pooler
  encoder.pooler = torch.nn.Identity()

  return encoder, original_pooler


class CustomDataset(torch.utils.data.Dataset):
  """Dataset of encoder features precomputed for every utterance.

  On construction each row of `data['Utterance']` is tokenized and passed
  through the module-level `bert` encoder; the second element of the encoder
  output is cached in `self.feature`.

  Args:
    data: DataFrame with an 'Utterance' column (and 'Target' in train mode).
    mode: "train" yields (feature, label) pairs; any other value yields the
      feature only.
  """
  def __init__(self, data, mode = "train"):
    self.dataset = data
    self.mode = mode
    self.feature = []

    tokenizer = AutoTokenizer.from_pretrained("tae898/emoberta-large")

    # The encoder is only used as a fixed feature extractor here, so skip
    # autograd bookkeeping to avoid building a graph for every utterance.
    with torch.no_grad():
      for text in tqdm(data['Utterance']):
        inputs = tokenizer(text, padding='max_length', max_length=328, truncation=True, return_tensors="pt")
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        _, pooled_output = bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
        # Drop the batch dimension of size 1 before caching.
        self.feature.append(pooled_output.detach()[0])

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    if self.mode == "train":
      # Positional lookup keeps labels aligned with `self.feature` even when
      # the DataFrame index is not 0..n-1.
      return self.feature[idx], self.dataset['Target'].iloc[idx]
    else:
      return self.feature[idx]

In [None]:
# Run Preprocessing
# Seed all RNGs first so the fold assignment below is reproducible.
seed_everything(CFG['SEED'])

# Encode 'Target' in place; the return value is not needed here.
Encoding(data)
# Class weights are computed on the full (already encoded) training table.
class_weights = Class_Weights(data)
# 35 stratified folds with one fold held out for validation. NOTE: 'Target'
# is already integer-encoded at this point, so the returned `le` is fitted on
# integer codes rather than on the original label strings.
strat_train, strat_valid, le = Strat_Split(35, data)

# Returns the encoder (pooler replaced by identity) and the detached pooler.
bert, bert_pool = Tokenizer_Define()

In [None]:
# Stratified Dataset Check
# Report the size of each split; `strat_valid` is the held-out validation
# fold, not the competition test set.
print("train set : ", len(strat_train))
print("valid set : ", len(strat_valid))

In [None]:
# Make CustomDataset & DataLoader
# Both datasets use mode="train" because the validation split also needs its
# labels returned alongside the features.
train_ds = CustomDataset(strat_train, mode = "train")
valid_ds = CustomDataset(strat_valid, mode = "train")
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size= CFG['BATCH_SIZE'], shuffle=True)
# Evaluation does not depend on sample order, so keep the validation loader
# unshuffled for a deterministic pass.
val_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size= CFG['BATCH_SIZE'], shuffle=False)

In [55]:
# Modeling
class BaseModel(nn.Module):
  """Classification head applied to cached encoder features.

  The module-level `bert_pool` pooler is applied first, followed by dropout
  and a two-layer MLP that produces class logits.

  Args:
    dropout: dropout probability applied to the pooled features.
    num_classes: number of output classes; defaults to the number of classes
      known to the module-level label encoder `le`.
  """
  def __init__(self, dropout=0.5, num_classes=len(le.classes_)):
    super().__init__()
    self.bert_pool = bert_pool
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Sequential(
      nn.Linear(1024, 512),
      nn.ReLU(),
      nn.Linear(512, num_classes),
    )

  def forward(self, pooled_output):
    """Return class logits for a batch of cached encoder features."""
    pooled_output = self.bert_pool(pooled_output)
    dropout_output = self.dropout(pooled_output)
    # Feed the dropout output (not the raw pooled features) to the classifier
    # so the regularisation actually takes effect during training.
    linear_output = self.linear(dropout_output)

    return linear_output


def train(model, optimizer, train_loader, test_loader, scheduler, class_weights, device):
  """Train `model` and return the snapshot with the best validation F1.

  Runs CFG["EPOCHS"] epochs of class-weighted cross-entropy training, then
  evaluates on `test_loader` after every epoch.

  Args:
    model: module mapping a feature batch to class logits.
    optimizer: optimizer over `model`'s parameters.
    train_loader: iterable of (features, labels) training batches.
    test_loader: iterable of (features, labels) validation batches.
    scheduler: optional scheduler stepped with the validation score, or None.
    class_weights: per-class weight tensor for the loss.
    device: device on which to run.

  Returns:
    A deep copy of the model taken at the epoch with the highest validation
    score. If no epoch runs, `model` itself is returned.
  """
  # Local import keeps this block self-contained.
  import copy

  model.to(device)
  criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)

  best_score = float('-inf')
  best_model = model  # fallback when the epoch loop does not execute
  for epoch_num in range(CFG["EPOCHS"]):
    model.train()
    train_loss = []
    for pooled_output, train_label in tqdm(train_loader):
      optimizer.zero_grad()
      pooled_output = pooled_output.to(device)
      train_label = train_label.to(device)

      output = model(pooled_output)
      batch_loss = criterion(output, train_label.long())

      batch_loss.backward()
      optimizer.step()

      train_loss.append(batch_loss.item())

    val_loss, val_score = validation(model, criterion, test_loader, device)
    print(f'Epoch [{epoch_num}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score : .5f}]')
    if scheduler is not None:
      scheduler.step(val_score)

    # Snapshot only on improvement. A deep copy is required because `model`
    # keeps being updated in place by later epochs.
    if val_score > best_score:
      best_score = val_score
      best_model = copy.deepcopy(model)

  return best_model


def competition_metric(true, pred):
  """Return the macro-averaged F1 score of `pred` against `true`."""
  macro_f1 = f1_score(true, pred, average="macro")
  return macro_f1


def validation(model, criterion, test_loader, device):
  """Evaluate `model` on `test_loader`.

  Args:
    model: module mapping a feature batch to class logits.
    criterion: loss function taking (logits, labels).
    test_loader: iterable of (features, labels) batches.
    device: device on which to run.

  Returns:
    (val_loss, val_f1): the list of per-batch loss values and the macro F1
    over all samples (0.0 when the loader yields no batches).
  """
  model.eval()

  val_loss = []
  model_preds = []
  true_labels = []
  with torch.no_grad():
    for pooled_output, valid_label in tqdm(test_loader):
      valid_label = valid_label.to(device)
      pooled_output = pooled_output.to(device)

      output = model(pooled_output)

      batch_loss = criterion(output, valid_label.long())
      val_loss.append(batch_loss.item())

      model_preds.extend(output.argmax(1).cpu().tolist())
      true_labels.extend(valid_label.cpu().tolist())

  # Score once over the whole set instead of recomputing after every batch;
  # the guard keeps `val_f1` defined when the loader is empty.
  val_f1 = competition_metric(true_labels, model_preds) if true_labels else 0.0

  return val_loss, val_f1

In [None]:
# Run
# Build the classifier head, its optimizer and an LR scheduler, then train.
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(
  params=model.parameters(),
  lr=CFG["LEARNING_RATE"],
)
# Halve the learning rate when the monitored score stops increasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
  optimizer,
  mode='max',
  factor=0.5,
  patience=2,
  verbose=True,
  threshold=0.0001,
  threshold_mode='rel',
  cooldown=0,
  min_lr=0,
  eps=1e-08,
)
infer_model = train(
  model,
  optimizer,
  train_dataloader,
  val_dataloader,
  scheduler,
  class_weights,
  device,
)

In [70]:
# Persist the weights of `model` to 'model.pt' in the working directory.
torch.save(model.state_dict(), 'model.pt')

In [71]:
# Persist the weights of the model returned by train() to 'infer_model.pt'.
torch.save(infer_model.state_dict(), 'infer_model.pt')

In [67]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [None]:
# Test Data Load
test = pd.read_csv("test.csv")

# mode="test" makes the dataset yield features only (no labels).
test_ds = CustomDataset(test, mode = "test")
# shuffle=False keeps predictions in the same order as the rows of `test`.
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, device):
    """Predict a class index for every sample yielded by `test_loader`.

    Args:
        model: module mapping a feature batch to class logits.
        test_loader: iterable of feature batches (no labels).
        device: device on which to run.

    Returns:
        A list of predicted class indices in loader order.
    """
    model.to(device)
    model.eval()

    test_predict = []
    # Gradients are never needed for prediction; disabling them avoids
    # building an autograd graph for every batch.
    with torch.no_grad():
        for pooled_output in tqdm(test_loader):
            pooled_output = pooled_output.to(device)
            y_pred = model(pooled_output)
            test_predict.extend(y_pred.argmax(1).cpu().tolist())
    print('Done.')

    return test_predict

In [None]:
# Predict on the test set and map the class indices back through the encoder.
preds = inference(infer_model, test_dataloader, device)
preds = le.inverse_transform(preds)

submit = pd.read_csv('sample_submission.csv')
submit['Target'] = preds

# Convert any remaining integer codes in submit['Target'] to emotion names
# (0 -> anger, ..., 6 -> surprise); values that are not codes are kept as is.
id_to_emotion = {
    0: 'anger',
    1: 'disgust',
    2: 'fear',
    3: 'joy',
    4: 'neutral',
    5: 'sadness',
    6: 'surprise',
}
submit['Target'] = submit['Target'].map(lambda x: id_to_emotion.get(x, x))

submit.to_csv('submit.csv', index=False)