In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 531 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [4]:
import os
import numpy as np
import pandas as pd

In [5]:
data_dir = '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인' #경로 지정하기
file_list = os.listdir(data_dir)
print(file_list)

['DistilBert.ipynb', 'data.csv', 'ppomppu.csv', 'fmkorea_hotdeal.csv', '2english.csv', 'data_digital.csv', 'Make_data.ipynb', '2korea.csv', 'ppomppu_DigitClassfiy_fold0.pt', 'ppomppu_DigitClassfiy_fold1.pt', 'ppomppu_DigitClassfiy_fold2.pt', 'ppomppu_DigitClassfiy_fold3.pt', 'ppomppu_DigitClassfiy_fold4.pt', '유사도모델.ipynb', 'PreProcessing+Model.ipynb']


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel,BertTokenizerFast, AlbertModel, BertModel, AutoTokenizer
from transformers import BertModel, RobertaTokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

In [7]:
class CategoryDataset(Dataset):
  def __init__(self, subjects, targets, tokenizer, max_len):
    self.subjects = subjects
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    return len(self.subjects)
  def __getitem__(self, item):
    subject = str(self.subjects[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      subject,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding = 'max_length',
      truncation = True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'subject_text': subject,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle_=False, valid=False):
  if valid:
    ds = CategoryDataset(
      subjects=df.title.to_numpy(),
      targets=df.label.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
      )
  else:
    ds = CategoryDataset(
      subjects=df.title.to_numpy(),
      targets=df.label.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
    )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4,
    shuffle = shuffle_
  )

In [8]:
from sklearn.metrics import f1_score
import time
import math
import random
import argparse

def calc_review_acc(pred, label):
    _, idx = pred.max(1)
    
    acc = torch.eq(idx, label).sum().item() / idx.size()[0] #batchsize
    x = label.cpu().numpy()
    y = idx.cpu().numpy()
    f1_acc = f1_score(x, y, average='macro')
    return acc,f1_acc




class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [9]:
tokenizer_bert_kor_base = BertTokenizerFast.from_pretrained("kykim/bert-kor-base")
BATCH_SIZE = 256
MAX_LEN =64

Downloading:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/725 [00:00<?, ?B/s]

In [10]:
class ReviewClassifier(nn.Module):
  def __init__(self, n_classes):
    super(ReviewClassifier, self).__init__()
    self.bert = BertModel.from_pretrained("kykim/bert-kor-base")
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def get_cls(target_size= n_classes):
      return nn.Sequential(
          nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size),
          nn.LayerNorm(self.bert.config.hidden_size),
          nn.Dropout(p = 0.1),
          nn.ReLU(),
          nn.Linear(self.bert.config.hidden_size, target_size),
      )  
    self.cls = get_cls(n_classes)
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
       return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)

device = torch.device("cuda")

def get_predictions(model, data_loader):
  model = model.eval()
  subject_texts = []
  predictions = []
  prediction_probs = []
  with torch.no_grad():
    for d in data_loader:
      texts = d["subject_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      subject_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  return subject_texts, predictions, prediction_probs


In [11]:
import gc
from tqdm import tqdm
from glob import glob
pred_list_t = []

def inference(model, data_loader):
    
    for model in model_list:
        model.eval()

    for d in tqdm(data_loader):
        
        texts = d["subject_text"]
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)
  
        
        with torch.no_grad():
            pred_list = []
            for model in model_list:
                outputs = model(
                  input_ids=input_ids,
                  attention_mask=attention_mask
                )
                pred_list.append(outputs)

            pred = ensemble(pred_list)

            pred_list_t.append(torch.softmax(pred,1))




def ensemble(pred_list):
    pred1= 0
    for pred in pred_list:

        pred1 += torch.softmax(pred, 1)
    pred1 /= len(pred_list) 
  
    return pred

train_df =pd.read_csv(data_dir + '/data_digital.csv')
train_df2 =pd.read_csv(data_dir + '/2korea.csv')

import copy
train_df3 = copy.deepcopy(train_df[:len(train_df2)])
train_df3["title"] = train_df2["번역"] 
train_df = pd.concat([train_df3,train_df])
dev_df = train_df.drop_duplicates(['title'])



SUBMISSION_DIR = '/content'

    
model_list = []
model_path_list = glob(os.path.join(data_dir, '*.pt'))
for model_path in model_path_list:
    model = ReviewClassifier(n_classes=32).to(device)
    if model_path != "":
        print("=> loading checkpoint '{}'".format(model_path))
        checkpoint = torch.load(model_path)                   
        model.load_state_dict(checkpoint, strict=True)  
    model.cuda()
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    model_list.append(model)
if len(model_list) == 0:
    print('Please check the model directory.')
  

data_loader = create_data_loader(dev_df, tokenizer_bert_kor_base, MAX_LEN, BATCH_SIZE, valid=True)

inference(model_list,data_loader)


Downloading:   0%|          | 0.00/454M [00:00<?, ?B/s]

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=> loading checkpoint '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인/ppomppu_DigitClassfiy_fold0.pt'


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=> loading checkpoint '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인/ppomppu_DigitClassfiy_fold1.pt'


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=> loading checkpoint '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인/ppomppu_DigitClassfiy_fold2.pt'


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=> loading checkpoint '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인/ppomppu_DigitClassfiy_fold3.pt'


Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


=> loading checkpoint '/content/drive/MyDrive/ColabNotebooks/데이터분석캡스톤디자인/ppomppu_DigitClassfiy_fold4.pt'


100%|██████████| 11/11 [00:25<00:00,  2.33s/it]


In [19]:
class ReviewClassifier(nn.Module):
  def __init__(self, n_classes):
    super(ReviewClassifier, self).__init__()
    self.bert = BertModel.from_pretrained("kykim/bert-kor-base")
    self.drop = nn.Dropout(p=0.1)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def get_cls(target_size= n_classes):
      return nn.Sequential(
          nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size),
          nn.LayerNorm(self.bert.config.hidden_size),
          nn.Dropout(p = 0.1),
          nn.ReLU(),
          nn.Linear(self.bert.config.hidden_size, target_size),
      )  
    self.cls = get_cls(n_classes)
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask,
       return_dict=False
    )
    output = self.drop(pooled_output)
    return self.out(output)

device = torch.device("cuda")

def get_predictions(model, data_loader):
  model = model.eval()
  subject_texts = []
  predictions = []
  prediction_probs = []
  with torch.no_grad():
    for d in data_loader:
      texts = d["subject_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      subject_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(outputs)
  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  return subject_texts, predictions, prediction_probs


In [26]:
import gc


EPOCHS = 60
model_bert_kor_base = ReviewClassifier(n_classes=32).to(device)
optimizer = AdamW(model_bert_kor_base.parameters(), lr=3e-5)
total_steps = len(data_loader) * EPOCHS
scheduler = get_cosine_schedule_with_warmup(
  optimizer,
  num_warmup_steps=int(total_steps*0.1),
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
model_bert_kor_base.bert.encoder.layer = model_bert_kor_base.bert.encoder.layer[0:3]

In [28]:

from tqdm import tqdm
def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples):

  batch_time = AverageMeter()     
  data_time = AverageMeter()      
  losses = AverageMeter()         
  accuracies = AverageMeter()
  f1_accuracies = AverageMeter()
  
  sent_count = AverageMeter()   
    

  start = end = time.time()

  model = model.train()
  correct_predictions = 0
  for step,d in enumerate(data_loader):
    data_time.update(time.time() - end)
    batch_size = d["input_ids"].size(0) 

    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    targets2 = pred_list_t[step].to(device)
    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    _, preds = torch.max(outputs, dim=1)
    #loss 수정해보기 
    loss = loss_fn(outputs, targets) + loss_fn(outputs, targets2)

    correct_predictions += torch.sum(preds == targets)
    losses.update(loss.item(), batch_size)
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    batch_time.update(time.time() - end)
    end = time.time()

    sent_count.update(batch_size)
    if step % 4 == 0 or step == (len(data_loader)-1):
                acc,f1_acc = calc_review_acc(outputs, targets)
                accuracies.update(acc, batch_size)
                f1_accuracies.update(f1_acc, batch_size)

                
                print('Epoch: [{0}][{1}/{2}] '
                      'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.3f}({loss.avg:.3f}) '
                      'Acc: {acc.val:.3f}({acc.avg:.3f}) '   
                      'f1_Acc: {f1_acc.val:.3f}({f1_acc.avg:.3f}) '           
                      'sent/s {sent_s:.0f} '
                      .format(
                      epoch, step+1, len(data_loader),
                      data_time=data_time, loss=losses,
                      acc=accuracies,
                      f1_acc=f1_accuracies,
                      remain=timeSince(start, float(step+1)/len(data_loader)),
                      sent_s=sent_count.avg/batch_time.avg
                      ))

  return correct_predictions.double() / n_examples, losses.avg

for epoch in range(EPOCHS):
  print('-' * 10)
  print(f'Epoch {epoch}/{EPOCHS-1}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model_bert_kor_base,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(dev_df)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')
  print("")
  print("")

----------
Epoch 0/59
----------
Epoch: [0][1/11] Data 0.384 (0.384) Elapsed 0m 0s (remain 0m 8s) Loss: 6.628(6.628) Acc: 0.020(0.020) f1_Acc: 0.009(0.009) sent/s 312 
Epoch: [0][5/11] Data 0.002 (0.083) Elapsed 0m 2s (remain 0m 2s) Loss: 7.433(6.903) Acc: 0.000(0.010) f1_Acc: 0.000(0.004) sent/s 546 
Epoch: [0][9/11] Data 0.002 (0.047) Elapsed 0m 3s (remain 0m 0s) Loss: 6.653(6.862) Acc: 0.059(0.026) f1_Acc: 0.014(0.008) sent/s 596 
Epoch: [0][11/11] Data 0.002 (0.039) Elapsed 0m 4s (remain 0m 0s) Loss: 6.921(6.856) Acc: 0.070(0.035) f1_Acc: 0.008(0.008) sent/s 606 
Train loss 6.856474090943687 accuracy 0.05758783049619703


----------
Epoch 1/59
----------
Epoch: [1][1/11] Data 0.357 (0.357) Elapsed 0m 0s (remain 0m 7s) Loss: 6.590(6.590) Acc: 0.031(0.031) f1_Acc: 0.021(0.021) sent/s 342 
Epoch: [1][5/11] Data 0.008 (0.076) Elapsed 0m 2s (remain 0m 2s) Loss: 7.339(6.844) Acc: 0.000(0.016) f1_Acc: 0.000(0.011) sent/s 564 
Epoch: [1][9/11] Data 0.004 (0.043) Elapsed 0m 3s (remain 0m 0s

In [29]:
torch.save(model_bert_kor_base.state_dict(), "ppomppu_DigitClassfiy.pt")