In [None]:

# Mount Google Drive at /content/drive; the data files read and written by this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 14.4MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 53.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 46.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=049413

In [None]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class NSMCDataset(Dataset):
  """Torch dataset over an NSMC-style tab-separated file, tokenized on access.

  Rows containing NaN are dropped on load. The text is taken from the second
  column (position 1); in labelled mode the label is taken from the third
  column (position 2).

  Args:
    csv_file: Path to a tab-separated file with a header row.
    y: 1 when the file carries labels (rows with a duplicated ``document``
      value are then removed); any other value for unlabelled data, in which
      case every item gets the placeholder label 0.
    max_length: Length every encoded sequence is padded / truncated to.
    tokenizer_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.

  Each item is a tuple ``(input_ids, attention_mask, label)``.
  """

  def __init__(self, csv_file, y=1, max_length=256,
               tokenizer_name="monologg/koelectra-base-v3-discriminator"):
    # Some rows contain NaN values, so drop them.
    dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
    # Remove duplicated documents (labelled data only).
    if y == 1:
      dataset = dataset.drop_duplicates(subset=['document'])
    self.dataset = dataset
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.y = y
    self.max_length = max_length
    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows kept after cleaning."""
    return len(self.dataset)

  def _encode(self, text):
    """Tokenize one text and return its (input_ids, attention_mask) tensors."""
    inputs = self.tokenizer(
        str(text),  # guard against values pandas parsed as non-string
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        add_special_tokens=True,
    )
    # The tokenizer returns a batch of one; strip the batch dimension.
    return inputs['input_ids'][0], inputs['attention_mask'][0]

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
    input_ids, attention_mask = self._encode(self.dataset.iloc[idx, 1])
    if self.y == 1:
      # Plain Python int so the default collate function builds an integer tensor.
      label = int(self.dataset.iloc[idx, 2])
    else:
      # Unlabelled data: placeholder label keeps the item shape uniform.
      label = 0
    return input_ids, attention_mask, label

In [None]:
# Build the labelled train and test datasets from the NSMC files on Google Drive.
# Each constructor call also prints describe() of the loaded frame.
train_dataset = NSMCDataset("/content/drive/MyDrive/nsmc/ratings_train.txt")
test_dataset = NSMCDataset("/content/drive/MyDrive/nsmc/ratings_test.txt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [None]:
# Load the KoELECTRA discriminator checkpoint into a sequence-classification
# model and move it to `device`.
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator").to(device)

# Optional smoke test: run a single example through the model.
# text, attention_mask, y = train_dataset[0]
# model(text.unsqueeze(0).to(device), attention_mask=attention_mask.unsqueeze(0).to(device))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [None]:
# Training hyperparameters: number of passes over the training set and the
# mini-batch size used by the training loader.
epochs = 5
batch_size = 4

In [None]:
# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Only the training data is shuffled. Evaluation order has no effect on accuracy,
# so the test loader keeps dataset order for deterministic runs.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Training

In [None]:
# Per-epoch summed training loss and training accuracy, stored as plain Python
# floats so nothing keeps references to CUDA tensors.
losses = []
accuracies = []

for epoch in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    # The first element of the model output is used as the class logits.
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    # .item() converts the match count to a Python int right away.
    correct += (predicted == y_batch).sum().item()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      # total_loss is the running sum over the epoch so far, not a single batch's loss.
      print("Batch Loss:", total_loss, "Accuracy:", correct / total)

  # Guard against an empty loader instead of dividing by zero.
  epoch_accuracy = correct / total if total else 0.0
  losses.append(total_loss)
  accuracies.append(epoch_accuracy)
  print("Train Loss:", total_loss, "Accuracy:", epoch_accuracy)

HBox(children=(FloatProgress(value=0.0, max=36546.0), HTML(value='')))



Batch Loss: 69.21722507476807 Accuracy: tensor(0.5200, device='cuda:0')
Batch Loss: 133.23504096269608 Accuracy: tensor(0.5975, device='cuda:0')
Batch Loss: 182.72664107382298 Accuracy: tensor(0.6675, device='cuda:0')
Batch Loss: 227.25761848688126 Accuracy: tensor(0.7063, device='cuda:0')
Batch Loss: 272.0967660471797 Accuracy: tensor(0.7275, device='cuda:0')
Batch Loss: 309.6455512344837 Accuracy: tensor(0.7488, device='cuda:0')
Batch Loss: 353.9838161394 Accuracy: tensor(0.7532, device='cuda:0')
Batch Loss: 391.4807810112834 Accuracy: tensor(0.7647, device='cuda:0')
Batch Loss: 427.5494691878557 Accuracy: tensor(0.7756, device='cuda:0')
Batch Loss: 460.575051497668 Accuracy: tensor(0.7828, device='cuda:0')
Batch Loss: 497.3790390342474 Accuracy: tensor(0.7895, device='cuda:0')
Batch Loss: 531.1643585637212 Accuracy: tensor(0.7967, device='cuda:0')
Batch Loss: 567.481524232775 Accuracy: tensor(0.7996, device='cuda:0')
Batch Loss: 606.5991063304245 Accuracy: tensor(0.8014, device='cud

HBox(children=(FloatProgress(value=0.0, max=36546.0), HTML(value='')))

Batch Loss: 15.333025777712464 Accuracy: tensor(0.9400, device='cuda:0')
Batch Loss: 37.38345989352092 Accuracy: tensor(0.9300, device='cuda:0')
Batch Loss: 54.75651055108756 Accuracy: tensor(0.9250, device='cuda:0')
Batch Loss: 71.9959018419031 Accuracy: tensor(0.9262, device='cuda:0')
Batch Loss: 90.05059536802582 Accuracy: tensor(0.9250, device='cuda:0')
Batch Loss: 109.24697430524975 Accuracy: tensor(0.9221, device='cuda:0')
Batch Loss: 128.25333476648666 Accuracy: tensor(0.9211, device='cuda:0')
Batch Loss: 146.7034642912913 Accuracy: tensor(0.9225, device='cuda:0')
Batch Loss: 171.2762137867976 Accuracy: tensor(0.9206, device='cuda:0')
Batch Loss: 189.50553992320783 Accuracy: tensor(0.9220, device='cuda:0')
Batch Loss: 207.14536472735927 Accuracy: tensor(0.9234, device='cuda:0')
Batch Loss: 227.18746608262882 Accuracy: tensor(0.9227, device='cuda:0')
Batch Loss: 246.7364622238092 Accuracy: tensor(0.9217, device='cuda:0')
Batch Loss: 267.97862536879256 Accuracy: tensor(0.9209, dev

HBox(children=(FloatProgress(value=0.0, max=36546.0), HTML(value='')))

Batch Loss: 12.689118329901248 Accuracy: tensor(0.9650, device='cuda:0')
Batch Loss: 27.406596567947417 Accuracy: tensor(0.9537, device='cuda:0')
Batch Loss: 39.12105005141348 Accuracy: tensor(0.9567, device='cuda:0')
Batch Loss: 52.24243593146093 Accuracy: tensor(0.9550, device='cuda:0')
Batch Loss: 66.52882013539784 Accuracy: tensor(0.9540, device='cuda:0')
Batch Loss: 77.00421942654066 Accuracy: tensor(0.9575, device='cuda:0')
Batch Loss: 90.34541551559232 Accuracy: tensor(0.9575, device='cuda:0')
Batch Loss: 101.60095189581625 Accuracy: tensor(0.9584, device='cuda:0')
Batch Loss: 115.92195818643086 Accuracy: tensor(0.9572, device='cuda:0')
Batch Loss: 129.2474638747517 Accuracy: tensor(0.9570, device='cuda:0')
Batch Loss: 141.84297062153928 Accuracy: tensor(0.9568, device='cuda:0')
Batch Loss: 152.87625101138838 Accuracy: tensor(0.9575, device='cuda:0')
Batch Loss: 160.22484769043513 Accuracy: tensor(0.9588, device='cuda:0')
Batch Loss: 172.71129127731547 Accuracy: tensor(0.9580, d

HBox(children=(FloatProgress(value=0.0, max=36546.0), HTML(value='')))

Batch Loss: 8.94526504050009 Accuracy: tensor(0.9725, device='cuda:0')
Batch Loss: 16.69923048221972 Accuracy: tensor(0.9712, device='cuda:0')
Batch Loss: 28.30299501447007 Accuracy: tensor(0.9700, device='cuda:0')
Batch Loss: 39.50737463729456 Accuracy: tensor(0.9694, device='cuda:0')
Batch Loss: 47.79721613880247 Accuracy: tensor(0.9705, device='cuda:0')
Batch Loss: 58.60232532361988 Accuracy: tensor(0.9696, device='cuda:0')
Batch Loss: 67.79325605288614 Accuracy: tensor(0.9689, device='cuda:0')
Batch Loss: 75.6598373609595 Accuracy: tensor(0.9697, device='cuda:0')
Batch Loss: 84.58237877790816 Accuracy: tensor(0.9686, device='cuda:0')
Batch Loss: 92.06687269639224 Accuracy: tensor(0.9693, device='cuda:0')
Batch Loss: 102.51350448071025 Accuracy: tensor(0.9684, device='cuda:0')
Batch Loss: 112.90885937260464 Accuracy: tensor(0.9679, device='cuda:0')
Batch Loss: 122.46488365996629 Accuracy: tensor(0.9681, device='cuda:0')
Batch Loss: 131.09118460235186 Accuracy: tensor(0.9682, device=

HBox(children=(FloatProgress(value=0.0, max=36546.0), HTML(value='')))

Batch Loss: 7.709196062176488 Accuracy: tensor(0.9700, device='cuda:0')
Batch Loss: 14.143592519219965 Accuracy: tensor(0.9737, device='cuda:0')
Batch Loss: 22.754725457751192 Accuracy: tensor(0.9725, device='cuda:0')
Batch Loss: 27.85243631718913 Accuracy: tensor(0.9750, device='cuda:0')
Batch Loss: 38.42050016555004 Accuracy: tensor(0.9750, device='cuda:0')
Batch Loss: 47.831392046180554 Accuracy: tensor(0.9737, device='cuda:0')
Batch Loss: 54.16528749593999 Accuracy: tensor(0.9757, device='cuda:0')
Batch Loss: 58.937069045729004 Accuracy: tensor(0.9775, device='cuda:0')
Batch Loss: 63.262120958650485 Accuracy: tensor(0.9786, device='cuda:0')
Batch Loss: 68.74511123029515 Accuracy: tensor(0.9788, device='cuda:0')
Batch Loss: 75.88945813465398 Accuracy: tensor(0.9780, device='cuda:0')
Batch Loss: 80.49245625344338 Accuracy: tensor(0.9783, device='cuda:0')
Batch Loss: 85.23626276914729 Accuracy: tensor(0.9788, device='cuda:0')
Batch Loss: 96.07707215758273 Accuracy: tensor(0.9771, devi

In [None]:
# Display the per-epoch training losses and accuracies collected during training.
losses, accuracies

([9844.98988864012,
  6833.175912654493,
  4913.549211624311,
  3437.4262840838346,
  2607.495783931314],
 [tensor(0.8873, device='cuda:0'),
  tensor(0.9265, device='cuda:0'),
  tensor(0.9499, device='cuda:0'),
  tensor(0.9669, device='cuda:0'),
  tensor(0.9755, device='cuda:0')])

In [None]:
# Measure accuracy on the held-out test set.
model.eval()

test_correct = 0
test_total = 0

# Inference only: disable autograd so no computation graph is built for each
# forward pass (less memory, faster evaluation).
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    # .item() keeps the counter a Python int rather than a CUDA tensor.
    test_correct += (predicted == y_batch).sum().item()
    test_total += len(y_batch)

# Guard against an empty loader instead of dividing by zero.
test_accuracy = test_correct / test_total if test_total else 0.0
print("Accuracy:", test_accuracy)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.9063, device='cuda:0')


In [None]:
# Save the model's state_dict to your own Google Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/elec_base3_2_kor_model.pt")

In [None]:
# Re-export the CP949-encoded leaderboard CSV as a tab-separated file, with the
# 'Sentence' column renamed to 'document'.
leader_test = (
    pd.read_csv("/content/drive/MyDrive/ko_data.csv", encoding='CP949')
    .rename(columns={'Sentence': 'document'})
)
leader_test.to_csv("/content/drive/MyDrive/ko_data.txt", sep='\t', index=False)

In [None]:
# Load the leaderboard file in unlabelled mode (y=0): duplicates are kept and
# every item carries the placeholder label 0.
leader_dataset = NSMCDataset("/content/drive/MyDrive/ko_data.txt", y=0)

                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


In [None]:
# One example per batch, unshuffled, so predictions come out in dataset order.
leader_loader = DataLoader(leader_dataset, batch_size=1, shuffle=False)

In [None]:
## test
# Predict a class for every leaderboard example. `ret` collects one int per
# example in loader order; `count` is the number of batches processed.
model.eval()
count = 0
ret = []
# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch, _labels in tqdm(leader_loader):
    count += 1
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    # tolist() + extend() works for any batch size, not only batch_size=1.
    ret.extend(predicted.tolist())


HBox(children=(FloatProgress(value=0.0, max=11187.0), HTML(value='')))






### 결과 저장

In [None]:
import csv

# Write the submission file. Mode 'w' (not 'a') makes the cell idempotent:
# re-running it replaces the file instead of appending a second header and
# duplicate rows. The `with` block closes the file even if a write fails.
with open('/content/drive/MyDrive/sample_electra_base3_2.csv', 'w', newline='') as f:
  wr = csv.writer(f)
  wr.writerow(['Id', 'Predicted'])
  # NOTE(review): the position in `ret` is written as the Id. That matches the
  # source Id column only if no rows were dropped when the data was loaded; confirm.
  wr.writerows(enumerate(ret))