<a href="https://colab.research.google.com/github/Taehwan2/hanghaeAI/blob/main/%ED%86%A0%ED%81%B0%EA%B3%BC%EC%A0%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses datasets pandas

In [None]:
import kagglehub

# Fetch the latest version of the Kaggle NER dataset and keep the location it
# was downloaded to; later cells read the CSV from under this path.
dataset_handle = "debasisdotcom/name-entity-recognition-ner-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import torch
from torch.utils.data import DataLoader

# Load the DistilBERT (uncased) tokenizer through the torch.hub entry point of
# the huggingface/pytorch-transformers repository.
hub_repo = 'huggingface/pytorch-transformers'
checkpoint_name = 'distilbert-base-uncased'
tokenizer = torch.hub.load(hub_repo, 'tokenizer', checkpoint_name)

In [None]:
import pandas as pd
# Read the token-level NER table from the downloaded dataset directory.
# keep_default_na=False stops pandas from turning strings such as "NA" or
# "None" into NaN, so every cell stays a plain string.
df = pd.read_csv(path+'/NER dataset.csv', keep_default_na=False, encoding='ISO-8859-1')

# Fail early, with a readable message, if the columns used by later cells are absent.
missing_columns = {'Word', 'POS'} - set(df.columns)
assert not missing_columns, f"NER dataset is missing expected columns: {sorted(missing_columns)}"

# Distinct values of the 'POS' column and how many there are.
unique_pos_tags = df['POS'].unique()
num_pos_tags = len(unique_pos_tags)
print(num_pos_tags)

# `data` is the frame that gets split into train/test further down.
data = pd.DataFrame(df)

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so the 80/20 split (and every metric computed from it) is
# reproducible across Restart & Run All.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Sanity check: the two parts must account for every row.
assert len(train_data) + len(test_data) == len(data)

# Show sizes and a short preview instead of printing both frames in full.
print(f"train rows: {len(train_data)} | test rows: {len(test_data)}")
print(train_data.head())
print(test_data.head())

In [None]:
# Resolve the label column case-insensitively: the original code reads 'TAG',
# but the exact header spelling in the CSV is not visible from this notebook.
_label_columns = [col for col in train_data.columns if str(col).upper() == 'TAG']
if not _label_columns:
  raise KeyError("No 'TAG' column found in the training data")
LABEL_COLUMN = _label_columns[0]

# Map every tag string to an integer id. Built from train AND test so both
# loaders share one mapping and a tag seen only in the test split still has an id.
# NOTE(review): the classifier head in this notebook has 42 outputs; the number
# of distinct tags must not exceed that.
label_names = sorted(set(train_data[LABEL_COLUMN]) | set(test_data[LABEL_COLUMN]))
label_to_id = {name: idx for idx, name in enumerate(label_names)}


def collate_fn(batch):
  """Convert a batch of dataset rows into model-ready tensors.

  Args:
    batch: a list of row mappings (dicts or pandas Series) that provide
      'Word', 'POS' and the label column. A DataFrame is also accepted and
      is iterated row by row.

  Returns:
    A tuple (texts, labels):
      texts  - LongTensor of token ids, padded to the longest item in the batch
               and truncated to max_len.
      labels - LongTensor with one integer class id per row.
  """
  max_len = 400

  # DataLoader hands over a plain list of rows; iterrows() yields (index, row)
  # pairs, so unpack the row when a DataFrame is passed directly.
  if hasattr(batch, 'iterrows'):
    rows = [row for _, row in batch.iterrows()]
  else:
    rows = batch

  texts, labels = [], []
  for row in rows:
    # Keep word and POS tag as separate tokens rather than gluing them together.
    texts.append(f"{row['Word']} {row['POS']}")
    # Tags are strings in the CSV; CrossEntropyLoss needs integer class ids.
    labels.append(label_to_id[row[LABEL_COLUMN]])

  texts = torch.LongTensor(tokenizer(texts, padding=True, truncation=True, max_length=max_len).input_ids)
  labels = torch.LongTensor(labels)

  return texts, labels


# DataLoader fetches items with dataset[i] for integer i. On a DataFrame that is
# a column lookup (KeyError), so hand it a list of per-row dicts instead.
train_loader = DataLoader(
    train_data.to_dict('records'), batch_size=64, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_data.to_dict('records'), batch_size=64, shuffle=False, collate_fn=collate_fn
)

In [None]:
from torch import nn


class TextClassifier(nn.Module):
  """DistilBERT encoder followed by a single linear classification layer.

  Args:
    num_classes: number of output logits produced by the classification layer.
      Defaults to 42, the value previously hard-coded here.
  """

  def __init__(self, num_classes=42):
    super().__init__()

    # Pretrained encoder fetched through torch.hub.
    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
    # 768 is the input width of the head; it must match the encoder's hidden size.
    self.classifier = nn.Linear(768, num_classes)

  def forward(self, x):
    """Return class logits for a batch of token ids.

    Args:
      x: LongTensor of token ids, indexed as (batch, sequence).

    Returns:
      Tensor of logits with shape (batch, num_classes).
    """
    # The encoder output exposes the per-token hidden states under
    # 'last_hidden_state'; it has no 'attention' entry.
    # NOTE(review): no attention mask is passed, so padding positions are
    # attended to like real tokens — consider passing one.
    hidden_states = self.encoder(x)['last_hidden_state']
    # Classify from the hidden state at the first position of each sequence.
    logits = self.classifier(hidden_states[:, 0])

    return logits


# Build the classifier. Constructing it calls torch.hub.load for the
# 'distilbert-base-uncased' encoder inside TextClassifier.__init__.
model = TextClassifier()

In [None]:
# Freeze every encoder weight so that gradients are only computed for the
# remaining (non-encoder) parameters of the model.
for encoder_weight in model.encoder.parameters():
  encoder_weight.requires_grad = False

In [None]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt


lr = 0.001
# Train on the GPU when one is present, otherwise fall back to the CPU
# instead of crashing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()

optimizer = Adam(model.parameters(), lr=lr)
n_epochs = 50

for epoch in range(n_epochs):
  # Sum of the per-batch losses over this epoch.
  total_loss = 0.
  model.train()
  # Loop variable is `batch`, not `data`, so the `data` DataFrame created by the
  # loading cell is not overwritten by the last training batch.
  for batch in train_loader:
    model.zero_grad()
    inputs, labels = batch
    inputs, labels = inputs.to(device), labels.to(device).long()

    preds = model(inputs)
    loss = loss_fn(preds, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

In [None]:
def accuracy(model, dataloader):
  """Return the fraction of examples in `dataloader` that `model` classifies correctly.

  Args:
    model: module mapping a batch of inputs to logits whose last dimension
      indexes the classes.
    dataloader: iterable of (inputs, labels) tensor pairs.

  Returns:
    Correct predictions divided by the number of labels seen, or 0.0 when the
    dataloader yields no examples.
  """
  # Run on whatever device the model already lives on rather than assuming a GPU;
  # a model without parameters is evaluated on the CPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  cnt = 0
  acc = 0

  # Evaluation never needs gradients, whether or not the caller disabled them.
  with torch.no_grad():
    for inputs, labels in dataloader:
      inputs, labels = inputs.to(device), labels.to(device)

      preds = model(inputs)
      preds = torch.argmax(preds, dim=-1)

      cnt += labels.shape[0]
      acc += (labels == preds).sum().item()

  # Guard against an empty dataloader instead of raising ZeroDivisionError.
  return acc / cnt if cnt else 0.0


# Put the model in evaluation mode, then score both splits with gradient
# tracking disabled and report the two accuracies.
model.eval()
with torch.no_grad():
  train_acc, test_acc = (
      accuracy(model, loader) for loader in (train_loader, test_loader)
  )
print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")