In [1]:
!pip install datasets



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import load_dataset

In [3]:
# Load the IMDB review dataset; later cells read its "train" and "test" splits.
dataset = load_dataset("imdb")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Stack the "train" and "test" splits into a single frame with a fresh 0..N-1
# index. Despite its name, df_train therefore holds both splits; the actual
# train/validation/test partition is drawn from it in a later cell.
df_train = pd.concat(
    [pd.DataFrame(dataset["train"]), pd.DataFrame(dataset["test"])],
    ignore_index=True,
)
df_train.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [5]:
# Fixed seed so the train/validation/test partition is the same on every run.
SPLIT_SEED = 42
# Maximum number of tokens kept per review; longer inputs are truncated.
MAX_LENGTH = 128

# 70% train; the remaining 30% is halved into validation and test (15% each).
# Stratifying on the label keeps the class ratio the same in every split.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    df_train['text'],
    df_train['label'],
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=df_train['label'],
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts,
    holdout_labels,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=holdout_labels,
)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _encode(texts):
    """Tokenize an iterable of strings with the shared truncation/padding settings."""
    return tokenizer(list(texts), truncation=True, padding=True, max_length=MAX_LENGTH)


# Tokenize each split
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

# Plain lists so labels can be indexed by position (the Series keep the shuffled
# original index after the split).
train_labels = list(train_labels)
val_labels = list(val_labels)
test_labels = list(test_labels)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class IMDBDataset(Dataset):
    """Dataset that pairs tokenizer output with one label per sample.

    Args:
        encodings: mapping from field name to a sequence that can be indexed by
            sample position (for example the result of calling a tokenizer on a
            list of texts).
        labels: sequence of labels, indexed by the same positions.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so a DataLoader can batch them.
train_dataset = IMDBDataset(encodings=train_encodings, labels=train_labels)
val_dataset = IMDBDataset(encodings=val_encodings, labels=val_labels)
test_dataset = IMDBDataset(encodings=test_encodings, labels=test_labels)

In [7]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        dropout: dropout probability applied to the pooled BERT output.
        num_classes: number of output classes; the default of 2 matches binary
            sentiment classification (positive/negative).
        pretrained_name: name or path of the checkpoint passed to
            `BertModel.from_pretrained`.
    """

    def __init__(self, dropout=0.5, num_classes=2, pretrained_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Head width follows the encoder's hidden size, so other checkpoints work too.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores with `num_classes` values per input row.

        `input_ids` and `attention_mask` are forwarded unchanged to the BERT
        encoder (presumably tokenizer output batched by a DataLoader - confirm
        against the caller).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        dropout_output = self.dropout(pooled_output)
        return self.classifier(dropout_output)

In [8]:
# Hyperparameters collected in one place so they are easy to find and tune.
TRAIN_SEED = 42
LEARNING_RATE = 2e-5
BATCH_SIZE = 8
NUM_EPOCHS = 3

# Seed torch's global RNG before the model is built so that the classifier-head
# initialization, the shuffled batch order, and dropout masks repeat across runs
# (GPU kernels may still introduce small nondeterminism).
torch.manual_seed(TRAIN_SEED)

# Model, optimizer, and loss function
model = BERTClassifier()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
for epoch in range(NUM_EPOCHS):
    # --- one pass over the training data ---
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # The model takes only input_ids and attention_mask
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch mean losses
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Train_loss: {avg_train_loss}", end = " ")

    # --- validation: no gradient tracking, dropout disabled by eval() ---
    model.eval()
    total_eval_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)
            total_eval_loss += loss.item()

            # Predicted class is the index of the largest score in each row
            preds = outputs.argmax(dim=1)
            correct_predictions += (preds == labels).sum()

    avg_eval_loss = total_eval_loss / len(val_loader)
    accuracy = correct_predictions.double() / len(val_dataset)
    print(f"Val_loss: {avg_eval_loss}, Val_Accuracy: {accuracy}")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Train_loss: 0.32290166303462214 Val_loss: 0.2759457765691189, Val_Accuracy: 0.8846666666666667
Epoch 2, Train_loss: 0.19219870144522616 Val_loss: 0.26197394941807356, Val_Accuracy: 0.892
Epoch 3, Train_loss: 0.09377258653441178 Val_loss: 0.29865372957818626, Val_Accuracy: 0.8953333333333334


In [9]:
def evaluate(model, data_loader, device, loss_fn, data_num, split_name="Test"):
    """Run `model` over `data_loader` without gradients and report loss and accuracy.

    Args:
        model: callable module invoked as `model(input_ids=..., attention_mask=...)`;
            it is switched to eval mode and left in that mode.
        data_loader: iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: device the batch tensors are moved to before the forward pass.
        loss_fn: callable taking `(model_output, labels)` and returning a scalar tensor.
        data_num: number of samples used as the accuracy denominator.
        split_name: prefix used in the printed report (default "Test").

    Returns:
        Tuple `(avg_loss, accuracy)` of Python floats: the mean of the per-batch
        losses and the fraction of correct predictions out of `data_num`.

    Raises:
        ZeroDivisionError: if `data_loader` yields no batches or `data_num` is 0.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    num_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += loss_fn(logits, labels).item()
            # Count batches here so any iterable of batches works, not only
            # objects that implement len().
            num_batches += 1

            # Predicted class is the index of the largest score in each row.
            num_correct += (logits.argmax(dim=1) == labels).sum().item()

    avg_eval_loss = total_loss / num_batches
    accuracy = num_correct / data_num
    print(f"{split_name}_loss: {avg_eval_loss}, {split_name}_Accuracy: {accuracy}")
    return avg_eval_loss, accuracy

# Score the test split with the evaluate helper.
data_num = len(test_dataset)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=8)
evaluate(
    model=model,
    data_loader=test_loader,
    device=device,
    loss_fn=loss_fn,
    data_num=data_num,
)

Test_loss: 0.3249599678024376, Test_Accuracy: 0.8916000000000001
