# Task 1.

In [18]:
!pip install datasets

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset

torch.manual_seed(100)

data = load_dataset("KaungHtetCho/Harry_Potter_LSTM", split="train")
sample_sentence = data[0]["text"][:100]

char_set = list(set(sample_sentence))
dic = {c: i for i, c in enumerate(char_set)}

dic_size = len(dic)
input_size = dic_size
hidden_size = dic_size * 2
output_size = dic_size
unit_sequence_length = 20

input_batch = []
target_batch = []

for i in range(0, len(sample_sentence) - unit_sequence_length):
    input_seq = sample_sentence[i:i + unit_sequence_length]
    target_seq = sample_sentence[i + 1:i + unit_sequence_length + 1]
    input_batch.append([dic[c] for c in input_seq])
    target_batch.append([dic[c] for c in target_seq])

input_batch_onehot = []
for seq in input_batch:
    onehot_seq = []
    for idx in seq:
        onehot = [0] * dic_size
        onehot[idx] = 1
        onehot_seq.append(onehot)
    input_batch_onehot.append(onehot_seq)

X = torch.FloatTensor(input_batch_onehot)
Y = torch.LongTensor(target_batch)

print(X.shape)
print(Y.shape)

# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h_0 = torch.zeros(2, x.size(0), hidden_size)
        out, _ = self.rnn(x, h_0)
        out = self.fc(out)
        return out

learning_rate = 0.05
training_epochs = 100

model = RNNModel(input_size, hidden_size, output_size, num_layers=2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(training_epochs):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs.reshape(-1, dic_size), Y.reshape(-1))
    loss.backward()
    optimizer.step()
    if epoch % 10 == 9:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

results = outputs.data.numpy().argmax(axis=2)
correct = 0
total = 0
for i in range(len(Y)):
    correct += np.sum(results[i] == Y[i].numpy())
    total += len(Y[i])
accuracy = correct / total * 100
print(f"RNN Model Accuracy: {accuracy:.2f}%")


torch.Size([18, 20, 16])
torch.Size([18, 20])
Epoch 10, Loss: 1.0479233264923096
Epoch 20, Loss: 0.14598821103572845
Epoch 30, Loss: 0.04988078027963638
Epoch 40, Loss: 0.038093872368335724
Epoch 50, Loss: 0.034555356949567795
Epoch 60, Loss: 0.03329882770776749
Epoch 70, Loss: 0.0328485369682312
Epoch 80, Loss: 0.032540462911129
Epoch 90, Loss: 0.03237507492303848
Epoch 100, Loss: 0.03226429969072342
RNN Model Accuracy: 98.06%


# Task 2.

In [19]:
!pip install transformers datasets huggingface_hub evaluate

from huggingface_hub import login
login(token='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')

from datasets import load_dataset
dataset = load_dataset('sepidmnorozy/Korean_sentiment')

print(dataset)

from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

model_name_1 = 'beomi/KcELECTRA-base'
model_name_2 = 'kykim/bert-kor-base'

# 모델 1 (KcELECTRA-base)
tokenizer_1 = AutoTokenizer.from_pretrained(model_name_1)
model_1 = AutoModelForSequenceClassification.from_pretrained(model_name_1)
classifier_1 = pipeline('sentiment-analysis', model=model_1, tokenizer=tokenizer_1, device=0)

# 모델 2 (bert-kor-base)
tokenizer_2 = AutoTokenizer.from_pretrained(model_name_2)
model_2 = AutoModelForSequenceClassification.from_pretrained(model_name_2)
classifier_2 = pipeline('sentiment-analysis', model=model_2, tokenizer=tokenizer_2, device=0)

import numpy as np
test_data = dataset['test']
rng = np.random.default_rng()
random_indices = rng.choice(len(test_data), size=100, replace=False)
sampled_data = test_data.select(random_indices)

import evaluate
accuracy = evaluate.load('accuracy')

def compute_metrics(predictions, labels):
    pred_labels = [pred['label'] for pred in predictions]
    mapped_labels = [1 if label == 'positive' else 0 for label in pred_labels]  # 긍정: 1, 부정: 0
    return accuracy.compute(predictions=mapped_labels, references=labels)

def evaluate_model(classifier, sampled_data):
    predictions = classifier([text for text in sampled_data['text']], truncation=True, padding=True, max_length=512)
    return compute_metrics(predictions, sampled_data['label'])

results_1 = []
results_2 = []

for _ in range(3):
    result_1 = evaluate_model(classifier_1, sampled_data)
    results_1.append(result_1['accuracy'])

    result_2 = evaluate_model(classifier_2, sampled_data)
    results_2.append(result_2['accuracy'])

avg_result_1 = np.mean(results_1)
avg_result_2 = np.mean(results_2)

print(f"모델 1 ({model_name_1}) 평균 정확도: {avg_result_1}")
print(f"모델 2 ({model_name_2}) 평균 정확도: {avg_result_2}")

if avg_result_1 > avg_result_2:
    print(f"모델 1 ({model_name_1})가 더 우수합니다.")
else:
    print(f"모델 2 ({model_name_2})가 더 우수합니다.")


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 36000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1333
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2667
    })
})


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at kykim/bert-kor-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


모델 1 (beomi/KcELECTRA-base) 평균 정확도: 0.5
모델 2 (kykim/bert-kor-base) 평균 정확도: 0.5
모델 2 (kykim/bert-kor-base)가 더 우수합니다.
