In [1]:
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import os

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
NVIDIA GeForce RTX 3090


In [2]:
if torch.cuda.is_available():
    print("GPU is available!")
    device = torch.device("cuda")
else:
    print("GPU is not available. Using CPU.")
    device = torch.device("cpu")

GPU is available!


In [3]:
train_data = pd.read_csv("news8.csv")
train_data['full_text'] = train_data['title'] + " " + train_data['contents']

test_data = pd.read_csv("./news_test.csv")
test_data['full_text'] = test_data['title'] + " " + test_data['contents']

In [4]:
train_label = train_data['category']

In [5]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=6)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
train_encodings = tokenizer(train_data['full_text'].tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_data['full_text'].tolist(), truncation=True, padding=True, max_length=512)

In [7]:
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

In [8]:
train_dataset = NewsDataset(train_encodings, train_label)
test_dataset = NewsDataset(test_encodings)

In [9]:
training_args = TrainingArguments(
    output_dir='./roberta',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,  # 예측 시 사용할 배치 크기를 늘립니다. GPU 메모리에 따라 조정해야 합니다.
    num_train_epochs=2,
    logging_dir='./logs',
    no_cuda=False  # Ensure GPU is used
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

In [11]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmunicef[0m ([33msrgan[0m). Use [1m`wandb login --relogin`[0m to force relogin


A Jupyter Widget

{'loss': 0.7237, 'learning_rate': 4e-05, 'epoch': 0.4}
{'loss': 0.5271, 'learning_rate': 3e-05, 'epoch': 0.8}
{'loss': 0.4205, 'learning_rate': 2e-05, 'epoch': 1.2}
{'loss': 0.3236, 'learning_rate': 1e-05, 'epoch': 1.6}
{'loss': 0.2451, 'learning_rate': 0.0, 'epoch': 2.0}
{'train_runtime': 1524.2819, 'train_samples_per_second': 13.121, 'train_steps_per_second': 1.64, 'train_loss': 0.44797908020019533, 'epoch': 2.0}


TrainOutput(global_step=2500, training_loss=0.44797908020019533, metrics={'train_runtime': 1524.2819, 'train_samples_per_second': 13.121, 'train_steps_per_second': 1.64, 'train_loss': 0.44797908020019533, 'epoch': 2.0})

In [12]:
# Predicting categories for the test data
predictions = trainer.predict(test_dataset).predictions.argmax(-1)
test_data['predicted_category'] = predictions

A Jupyter Widget

In [13]:
test_data

Unnamed: 0,id,title,contents,full_text,predicted_category
0,NEWS_10000,Yahoo! buys email search firm,"Yahoo! Inc has bought Stata Labs Inc, maker of...",Yahoo! buys email search firm Yahoo! Inc has b...,4
1,NEWS_10001,MCI reports \$3.4 billion loss,WASHINGTON - MCI Inc. on Thursday reported a \...,MCI reports \$3.4 billion loss WASHINGTON - MC...,4
2,NEWS_10002,Musharraf #39;s choice elected as new PM,Pakistan #39;s parliament elected Shaukat Aziz...,Musharraf #39;s choice elected as new PM Pakis...,5
3,NEWS_10003,Mozilla launching second act with e-mail client,Editor #39;s Summary: The Mozilla Foundation h...,Mozilla launching second act with e-mail clien...,4
4,NEWS_10004,The Power of Cities,//www.huffingtonpost.com/entry/the-power-of-ci...,The Power of Cities //www.huffingtonpost.com/e...,2
...,...,...,...,...,...
49995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,"Dolphins Break Through, Rip Rams For First Win...",3
49996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,"After Steep Drop, Price of Oil Rises The freef...",0
49997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,Pro football: Culpepper puts on a show To say ...,3
49998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,Albertsons on the Rebound The No. 2 grocer rep...,0


In [23]:
submission = pd.read_csv("./sample_submission.csv")
submission1 = submission.head(10000)
submission2 = submission.tail(50000)
submission1['category'] = train_data['category']
submission2['category'] = predictions
submission3 = pd.concat([submission1, submission2], axis=0)
submission3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission1['category'] = train_data['category']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission2['category'] = predictions


Unnamed: 0,id,category
0,NEWS_00000,3
1,NEWS_00001,1
2,NEWS_00002,1
3,NEWS_00003,4
4,NEWS_00004,4
...,...,...
59995,NEWS_59995,3
59996,NEWS_59996,0
59997,NEWS_59997,3
59998,NEWS_59998,0


In [24]:
submission3.to_csv("submit_roberta_large_2epochs_news8.csv", index=False)