<a href="https://colab.research.google.com/github/Tiabet/BaekJoon/blob/main/SentenceTransformer%2BBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
%%capture

!pip install transformers
!pip install accelerate -U
!pip install sentencepiece

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import random
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments
import torch
from torch.utils.data import TensorDataset, Subset
from sklearn.model_selection import train_test_split

In [2]:
SEED = 0

np.random.seed(SEED)
random.seed(SEED)

In [3]:
df = pd.read_csv('/content/drive/MyDrive/news.csv')
# 제목 + 내용
df['text'] = df['title'] + ' : ' + df['contents']
df['text']

0        Spanish coach facing action in race row : MADR...
1        Bruce Lee statue for divided city : In Bosnia,...
2        Only Lovers Left Alive's Tilda Swinton Talks A...
3        Macromedia contributes to eBay Stores : Macrom...
4        Qualcomm plans to phone it in on cellular repa...
                               ...                        
59995    Dolphins Break Through, Rip Rams For First Win...
59996    After Steep Drop, Price of Oil Rises : The fre...
59997    Pro football: Culpepper puts on a show : To sa...
59998    Albertsons on the Rebound : The No. 2 grocer r...
59999    Cassini Craft Spies Saturn Moon Dione (AP) : A...
Name: text, Length: 60000, dtype: object

In [4]:
def preprocess_text(text):
    if not isinstance(text, str):
          return text  # If the input is not a string, return it as is
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    text = re.sub(r':\s*//.*$', '', text)

    return text.lower()

In [5]:
df['processed_text'] = df['text'].apply(preprocess_text)

In [6]:
df_label = pd.read_csv("/content/drive/MyDrive/baseline_submit.csv")
df['preliminary_label'] = df_label['category']

In [7]:
# Load a pre-trained BERT model and tokenizer
model_name = 'textattack/albert-base-v2-ag-news'
tokenizer = AutoTokenizer.from_pretrained(model_name)

n_clusters = 6

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_clusters)


# Encode text data for BERT
encoded_data = tokenizer(df['processed_text'].tolist(), padding=True, truncation=True, return_tensors='pt')


Downloading pytorch_model.bin:   0%|          | 0.00/46.8M [00:00<?, ?B/s]

RuntimeError: ignored

In [7]:
# Prepare the dataset for training
dataset = torch.utils.data.TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], torch.tensor(df['preliminary_label']))

In [17]:
# Define the split ratios (e.g., 70% train, 15% validation, 15% test)
train_ratio = 0.2

# Calculate the number of examples for each split
num_examples = len(dataset)
num_train_examples = int(train_ratio * num_examples)

# Create indices for train, validation, and test splits
indices = list(range(num_examples))
train_indices, remaining_indices = train_test_split(indices, train_size=num_train_examples, random_state=42)

# Create Subset objects for each split
train_subset = Subset(dataset, train_indices)

In [19]:
# Training settings
training_args = TrainingArguments(
    output_dir='./bert_base_uncased_model',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    save_total_limit=2,
    save_steps=500,
    learning_rate=2e-5,
)

# Create a Trainer instance for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=lambda features: {
        'input_ids': torch.stack([f[0] for f in features]),
        'attention_mask': torch.stack([f[1] for f in features]),
        'labels': torch.tensor([f[2] for f in features])
    },
    train_dataset=train_subset,
)

# Fine-tune the model on the preliminary labels
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

In [None]:
# Load the trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained('./bert_base_uncased_model')  # Replace with the path to your trained model
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load your 60,000 unlabeled news articles into a DataFrame or any suitable data structure.
# For this example, we assume you have a DataFrame with a 'text' column.

# Preprocess the unlabeled data
encoded_data = tokenizer(df['text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Make predictions using the trained model
with torch.no_grad():
    logits = model(**encoded_data).logits
    predicted_labels = np.argmax(logits, axis=1)

# Add the predicted categories as labels to your DataFrame
df['predicted_label'] = predicted_labels

In [None]:
sample = pd.read_csv('sample_submission.csv')
sample['category'] = df['predicted_label']
sample.to_csv('baseline_submit.csv', index=False)