<a href="https://colab.research.google.com/github/Tiabet/BaekJoon/blob/main/albert_pretrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
# Install the third-party packages used by this notebook.
# The %%capture cell magic on the first line silences pip's output.

# Hugging Face Transformers (tokenizers, models, Trainer).
!pip install transformers
# `-U` upgrades accelerate if an older version is already installed.
!pip install accelerate -U
!pip install sentencepiece
# Provides SentenceTransformer, used for the title embeddings.
!pip install sentence_transformers

In [18]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import random
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments
import torch
from torch.utils.data import TensorDataset, Subset
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

In [4]:
# Single seed shared by the random number generators used in this notebook.
SEED = 0

# Seed every RNG the notebook relies on. The original cell seeded only NumPy
# and `random`; PyTorch was left unseeded even though the classification heads
# loaded later are newly initialised (see the "newly initialized" warnings
# printed when the models are loaded).
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

In [5]:
# Read the news dataset from Google Drive.
df = pd.read_csv('/content/drive/MyDrive/news.csv')

# Add a combined field: "<title> : <contents>".
df = df.assign(text=df['title'] + ' : ' + df['contents'])

# Show the combined column as the cell output.
df['text']

0        Spanish coach facing action in race row : MADR...
1        Bruce Lee statue for divided city : In Bosnia,...
2        Only Lovers Left Alive's Tilda Swinton Talks A...
3        Macromedia contributes to eBay Stores : Macrom...
4        Qualcomm plans to phone it in on cellular repa...
                               ...                        
59995    Dolphins Break Through, Rip Rams For First Win...
59996    After Steep Drop, Price of Oil Rises : The fre...
59997    Pro football: Culpepper puts on a show : To sa...
59998    Albertsons on the Rebound : The No. 2 grocer r...
59999    Cassini Craft Spies Saturn Moon Dione (AP) : A...
Name: text, Length: 60000, dtype: object

In [6]:
def preprocess_text(text):
    """Clean one raw news string before embedding / tokenisation.

    Steps, in order: remove URLs, hashtags and @mentions; drop every
    non-ASCII character; collapse whitespace; remove digit runs; remove a
    trailing ``: //...`` fragment; collapse whitespace again; lowercase.

    Args:
        text: The raw string. Any non-string value is returned unchanged.

    Returns:
        The cleaned, lower-cased string without leading/trailing whitespace
        or runs of consecutive whitespace, or ``text`` itself when it is not
        a ``str``.
    """
    if not isinstance(text, str):
        return text  # If the input is not a string, return it as is

    # Remove URLs.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove hashtags.
    text = re.sub(r'#\w+', '', text)

    # Remove @mentions.
    text = re.sub(r'@\w+', '', text)

    # Drop all non-ASCII characters. This removes emoji, but also any other
    # non-ASCII character such as accented letters.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Collapse whitespace so the text is a single line before the
    # end-anchored pattern below is applied.
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove digits.
    text = re.sub(r'\d+', '', text)

    # Remove a trailing ": //..." fragment.
    text = re.sub(r':\s*//.*$', '', text)

    # The two removals above can leave double, leading or trailing spaces
    # (e.g. "Top 10 Movies" -> "Top  Movies"), so normalise whitespace once
    # more as the last step.
    text = re.sub(r'\s+', ' ', text).strip()

    return text.lower()

In [16]:
# Clean both the combined text and the bare title with preprocess_text.
for source_col, cleaned_col in (('text', 'processed_text'), ('title', 'processed_title')):
    df[cleaned_col] = df[source_col].apply(preprocess_text)

In [19]:
# Load the Sentence-BERT model.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Embed the cleaned titles.
titles = df['processed_title'].tolist()
sentence_embeddings = model.encode(titles)

# Keep the embeddings in a DataFrame as well.
df_embeddings = pd.DataFrame(sentence_embeddings)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [20]:
# Cluster the Sentence-BERT title embeddings into 6 groups.
# `n_init` is pinned explicitly: its default differs between scikit-learn
# releases, and the hand-written cluster-id -> label mapping used later only
# holds for one specific clustering result.
# NOTE(review): 10 was the long-standing default; confirm it matches the
# scikit-learn version the mapping was derived with.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=SEED)

# Store each row's cluster id.
df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)



In [48]:
# Peek at the first five texts assigned to cluster 1.
df.loc[df['kmeans_cluster'] == 1, 'text'].head(5)

7     Bump Stock Maker Resumes Sales One Month After...
11    Kerry rolls out tax-cut plan for middle class ...
51    Oil Falls Below \$49 on Nigeria Cease-Fire : L...
70    ABN Amro Profit Rises, Buoyed by Sale of Asia ...
85    Stocks to Open Higher on Growth Outlook : NEW ...
Name: text, dtype: object

In [49]:
  mapping_dict = {
    0: 1, #Entertainment
    1: 0, #Business
    2: 2, #Politics
    3: 4, #Tech
    4: 5, #World
    5: 3  #Sports
}
df['mapping'] = df['kmeans_cluster'].apply(lambda x: mapping_dict[x])
sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv')
sample['category'] = df['mapping'].values
sample.to_csv('/content/drive/MyDrive/baseline_submit.csv', index=False)

In [85]:
# Checkpoint to fine-tune (an ALBERT sequence-classification model).
model_name = 'textattack/albert-base-v2-ag-news'

# Number of target classes.
n_clusters = 6

tokenizer = AutoTokenizer.from_pretrained(model_name)

# ignore_mismatched_sizes=True allows loading although the checkpoint's
# classifier head has a different number of labels than n_clusters.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_clusters,
    ignore_mismatched_sizes=True,
)

# Tokenise the cleaned titles into padded / truncated PyTorch tensors.
titles = df['processed_title'].tolist()
encoded_data = tokenizer(titles, padding=True, truncation=True, return_tensors='pt')

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at textattack/albert-base-v2-ag-news and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([4, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([4]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
# Bundle token ids, attention masks and the cluster-derived labels.
labels = torch.tensor(df['mapping'])
dataset = TensorDataset(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    labels,
)

In [87]:
# Fraction of the dataset used for fine-tuning.
train_ratio = 0.2

# Number of training examples implied by that fraction.
num_examples = len(dataset)
num_train_examples = int(train_ratio * num_examples)

# Randomly choose the training indices; the remaining indices are returned
# but not used in this cell.
indices = list(range(num_examples))
train_indices, remaining_indices = train_test_split(
    indices,
    train_size=num_train_examples,
    random_state=42,
)

# Only a training subset is created here.
train_subset = Subset(dataset, train_indices)

In [54]:
# Log in to the Hugging Face Hub from inside the notebook.
# Presumably required because training below is configured with push_to_hub=True.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [65]:
def collate_batch(features):
    """Turn a list of (input_ids, attention_mask, label) items into a batch dict."""
    return {
        'input_ids': torch.stack([item[0] for item in features]),
        'attention_mask': torch.stack([item[1] for item in features]),
        'labels': torch.tensor([item[2] for item in features]),
    }


# Hyper-parameters and checkpoint / hub settings for fine-tuning.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-Tiabet",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_total_limit=2,
    save_steps=500,
    learning_rate=2e-5,
    push_to_hub=True,
)

# Trainer that fine-tunes `model` on the training subset only.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_batch,
    train_dataset=train_subset,
)

# Fine-tune on the preliminary (cluster-derived) labels.
trainer.train()

Step,Training Loss
500,0.9408
1000,0.6655
1500,0.7547
2000,0.7011
2500,0.6777
3000,0.7166
3500,0.5021
4000,0.5057
4500,0.5435
5000,0.5374


TrainOutput(global_step=9000, training_loss=0.5288315921359592, metrics={'train_runtime': 458.0982, 'train_samples_per_second': 78.586, 'train_steps_per_second': 19.646, 'total_flos': 87411984192000.0, 'train_loss': 0.5288315921359592, 'epoch': 3.0})

In [66]:
# Upload the fine-tuned model to the Hugging Face Hub.
trainer.push_to_hub(
    commit_message="Training complete",
    tags="text classification",
)

'https://huggingface.co/Tiabet/albert-base-v2-ag-news-finetuned-Tiabet/tree/main/'

In [67]:
# Inference: predict a label for every example in `dataset`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
predictions = []

# shuffle=False keeps predictions in the same order as the dataset rows.
test_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False)

with torch.no_grad():
    for step, batch in enumerate(test_loader):
        # Progress message every 100 batches (text means "batch N starting").
        if step % 100 == 0:
            print(f"{step}번째 배치 시작")
        input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The highest-scoring class per row is the prediction.
        predictions.extend(logits.argmax(dim=1).tolist())

0번째 배치 시작
100번째 배치 시작
200번째 배치 시작
300번째 배치 시작
400번째 배치 시작
500번째 배치 시작
600번째 배치 시작
700번째 배치 시작
800번째 배치 시작
900번째 배치 시작
1000번째 배치 시작
1100번째 배치 시작
1200번째 배치 시작
1300번째 배치 시작
1400번째 배치 시작
1500번째 배치 시작
1600번째 배치 시작
1700번째 배치 시작
1800번째 배치 시작
1900번째 배치 시작
2000번째 배치 시작
2100번째 배치 시작
2200번째 배치 시작
2300번째 배치 시작
2400번째 배치 시작
2500번째 배치 시작
2600번째 배치 시작
2700번째 배치 시작
2800번째 배치 시작
2900번째 배치 시작
3000번째 배치 시작
3100번째 배치 시작
3200번째 배치 시작
3300번째 배치 시작
3400번째 배치 시작
3500번째 배치 시작
3600번째 배치 시작
3700번째 배치 시작


In [68]:
# Fill the submission template with the model predictions and save it.
sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv').assign(category=predictions)
sample.to_csv('/content/drive/MyDrive/albert_based_submit_2.csv', index=False)


distilbert YuehHuan


In [73]:
# Checkpoint to fine-tune (a DistilBERT sequence-classification model).
model_name = 'alimazhar-110/website_classification'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ignore_mismatched_sizes=True allows loading although the checkpoint's
# classifier head has a different number of labels than n_clusters.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_clusters,
    ignore_mismatched_sizes=True,
)

# Tokenise the cleaned titles into padded / truncated PyTorch tensors.
titles = df['processed_title'].tolist()
encoded_data = tokenizer(titles, padding=True, truncation=True, return_tensors='pt')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at alimazhar-110/website_classification and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([16, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([16]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# Bundle token ids, attention masks and the cluster-derived labels.
labels = torch.tensor(df['mapping'])
dataset = TensorDataset(
    encoded_data['input_ids'],
    encoded_data['attention_mask'],
    labels,
)

In [80]:
# Fraction of the dataset used for fine-tuning.
train_ratio = 0.2

# Number of training examples implied by that fraction.
num_examples = len(dataset)
num_train_examples = int(train_ratio * num_examples)

# Randomly choose the training indices; the remaining indices are returned
# but not used in this cell.
indices = list(range(num_examples))
train_indices, remaining_indices = train_test_split(
    indices,
    train_size=num_train_examples,
    random_state=42,
)

# Only a training subset is created here.
train_subset = Subset(dataset, train_indices)

In [81]:
def collate_batch(features):
    """Turn a list of (input_ids, attention_mask, label) items into a batch dict."""
    return {
        'input_ids': torch.stack([item[0] for item in features]),
        'attention_mask': torch.stack([item[1] for item in features]),
        'labels': torch.tensor([item[2] for item in features]),
    }


# Hyper-parameters and checkpoint / hub settings for fine-tuning.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-Tiabet",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    save_total_limit=2,
    save_steps=500,
    learning_rate=2e-5,
    push_to_hub=True,
)

# Trainer that fine-tunes `model` on the training subset only.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_batch,
    train_dataset=train_subset,
)

# Fine-tune on the preliminary (cluster-derived) labels.
trainer.train()

Step,Training Loss
500,0.3953
1000,0.3392
1500,0.3464
2000,0.1439
2500,0.1778
3000,0.1787
3500,0.0637
4000,0.0733
4500,0.0807
5000,0.023


TrainOutput(global_step=7500, training_loss=0.1282192071914673, metrics={'train_runtime': 452.3809, 'train_samples_per_second': 132.632, 'train_steps_per_second': 16.579, 'total_flos': 667559117520000.0, 'train_loss': 0.1282192071914673, 'epoch': 5.0})

In [82]:
# Inference: predict a label for every example in `dataset`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
predictions = []

# shuffle=False keeps predictions in the same order as the dataset rows.
test_loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=False)

with torch.no_grad():
    for step, batch in enumerate(test_loader):
        # Progress message every 100 batches (text means "batch N starting").
        if step % 100 == 0:
            print(f"{step}번째 배치 시작")
        input_ids, attention_mask, targets = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask=attention_mask).logits
        # The highest-scoring class per row is the prediction.
        predictions.extend(logits.argmax(dim=1).tolist())

0번째 배치 시작
100번째 배치 시작
200번째 배치 시작
300번째 배치 시작
400번째 배치 시작
500번째 배치 시작
600번째 배치 시작
700번째 배치 시작
800번째 배치 시작
900번째 배치 시작
1000번째 배치 시작
1100번째 배치 시작
1200번째 배치 시작
1300번째 배치 시작
1400번째 배치 시작
1500번째 배치 시작
1600번째 배치 시작
1700번째 배치 시작
1800번째 배치 시작
1900번째 배치 시작
2000번째 배치 시작
2100번째 배치 시작
2200번째 배치 시작
2300번째 배치 시작
2400번째 배치 시작
2500번째 배치 시작
2600번째 배치 시작
2700번째 배치 시작
2800번째 배치 시작
2900번째 배치 시작
3000번째 배치 시작
3100번째 배치 시작
3200번째 배치 시작
3300번째 배치 시작
3400번째 배치 시작
3500번째 배치 시작
3600번째 배치 시작
3700번째 배치 시작


In [84]:
# Fill the submission template with the model predictions and save it.
sample = pd.read_csv('/content/drive/MyDrive/sample_submission.csv').assign(category=predictions)
sample.to_csv('/content/drive/MyDrive/distilbert_based_submit_2.csv', index=False)