#1 부정감정 지속 분석

#1.1 기본 설정

In [8]:
!pip install fsspec==2024.9.0

Collecting fsspec==2024.9.0
  Using cached fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Using cached fsspec-2024.9.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.10.0
    Uninstalling fsspec-2024.10.0:
      Successfully uninstalled fsspec-2024.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2024.9.0


In [9]:
!pip install datasets==3.1.0 fsspec[http]==2024.9.0



In [10]:
!pip install transformers torch datasets scikit-learn tqdm



#1.2 데이터셋 불러오기

데이터 로드 및 전처리

In [11]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# 데이터 로드 (NSMC 데이터셋)
dataset = load_dataset("e9t/nsmc")  # Naver Sentiment Movie Corpus

# 데이터프레임 변환
data = dataset['train'].to_pandas()
data = data.rename(columns={"document": "text", "label": "label"})

# 데이터 중복 제거
data = data.drop_duplicates(subset=['text'])

# 사용자 ID 생성 (1,000명의 사용자 임의 생성)
data['user_id'] = np.random.randint(1, 1001, size=len(data))

# 데이터 분할
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=42)

# 데이터 크기 확인
print(f"Train Data Size: {len(train_data)}, Validation Data Size: {len(val_data)}")
print("Train Class Distribution:", train_data['label'].value_counts())
print("Validation Class Distribution:", val_data['label'].value_counts())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

nsmc.py:   0%|          | 0.00/3.18k [00:00<?, ?B/s]

The repository for e9t/nsmc contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/e9t/nsmc.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/14.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.89M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/150000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Train Data Size: 116946, Validation Data Size: 29237
Train Class Distribution: label
0    58673
1    58273
Name: count, dtype: int64
Validation Class Distribution: label
0    14669
1    14568
Name: count, dtype: int64


#1.2 모델준비

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("monologg/kobert")
model = BertForSequenceClassification.from_pretrained("monologg/kobert", num_labels=2)  # 긍정/부정 분류


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#1.4 데이터로더 및 Pytorch dataset


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import PreTrainedTokenizer

# PyTorch Dataset
class EmotionDataset(Dataset):
    def __init__(self, data, tokenizer: PreTrainedTokenizer, max_len=128):
        self.texts = data['text'].tolist()
        self.labels = data['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# DataLoader
train_dataset = EmotionDataset(train_data, tokenizer)
val_dataset = EmotionDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


#1.5 Training

In [None]:
from tqdm import tqdm
from torch.optim import AdamW 
from torch.nn import CrossEntropyLoss

# Optimizer 및 손실 함수 정의
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

# training 함수 정의
def train_model(model, dataloader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0

    # TQDM으로 진행률 표시 및 업데이트
    progress_bar = tqdm(dataloader, desc="Training", leave=True)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        progress_bar.set_postfix({"Batch Loss": loss.item()})

    return total_loss / len(dataloader)

# training
for epoch in range(3):
    print(f"Epoch {epoch + 1} Starting...")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch + 1} Completed. Average Loss: {train_loss:.4f}")


Epoch 1 Starting...


Training: 100%|██████████| 3655/3655 [17:40<00:00,  3.45it/s, Batch Loss=0.718]


Epoch 1 Completed. Average Loss: 0.6972
Epoch 2 Starting...


Training: 100%|██████████| 3655/3655 [17:42<00:00,  3.44it/s, Batch Loss=0.689]


Epoch 2 Completed. Average Loss: 0.6956
Epoch 3 Starting...


Training: 100%|██████████| 3655/3655 [17:42<00:00,  3.44it/s, Batch Loss=0.703]

Epoch 3 Completed. Average Loss: 0.6942





#2 사용자별 시간 기반 분석 + KoBERT

부정 감정 지속성 분석



#2.1 데이터셋 준비

데이터셋에 가상의 timestmap 추가 -> 시간 기반 특성 생성

In [None]:
from datetime import datetime, timedelta
import random

# 랜덤 타임스탬프 생성 함수
def generate_random_timestamps(data, start_time, max_minutes=1440, overlap_chance=0.3):
    timestamps = []
    for _ in range(len(data)):
        if random.random() < overlap_chance and timestamps:  # 일정 확률로 겹치는 값 생성
            timestamps.append(random.choice(timestamps))  # 기존 값에서 랜덤 선택
        else:
            random_minutes = random.randint(0, max_minutes)
            timestamps.append(start_time + timedelta(minutes=random_minutes))
    return timestamps

# 시작 시간 설정 (2023.01.01 00:00)
start_time = datetime(2023, 1, 1)

# 데이터에 랜덤 타임스탬프 추가
train_data['timestamp'] = generate_random_timestamps(train_data, start_time, max_minutes=1440, overlap_chance=0.3)
val_data['timestamp'] = generate_random_timestamps(val_data, start_time, max_minutes=1440, overlap_chance=0.3)

def calculate_emotion_features(data):
    # 사용자별 부정 감정 비율 계산
    data['negative_ratio'] = data.groupby('user_id')['label'].transform(lambda x: x.expanding().mean())
    # 부정 감정 비율의 변화율 계산
    data['rate_of_change'] = data.groupby('user_id')['negative_ratio'].diff().fillna(0)
    return data

# 데이터에 시간 기반 특성 추가
train_data = calculate_emotion_features(train_data)
val_data = calculate_emotion_features(val_data)

print(train_data[['user_id', 'label', 'negative_ratio', 'rate_of_change']].head())
print(val_data[['user_id', 'label', 'negative_ratio', 'rate_of_change']].head())

        user_id  label  negative_ratio  rate_of_change
3514        528      1             1.0             0.0
121000      266      0             0.0             0.0
118591      550      0             0.0             0.0
6198         16      1             1.0             0.0
65717       824      0             0.0             0.0
        user_id  label  negative_ratio  rate_of_change
66554       886      1             1.0             0.0
39459        49      0             0.0             0.0
145026      763      0             0.0             0.0
75996       447      1             1.0             0.0
129919      939      1             1.0             0.0


#2.2 데이터 로더 및 Pytorch Dataset

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from transformers import PreTrainedTokenizer

# PyTorch Dataset
class TimeEnhancedEmotionDataset(Dataset):
    def __init__(self, data, tokenizer: PreTrainedTokenizer, max_len=128):
        self.texts = data['text'].tolist()
        self.labels = data['label'].tolist()
        self.negative_ratio = data['negative_ratio'].tolist()
        self.rate_of_change = data['rate_of_change'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'negative_ratio': torch.tensor(self.negative_ratio[idx], dtype=torch.float),
            'rate_of_change': torch.tensor(self.rate_of_change[idx], dtype=torch.float),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # KoBERT를 사용할 경우 변경 필요

# Dataset
train_dataset = TimeEnhancedEmotionDataset(train_data, tokenizer)
val_dataset = TimeEnhancedEmotionDataset(val_data, tokenizer)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)