<a href="https://colab.research.google.com/github/SeungNap/test2/blob/main/RoBERTa_%EA%B0%90%EC%84%B1%EB%B6%84%EC%84%9D_%2B_%EC%96%91%EA%B7%B9%EC%84%B1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RoBERTa 감성분석

In [None]:
# Install the libraries used by this notebook (quiet mode suppresses pip output)
!pip install scipy transformers pandas openpyxl tqdm matplotlib datasets scikit-learn --quiet

In [None]:
# 📚 라이브러리
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm
from google.colab import files

# Device selection: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# File upload: only the first uploaded file is used
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was uploaded
    raise ValueError("❌ 업로드된 파일이 없습니다.")
filename = next(iter(uploaded))

# Data loading: the workbook must contain a 'Review' column
df = pd.read_excel(filename)
if 'Review' not in df.columns:
    raise ValueError("❌ 'Review' 열이 없습니다.")
# Missing reviews become empty strings so every row can be tokenized
df['Review'] = df['Review'].fillna("")

# Model loading: tokenizer and classification model, moved to the selected device
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# ✅ 감성 분석 함수 (batch)
def analyze_sentiment_batch(texts, max_length=128):
    """Return softmax class probabilities for a batch of texts.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of texts to score. Items that are not already
            strings (for example numeric spreadsheet cells) are converted
            with ``str`` before tokenization.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated. Defaults to 128.

    Returns:
        NumPy array of probabilities with one row per input text. The
        caller reads column 0 as negative, 1 as neutral and 2 as positive.
    """
    # Coerce stray non-string cells so one odd value does not fail the whole batch
    texts = [t if isinstance(t, str) else str(t) for t in texts]
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits.cpu().numpy(), axis=1)
    return probs  # shape: (batch_size, 3)

# Batched sentiment scoring
batch_size = 32
scores = []

for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df['Review'].iloc[i:i+batch_size].tolist()
    try:
        probs = analyze_sentiment_batch(batch_texts)
        # Build the batch result separately so a failure part-way through
        # cannot leave partial scores behind and misalign `scores` with `df`.
        batch_scores = []
        for prob in probs:
            # Convert to Python floats before rounding so the stored values
            # are plain floats rather than NumPy float32 scalars.
            pos, neu, neg = float(prob[2]), float(prob[1]), float(prob[0])
            # Compound score: signed pos/neg gap, damped by the neutral probability
            batch_scores.append(round((pos - neg) * (1 - neu), 4))
    except Exception as e:
        # Best effort: report the failing batch and fall back to 0.0 for its rows
        print(f"❌ 오류 at batch {i}: {e}")
        batch_scores = [0.0] * len(batch_texts)
    scores.extend(batch_scores)

df['sentiment_score'] = scores

# Quartile cut-offs of the sentiment scores (25th and 75th percentiles)
q25, q75 = (df['sentiment_score'].quantile(level) for level in (0.25, 0.75))

print(f"🎯 분위수 기준: Q25={q25:.4f}, Q75={q75:.4f}")

# 🔄 분위수 기반 라벨링 함수
def label_sentiment(score):
    """Map a sentiment score to a class id using the quantile cut-offs.

    Reads the module-level thresholds ``q25`` and ``q75``.

    Returns:
        2 (negative) when ``score <= q25``; otherwise 1 (positive) when
        ``score >= q75``; otherwise 3 (neutral).
    """
    # The lower bound is tested first, so it wins if both thresholds coincide.
    if score <= q25:
        return 2
    if score >= q75:
        return 1
    return 3

# Assign a class id to every row from its sentiment score
df['sentiment_label'] = df['sentiment_score'].apply(label_sentiment)

# Save and download.
# Replace only the final extension so names such as "data.xls" or "data.XLSX"
# still get a distinct output path instead of reusing the input filename.
output_file = filename.rsplit(".", 1)[0] + "_roberta_sentiment_labeled.xlsx"
df.to_excel(output_file, index=False)
files.download(output_file)

print(f"✅ 완료: RoBERTa 감성 분석 + 분위수 라벨링 (1=Positive≥{q75:.2f}, 2=Negative≤{q25:.2f}, 3=Neutral)")


# RoBERTa 양극성

In [None]:
# 📚 라이브러리
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm
from google.colab import files

# Device selection: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# File upload: only the first uploaded file is used
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError when nothing was uploaded
    raise ValueError("❌ 업로드된 파일이 없습니다.")
filename = next(iter(uploaded))

# Data loading: the workbook must contain a 'Review_Raw' column
df = pd.read_excel(filename)
if 'Review_Raw' not in df.columns:
    raise ValueError("❌ 'Review_Raw' 열이 없습니다.")
# Missing reviews become empty strings so every row can be tokenized
df['Review_Raw'] = df['Review_Raw'].fillna("")

# Model loading: tokenizer and classification model, moved to the selected device
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)

# ✅ 감성 분석 함수 (batch 처리)
def analyze_sentiment_batch(texts, max_length=128):
    """Return softmax class probabilities for a batch of texts.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of texts to score. Items that are not already
            strings (for example numeric spreadsheet cells) are converted
            with ``str`` before tokenization.
        max_length: Maximum number of tokens kept per text; longer inputs
            are truncated. Defaults to 128.

    Returns:
        NumPy array of probabilities with one row per input text. The
        caller reads column 0 as negative and column 2 as positive.
    """
    # Coerce stray non-string cells so one odd value does not fail the whole batch
    texts = [t if isinstance(t, str) else str(t) for t in texts]
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Move every input tensor to the device the model lives on
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits.cpu().numpy(), axis=1)
    return probs  # shape: (batch_size, 3)

# Batched polarity scoring
batch_size = 32
polarity_scores = []

for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df['Review_Raw'].iloc[i:i+batch_size].tolist()
    try:
        probs = analyze_sentiment_batch(batch_texts)
        # Build the batch result separately so a failure part-way through
        # cannot leave partial scores behind and add extra fallback rows.
        batch_scores = []
        for prob in probs:
            pos = float(prob[2])
            neg = float(prob[0])
            # Polarity strength: absolute pos/neg gap, in the range 0 to 1
            batch_scores.append(round(abs(pos - neg), 4))
    except Exception as e:
        # Best effort: report the failing batch and fall back to 0.0 for its rows
        print(f"❌ 오류 발생 at batch {i}: {e}")
        batch_scores = [0.0] * len(batch_texts)
    polarity_scores.extend(batch_scores)

# Save the result (the output contains only the Polarity_Score column)
df_result = pd.DataFrame({
    'Polarity_Score': polarity_scores
})

# Replace only the final extension so names such as "data.xls" or "data.XLSX"
# still get a distinct output path instead of reusing the input filename.
output_file = filename.rsplit(".", 1)[0] + "_roberta_polarity_score_0to1.xlsx"
df_result.to_excel(output_file, index=False)
files.download(output_file)

print("✅ 완료: 0~1 범위의 Polarity_Score 결과가 저장되었습니다.")