Trump Tweet Dataset

In [None]:
import pandas as pd
# Read the combined CSV export into the working DataFrame `df`.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/seonahryu/Desktop/urp/combined_output.csv",
)

# 감성분석

### VADER

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon resource, then create a single analyzer
# instance that sentiment_analysis() reuses for every text.
vader_resource = 'vader_lexicon'
nltk.download(vader_resource)
sia = SentimentIntensityAnalyzer()

In [None]:
def sentiment_analysis(text):
    """Return the VADER compound polarity score for ``text``.

    Passes ``text`` to the module-level analyzer ``sia`` and returns the
    ``'compound'`` entry of the score dict it produces.
    """
    return sia.polarity_scores(text)['compound']

In [None]:
# Score every post with VADER and store the result in a new column.
vader_scores = df['post_content'].apply(sentiment_analysis)
df['vader_sentiment'] = vader_scores

# Print the first ten posts next to their scores.
preview_columns = ['post_content', 'vader_sentiment']
print(df[preview_columns].head(10))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram (30 bins, with a KDE overlay) of the VADER compound scores.
plt.figure(figsize=(10, 6))
# The scores live in the 'vader_sentiment' column; the previous name
# 'vader_sentiment_score' is never created in this notebook and raised KeyError.
sns.histplot(df['vader_sentiment'], bins=30, kde=True)
plt.title('VADER(compound score) Sentiment Score Distribution')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.axvline(x=0, color='green', linestyle='dotted')  # 0 marks the neutral point
plt.show()

In [None]:
# Hyperparameter: scores within [-threshold, threshold] are counted as neutral.
threshold = 0.1

# Count positive, negative and neutral posts from the VADER scores.
compound = df['vader_sentiment']
positive_count = (compound > threshold).sum()
negative_count = (compound < -threshold).sum()
neutral_count = ((compound >= -threshold) & (compound <= threshold)).sum()

total_count = positive_count + negative_count + neutral_count

print(f'total : {total_count}개')
print(f'positive : {positive_count}개')
print(f'negative : {negative_count}개')
print(f'neutral : {neutral_count}개')

# Share of each class, in percent of the counted posts.
positive_percentage = positive_count / total_count * 100
negative_percentage = negative_count / total_count * 100
neutral_percentage = neutral_count / total_count * 100

print(f'positive : {positive_percentage}%')
print(f'negative : {negative_percentage}%')
print(f'neutral : {neutral_percentage}%')

# Build a small DataFrame holding the percentages.
sentiment_labels = ['Positive', 'Negative', 'Neutral']
sentiment_percentages = [
    positive_percentage,
    negative_percentage,
    neutral_percentage,
]
sentiment_distribution = pd.DataFrame({
    'Sentiment': sentiment_labels,
    'Percentage': sentiment_percentages,
})

# Bar chart of the percentages on a fixed 0-100 axis.
plt.figure(figsize=(8, 5))
sns.barplot(data=sentiment_distribution, x='Sentiment', y='Percentage')
plt.title('Sentiment Distribution Using VADER')
plt.xlabel('Sentiment')
plt.ylabel('Percentage (%)')
plt.ylim(0, 100)
plt.show()

### RoBERTa

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import pandas as pd

# Load the pretrained RoBERTa tokenizer and classification model.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

# Put the model into evaluation mode.
model.eval()

# Sentiment classification helper.
def classify_sentiment(text):
    """Classify ``text`` with the RoBERTa model and return -1, 0 or 1.

    The text is tokenized (truncated to at most 128 tokens) and run through
    the model without gradient tracking. The class with the highest softmax
    probability is mapped to a label: class 0 -> -1 (negative),
    class 1 -> 0 (neutral), any other class -> 1 (positive).
    """
    batch = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )
    with torch.no_grad():
        logits = model(**batch).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

        # Index of the most probable class; the first one wins on ties.
        best_class = max(range(len(probabilities)), key=probabilities.__getitem__)

        # Classes 0 and 1 have explicit labels; everything else is positive.
        label_by_class = {0: -1, 1: 0}
        return label_by_class.get(best_class, 1)

# Add a RoBERTa sentiment column to df and fill it in row by row.
df["roberta_sentiment"] = None

for i, post in df["post_content"].items():
    df.at[i, "roberta_sentiment"] = classify_sentiment(post)

    # Report progress whenever the row label is a multiple of 100.
    if i % 100 == 0:
        print(f"Processed {i} out of {len(df)} rows")

# Print the first rows of the result.
result_columns = ["post_content", "roberta_sentiment"]
print(df[result_columns].head())

In [None]:
# Count positive (1), negative (-1) and neutral (0) posts from the RoBERTa labels.
roberta_labels = df['roberta_sentiment']
positive_count = (roberta_labels == 1).sum()
negative_count = (roberta_labels == -1).sum()
neutral_count = (roberta_labels == 0).sum()

total_count = positive_count + negative_count + neutral_count

print(f'total : {total_count}개')
print(f'positive : {positive_count}개')
print(f'negative : {negative_count}개')
print(f'neutral : {neutral_count}개')

# Share of each class, in percent of the counted posts.
positive_percentage = positive_count / total_count * 100
negative_percentage = negative_count / total_count * 100
neutral_percentage = neutral_count / total_count * 100

print(f'positive : {positive_percentage:.2f}%')
print(f'negative : {negative_percentage:.2f}%')
print(f'neutral : {neutral_percentage:.2f}%')

# Build a small DataFrame holding the percentages.
class_names = ['Positive', 'Negative', 'Neutral']
class_percentages = [
    positive_percentage,
    negative_percentage,
    neutral_percentage,
]
sentiment_distribution = pd.DataFrame({
    'Sentiment': class_names,
    'Percentage': class_percentages,
})

# Bar chart of the percentages on a fixed 0-100 axis.
plt.figure(figsize=(8, 5))
sns.barplot(data=sentiment_distribution, x='Sentiment', y='Percentage')
plt.title('Sentiment Distribution Using RoBERTa')
plt.xlabel('Sentiment')
plt.ylabel('Percentage (%)')
plt.ylim(0, 100)
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the VADER and RoBERTa sentiment columns against the row index.
plt.figure(figsize=(16, 9))

# Draw one line per column. The closing quote on 'roberta_sentiment' was
# missing, which made this cell a SyntaxError.
sns.lineplot(data=df[['vader_sentiment', 'roberta_sentiment']])

# Axis labels, title and legend.
plt.xlabel('Index')
plt.ylabel('Sentiment Score')
plt.title('Sentiment Scores Comparison')
plt.legend(['VADER Sentiment', 'RoBERTa Sentiment'])

# Display the figure.
plt.show()

TODO: df의 post_time을 기준으로, NASDAQ 종가 예측에 사용할 prediction_date 열을 추가해야 함!

In [None]:
# Write df to CSV without the index column.
df.to_csv(path_or_buf='tweet_sentiment_prediction.csv', index=False)