# 1. (문장 단위)전처리

In [22]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import re

# nltk.download('punkt')

# 문장 단위 토큰화 함수
def split_into_sentences(text):
    if not isinstance(text, str):  # 문자열이 아닌 경우 빈 문자열 반환
        return []
    return sent_tokenize(text)

def preprocess_sentence(sentence):
    if not isinstance(sentence, str):  # 문자열이 아닌 경우 빈 문자열 반환
        return ""
    sentence = sentence.lower()  # 소문자 변환
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # 특수문자 제거
    return sentence.strip()

# CSV 파일 로드
df = pd.read_csv('Warframe_reviews.csv')

# NaN 값을 빈 문자열로 대체 후 문자열 타입 변환
df['review_text'] = df['review_text'].fillna('').astype(str)

# 문장 단위로 분리
df['sentences'] = df['review_text'].apply(split_into_sentences)

# 문장 단위 데이터 펼치기
sentence_df = df.explode('sentences').reset_index(drop=True)
sentence_df.rename(columns={'sentences': 'sentence'}, inplace=True)

# 전처리 적용
sentence_df['cleaned_sentence'] = sentence_df['sentence'].apply(preprocess_sentence)

# 결과 확인
print(sentence_df[['sentence', 'cleaned_sentence']].head())


                                            sentence  \
0  Dealing 5 billion damage to a 100 health point...   
1                               Welcome to Warframe.   
2  Play the same mission 231 times for the peepee...   
3  The best part of this game is figuring out wha...   
4  I hope you guys from The First Descendant, Des...   

                                    cleaned_sentence  
0  dealing  billion damage to a  health points en...  
1                                welcome to warframe  
2  play the same mission  times for the peepeepoo...  
3  the best part of this game is figuring out wha...  
4  i hope you guys from the first descendant dest...  


# 2. 임베딩

In [23]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# 모델 로드
model = SentenceTransformer("intfloat/multilingual-e5-large", trust_remote_code=True)

# 임베딩 함수
def get_embeddings_in_batches(texts, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            convert_to_numpy=True,  # NumPy 배열로 반환
            show_progress_bar=True  # 진행 상태 표시
        )
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

# 문장 데이터 임베딩
embeddings = get_embeddings_in_batches(sentence_df['cleaned_sentence'].tolist(), model, batch_size=16)

# 임베딩 추가
sentence_df['embedding'] = list(embeddings)
print("임베딩 크기:", embeddings.shape)
print(sentence_df.head())


Batches:   0%|          | 0/1 [00:05<?, ?it/s]


KeyboardInterrupt: 

# 3. 감정 분석

In [4]:
# 데이터프레임을 CSV로 저장


sentence_df.to_csv("destiny2_sentence_embeddings.csv", index=False)


print("CSV 파일로 저장되었습니다!")


CSV 파일로 저장되었습니다!


In [25]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])  # 감정 레이블 추출
    return sentiments

# 감정 분석 실행: 문장 단위로 감정 분석
sentence_df['sentiment'] = analyze_sentiment(sentence_df['cleaned_sentence'].tolist())

# 결과 확인
print(sentence_df[['cleaned_sentence', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


TypeError: 'float' object is not subscriptable

In [8]:
# 긍정/부정 리뷰 분리
positive_reviews = sentence_df[sentence_df['sentiment'] == 'positive']
negative_reviews = sentence_df[sentence_df['sentiment'] == 'negative']
neutral_reviews = sentence_df[sentence_df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 3319
부정 리뷰 개수: 1717
중립 리뷰 개수: 2265


# 4. 토픽 모델링

In [9]:
from bertopic import BERTopic

# 긍정 리뷰 토픽 모델링
positive_texts = positive_reviews['cleaned_sentence'].tolist()
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(positive_texts)

# 부정 리뷰 토픽 모델링
negative_texts = negative_reviews['cleaned_sentence'].tolist()
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(negative_texts)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())

긍정 리뷰 주요 토픽:     Topic  Count                               Name  \
0      -1    814                   -1_and_the_to_it   
1       0    289               0_destiny_is_and_the   
2       1    115            1_friends_with_play_fun   
3       2    114                           2_ooo___   
4       3     83                3_pve_pvp_side_both   
..    ...    ...                                ...   
70     69     12       69_review_write_playtime_not   
71     70     12            70_gud_goodd_gmae_trust   
72     71     12      71_shape_final_believe_around   
73     72     11  72_poggers_jimbo_feet_grasshopper   
74     73     10         73_hunter_mara_sov_warlock   

                                       Representation  \
0     [and, the, to, it, of, for, its, game, you, is]   
1   [destiny, is, and, the, that, since, of, in, t...   
2   [friends, with, play, fun, have, to, friend, p...   
3                             [ooo, , , , , , , , , ]   
4   [pve, pvp, side, both, in, and, fun, 

In [10]:
# 긍정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# 부정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_negative.get_topic_info())):
    print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")

Topic 0: [('destiny', 0.04471613442847865), ('is', 0.012255329292324943), ('and', 0.012033344113092564), ('the', 0.011179340759426115), ('that', 0.010902878204841695), ('since', 0.010801934680939734), ('of', 0.010799891764249807), ('in', 0.010244268360286144), ('to', 0.009971452604183079), ('have', 0.009944847255136531)]
Topic 1: [('friends', 0.06964247886371094), ('with', 0.04113317418760803), ('play', 0.03328287788049399), ('fun', 0.027216048410687636), ('have', 0.019154884823327386), ('to', 0.017045642016842284), ('friend', 0.016631466235644383), ('playing', 0.015878489099273547), ('game', 0.014610890917057173), ('some', 0.014574077122564323)]
Topic 2: [('ooo', 6.3578422665081), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]
Topic 3: [('pve', 0.08372987069333734), ('pvp', 0.07924447911601182), ('side', 0.016984984825798195), ('both', 0.016557558134275303), ('in', 0.015396826081415748), ('and', 0.015130797232650393

In [12]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_positive.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    data.append([topic_num, ', '.join(keywords)])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords"])

# CSV 저장
csv_filename = "destiny2_positive_topics.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: destiny2_positive_topics.csv


In [15]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

In [16]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

In [14]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_positive.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    weights = [word[1] for word in words]  # 숫자(가중치) 추가
    data.append([topic_num, ', '.join(keywords), ', '.join(map(str, weights))])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords", "Weights"])

# CSV 저장
csv_filename = "destiny2_positive_topics_withnum.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: destiny2_positive_topics_withnum.csv


In [15]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=20)

In [16]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=20)

In [19]:
topic_model_positive.visualize_heatmap()

In [20]:
topic_model_negative.visualize_heatmap()