# 1. (문장 단위)전처리

In [23]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import re

#nltk.download('punkt')

# 문장 단위 토큰화 함수
def split_into_sentences(text):
    return sent_tokenize(text)

def preprocess_sentence(sentence):
    sentence = sentence.lower()  # 소문자 변환
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # 특수문자 제거
    return sentence.strip()

df = pd.read_csv('Warframe_reviews.csv')

# 문장 단위로 분리
df['sentences'] = df['review_text'].apply(split_into_sentences)

# 문장 단위 데이터 펼치기
sentence_df = df.explode('sentences').reset_index(drop=True)
sentence_df.rename(columns={'sentences': 'sentence'}, inplace=True)

# 전처리 적용
sentence_df['cleaned_sentence'] = sentence_df['sentence'].apply(preprocess_sentence)
print(sentence_df[['sentence', 'cleaned_sentence']].head())

print(sentence_df.head())

                                            sentence  \
0  Dealing 5 billion damage to a 100 health point...   
1                               Welcome to Warframe.   
2  Play the same mission 231 times for the peepee...   
3  The best part of this game is figuring out wha...   
4  I hope you guys from The First Descendant, Des...   

                                    cleaned_sentence  
0  dealing  billion damage to a  health points en...  
1                                welcome to warframe  
2  play the same mission  times for the peepeepoo...  
3  the best part of this game is figuring out wha...  
4  i hope you guys from the first descendant dest...  
      title              id  \
0  Warframe       Bowieober   
1  Warframe  BamBoozlerC0L3   
2  Warframe  BamBoozlerC0L3   
3  Warframe     Mumbo Magic   
4  Warframe         RushCit   

                                         review_text recommendation  \
0  Dealing 5 billion damage to a 100 health point...    Recommended   
1  W

# 2. 임베딩

In [24]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# 모델 로드
model = SentenceTransformer("intfloat/multilingual-e5-large", trust_remote_code=True)

# 임베딩 함수
def get_embeddings_in_batches(texts, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            convert_to_numpy=True,  # NumPy 배열로 반환
            show_progress_bar=True  # 진행 상태 표시
        )
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

# 문장 데이터 임베딩
embeddings = get_embeddings_in_batches(sentence_df['cleaned_sentence'].tolist(), model, batch_size=16)

# 임베딩 추가
sentence_df['embedding'] = list(embeddings)
print("임베딩 크기:", embeddings.shape)
print(sentence_df.head())


Batches: 100%|██████████| 1/1 [00:07<00:00,  7.37s/it]
Batches: 100%|██████████| 1/1 [00:12<00:00, 12.10s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.68s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.82s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.15s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.23s/it]
Batches: 100%|██████████| 1/1 [00:36<00:00, 36.13s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.25s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.34s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.03s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.64s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.32s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.09s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.77s/it]
Batches: 1

임베딩 크기: (10692, 1024)
      title              id  \
0  Warframe       Bowieober   
1  Warframe  BamBoozlerC0L3   
2  Warframe  BamBoozlerC0L3   
3  Warframe     Mumbo Magic   
4  Warframe         RushCit   

                                         review_text recommendation  \
0  Dealing 5 billion damage to a 100 health point...    Recommended   
1  Welcome to Warframe. Play the same mission 231...    Recommended   
2  Welcome to Warframe. Play the same mission 231...    Recommended   
3  The best part of this game is figuring out wha...    Recommended   
4  I hope you guys from The First Descendant, Des...    Recommended   

  posted_date  playtime                                           sentence  \
0  2022.11.28       1.0  Dealing 5 billion damage to a 100 health point...   
1  2022.05.08       4.0                               Welcome to Warframe.   
2  2022.05.08       4.0  Play the same mission 231 times for the peepee...   
3  2024.01.13     767.3  The best part of this game 




# 3. 감정 분석

In [25]:
# 데이터프레임을 CSV로 저장


sentence_df.to_csv("sentence_embeddings.csv", index=False)


print("CSV 파일로 저장되었습니다!")


CSV 파일로 저장되었습니다!


In [26]:
import pandas as pd
warframe_sentence_df = pd.read_csv('sentence_embeddings.csv')

In [27]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])  # 감정 레이블 추출
    return sentiments

# 감정 분석 실행: 문장 단위로 감정 분석
sentence_df['sentiment'] = analyze_sentiment(sentence_df['cleaned_sentence'].tolist())

# 결과 확인
print(sentence_df[['cleaned_sentence', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


                                    cleaned_sentence sentiment
0  dealing  billion damage to a  health points en...   neutral
1                                welcome to warframe  positive
2  play the same mission  times for the peepeepoo...   neutral
3  the best part of this game is figuring out wha...  positive
4  i hope you guys from the first descendant dest...   neutral


In [28]:
# 긍정/부정 리뷰 분리
positive_reviews = sentence_df[sentence_df['sentiment'] == 'positive']
negative_reviews = sentence_df[sentence_df['sentiment'] == 'negative']
neutral_reviews = sentence_df[sentence_df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 4571
부정 리뷰 개수: 1694
중립 리뷰 개수: 4427


# 4. 토픽 모델링

In [29]:
from bertopic import BERTopic

# 긍정 리뷰 토픽 모델링
positive_texts = positive_reviews['cleaned_sentence'].tolist()
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(positive_texts)

# 부정 리뷰 토픽 모델링
negative_texts = negative_reviews['cleaned_sentence'].tolist()
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(negative_texts)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())

긍정 리뷰 주요 토픽:     Topic  Count                                       Name  \
0      -1   1214                   -1_and_the_warframe_game   
1       0    216                        0_free_play_best_to   
2       1    181       1_graphics_beautiful_design_gameplay   
3       2    161                        2_uwu_goooo_aha_day   
4       3    139                3_grind_grinding_but_grindy   
..    ...    ...                                        ...   
82     81     11            81_loop_gameplay_core_secondary   
83     82     11               82_received_product_free_for   
84     83     10            83_pve_pvp_focusing_repititious   
85     84     10  84_destiny_similar_developersomg_limiless   
86     85     10              85_behind_paywall_fp_paywalls   

                                       Representation  \
0   [and, the, warframe, game, of, it, this, to, y...   
1   [free, play, best, to, game, one, out, games, ...   
2   [graphics, beautiful, design, gameplay, are, l...   
3 

In [11]:
# 긍정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# 부정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_negative.get_topic_info())):
    print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")

Topic 0: [('free', 0.07262238573958281), ('play', 0.045386318593049466), ('best', 0.038331509135117854), ('one', 0.019358830132712394), ('to', 0.01908436176511034), ('games', 0.018558525559972977), ('out', 0.017415371618352875), ('game', 0.01707576511994289), ('ever', 0.01700065348003434), ('its', 0.013887431315653499)]
Topic 1: [('goooo', 0.7376186432407245), ('aha', 0.7376186432407245), ('uwu', 0.6607476281407322), ('ah', 0.6607476281407322), ('lets', 0.5075829813203881), ('boy', 0.5075829813203881), ('hey', 0.4946399969653951), ('oh', 0.4462647800835987), ('guys', 0.43171359151455185), ('', 1e-05)]
Topic 2: [('graphics', 0.04719429058401307), ('design', 0.02323330642434248), ('beautiful', 0.0218503900891224), ('run', 0.01748031207129792), ('are', 0.01707887297826493), ('art', 0.01701074521911177), ('runs', 0.01618173175782274), ('on', 0.016010136154629786), ('music', 0.01599126541650886), ('effects', 0.01502712773645886)]
Topic 3: [('community', 0.0767553356615447), ('friendly', 0.0

In [30]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_negative.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    data.append([topic_num, ', '.join(keywords)])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords"])

# CSV 저장
csv_filename = "warframe_negative_topics.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: warframe_negative_topics.csv


In [15]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

In [16]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

In [31]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_negative.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    weights = [word[1] for word in words]  # 숫자(가중치) 추가
    data.append([topic_num, ', '.join(keywords), ', '.join(map(str, weights))])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords", "Weights"])

# CSV 저장
csv_filename = "warframe_negative_topics_withnum.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: warframe_negative_topics_withnum.csv


In [32]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=20)

In [33]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=20)

In [19]:
topic_model_positive.visualize_heatmap()

In [20]:
topic_model_negative.visualize_heatmap()