# 1. (문장 단위)전처리

In [3]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import re

#nltk.download('punkt')

# 문장 단위 토큰화 함수
def split_into_sentences(text):
    return sent_tokenize(text)

def preprocess_sentence(sentence):
    sentence = sentence.lower()  # 소문자 변환
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # 특수문자 제거
    return sentence.strip()

df = pd.read_csv('Warframe_reviews.csv')

# 문장 단위로 분리
df['sentences'] = df['review_text'].apply(split_into_sentences)

# 문장 단위 데이터 펼치기
sentence_df = df.explode('sentences').reset_index(drop=True)
sentence_df.rename(columns={'sentences': 'sentence'}, inplace=True)

# 전처리 적용
sentence_df['cleaned_sentence'] = sentence_df['sentence'].apply(preprocess_sentence)
print(sentence_df[['sentence', 'cleaned_sentence']].head())

print(sentence_df.head())

                                            sentence  \
0  Dealing 5 billion damage to a 100 health point...   
1                               Welcome to Warframe.   
2  Play the same mission 231 times for the peepee...   
3  The best part of this game is figuring out wha...   
4  I hope you guys from The First Descendant, Des...   

                                    cleaned_sentence  
0  dealing  billion damage to a  health points en...  
1                                welcome to warframe  
2  play the same mission  times for the peepeepoo...  
3  the best part of this game is figuring out wha...  
4  i hope you guys from the first descendant dest...  
      title              id  \
0  Warframe       Bowieober   
1  Warframe  BamBoozlerC0L3   
2  Warframe  BamBoozlerC0L3   
3  Warframe     Mumbo Magic   
4  Warframe         RushCit   

                                         review_text recommendation  \
0  Dealing 5 billion damage to a 100 health point...    Recommended   
1  W

# 2. 임베딩

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# 모델 로드
tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
model = AutoModel.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

# 임베딩 함수
def get_embeddings_in_batches(texts, tokenizer, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        with torch.no_grad():
            outputs = model(**tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings.cpu().numpy())
    return np.vstack(all_embeddings)

# 문장 데이터 임베딩
embeddings = get_embeddings_in_batches(sentence_df['cleaned_sentence'].tolist(), tokenizer, model, batch_size=16)

# 임베딩 추가
sentence_df['embedding'] = list(embeddings)
print("임베딩 크기:", embeddings.shape)


  from .autonotebook import tqdm as notebook_tqdm


임베딩 크기: (10692, 1024)


# 3. 감정 분석

In [5]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])  # 감정 레이블 추출
    return sentiments

# 감정 분석 실행: 문장 단위로 감정 분석
sentence_df['sentiment'] = analyze_sentiment(sentence_df['cleaned_sentence'].tolist())

# 결과 확인
print(sentence_df[['cleaned_sentence', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


                                    cleaned_sentence sentiment
0  dealing  billion damage to a  health points en...   neutral
1                                welcome to warframe  positive
2  play the same mission  times for the peepeepoo...   neutral
3  the best part of this game is figuring out wha...  positive
4  i hope you guys from the first descendant dest...   neutral


In [7]:
# 긍정/부정 리뷰 분리
positive_reviews = sentence_df[sentence_df['sentiment'] == 'positive']
negative_reviews = sentence_df[sentence_df['sentiment'] == 'negative']
neutral_reviews = sentence_df[sentence_df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 4571
부정 리뷰 개수: 1694
중립 리뷰 개수: 4427


# 4. 토픽 모델링

In [8]:
from bertopic import BERTopic

# 긍정 리뷰 토픽 모델링
positive_texts = positive_reviews['cleaned_sentence'].tolist()
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(positive_texts)

# 부정 리뷰 토픽 모델링
negative_texts = negative_reviews['cleaned_sentence'].tolist()
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(negative_texts)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


긍정 리뷰 주요 토픽:     Topic  Count                                     Name  \
0      -1   1193                       -1_and_the_to_game   
1       0    235                      0_free_play_best_to   
2       1    160          1_graphics_design_beautiful_are   
3       2    157                         2_uwu_aha_ah_boy   
4       3    142              3_grind_grinding_but_grindy   
..    ...    ...                                      ...   
81     80     11          80_loop_gameplay_core_secondary   
82     81     11  81_gear_warframe_activities_challenging   
83     82     11             82_fp_warframe_example_hands   
84     83     10     83_currency_premium_chat_recruitment   
85     84     10                 84_destiny_bungie_buy_im   

                                       Representation  \
0   [and, the, to, game, warframe, it, of, this, i...   
1   [free, play, best, to, one, game, games, out, ...   
2   [graphics, design, beautiful, are, run, pc, ar...   
3            [uwu, aha, ah

In [9]:
# 긍정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# 부정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_negative.get_topic_info())):
    print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")

Topic 0: [('free', 0.06703212564338314), ('play', 0.039259058258832344), ('best', 0.03229214443375012), ('to', 0.018307391236725718), ('one', 0.01757597966559067), ('game', 0.016362435723592438), ('games', 0.01630233677741976), ('out', 0.014641467789533107), ('for', 0.013366761814056313), ('its', 0.013298561471544857)]
Topic 1: [('graphics', 0.0522282725787824), ('design', 0.023003916202264187), ('beautiful', 0.02020789745245238), ('are', 0.019550635100427198), ('run', 0.01876447620584864), ('pc', 0.0169334824754878), ('art', 0.016840435368206615), ('runs', 0.01601482074921051), ('on', 0.01596790691408905), ('gorgeous', 0.015817535345413056)]
Topic 2: [('uwu', 1.4924963788577144), ('aha', 0.8327318425681639), ('ah', 0.7462481894288572), ('boy', 0.5739155973957548), ('hey', 0.5593510460156059), ('oh', 0.5049105981298841), ('guys', 0.4885332445688042), ('', 1e-05), ('', 1e-05), ('', 1e-05)]
Topic 3: [('grind', 0.14284566230429005), ('grinding', 0.04423656103636433), ('but', 0.01891673029

In [10]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

In [11]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

In [12]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=10)

In [13]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=10)

In [14]:
topic_model_positive.visualize_heatmap()

In [15]:
topic_model_negative.visualize_heatmap()