# 1. 토큰화

### 품사 태깅 이유
+ game, play 등의 의미없는 단어가 너무 많음. => 일일히 걸러주자
+ 품사 태깅 이유 : 키워드 추출 할때 부사와 같은 꾸며주는 단어가 많이 나와서 추출하는데 의미가 없다고 생각이 들었음
+ 동사, 명사와 같은 의미있는 단어만 뽑아주기 위해 품사 태깅 사용

+ 원형, 표제어 추출하면 품사 태깅 안해도 괜찮을 듯

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import re

# 불용어와 어간 추출, 표제어 추출 초기화
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()  # 어간 추출기
lemmatizer = WordNetLemmatizer()  # 표제어 추출기

# 정의한 불용어 리스트
stop_text = ['game', 'warframe', 'destiny', 'destiny2', 'thefirstdescendant', 'gameplay', 'descendant', 'tfd', 'first', 'play']

# 전처리 함수 정의
def preprocess_review(text):
    # 1. 소문자 변환
    text = text.lower()
    # 2. 특수문자 및 숫자 제거
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # 3. 토큰화
    tokens = word_tokenize(text)
    # 4. 품사 태깅
    tagged_tokens = pos_tag(tokens)
    # 5. 명사(NN), 동사(VB), 형용사(JJ)만 필터링하고 표제어 추출
    filtered_tokens = [
        lemmatizer.lemmatize(word)  # 표제어 추출
        for word, tag in tagged_tokens
        if tag.startswith(('NN', 'VB', 'JJ')) and (word not in stop_words) and (word not in stop_text)
    ]
    return filtered_tokens

df = pd.read_csv('Warframe_reviews.csv')

# 전처리 적용
df['cleaned_text'] = df['review_text'].apply(preprocess_review)
print(df[['review_text', 'cleaned_text']].head())


                                         review_text  \
0  Dealing 5 billion damage to a 100 health point...   
1  Welcome to Warframe. Play the same mission 231...   
2  The best part of this game is figuring out wha...   
3  I hope you guys from The First Descendant, Des...   
4  Once Cross save is available, I will start pla...   

                                        cleaned_text  
0  [dealing, damage, health, point, enemy, crucia...  
1  [welcome, mission, time, peepeepoopoo, blueprint]  
2  [best, part, figuring, want, next, there, wors...  
3  [hope, guy, division, read, faq, raid, raid, u...  
4  [cross, save, available, start, playing, ill, ...  


In [None]:
# 품사 태깅 테스트 (동사 많이 나온 단어 추출)
# from collections import Counter

# tagged_tokens = nltk.pos_tag(all_tokens)
# nouns = [word for word, pos in tagged_tokens if pos.startswith('VB')]

# counter = Counter(nouns)
# top_30_nn = counter.most_common(30)
# print(top_30_nn)

# 2. 임베딩

In [4]:
# 토큰 리스트를 문자열로 변환
df['cleaned_text'] = df['cleaned_text'].apply(lambda tokens: ' '.join(tokens)) 

# 결과 확인
print(df[['review_text', 'cleaned_text']].head())

                                         review_text  \
0  Dealing 5 billion damage to a 100 health point...   
1  Welcome to Warframe. Play the same mission 231...   
2  The best part of this game is figuring out wha...   
3  I hope you guys from The First Descendant, Des...   
4  Once Cross save is available, I will start pla...   

                                        cleaned_text  
0  dealing damage health point enemy crucial plot...  
1        welcome mission time peepeepoopoo blueprint  
2  best part figuring want next there worst part ...  
3  hope guy division read faq raid raid used hect...  
4  cross save available start playing ill say gre...  


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
model = AutoModel.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

def get_embeddings_in_batches(texts, tokenizer, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        with torch.no_grad():
            outputs = model(**tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # 평균 풀링
            all_embeddings.append(embeddings.cpu().numpy())  # CPU 메모리로 이동
    return np.vstack(all_embeddings)  # 배열 합치기

# 임베딩 실행
texts = df['cleaned_text'].tolist() 
embeddings = get_embeddings_in_batches(texts, tokenizer, model, batch_size=16)

print("임베딩 크기:", embeddings.shape)

임베딩 크기: (2780, 1024)


# 3. 감정 분석

In [8]:
print(tokenizer.model_max_length)


512


In [10]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])
    return sentiments

# 감정 분석 실행
df['sentiment'] = analyze_sentiment(df['cleaned_text'].tolist())

# 결과 확인
print(df[['cleaned_text', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


                                        cleaned_text sentiment
0  dealing damage health point enemy crucial plot...   neutral
1        welcome mission time peepeepoopoo blueprint  positive
2  best part figuring want next there worst part ...   neutral
3  hope guy division read faq raid raid used hect...   neutral
4  cross save available start playing ill say gre...  positive


In [14]:
# 긍정/부정 리뷰 분리
positive_reviews = df[df['sentiment'] == 'positive']
negative_reviews = df[df['sentiment'] == 'negative']
neutral_reviews = df[df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 1406
부정 리뷰 개수: 334
중립 리뷰 개수: 1040


# 4. 토픽 모델링

In [15]:
from bertopic import BERTopic

# 긍정 리뷰 토픽 모델링
positive_texts = positive_reviews['cleaned_text'].tolist()
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(positive_texts)

# 부정 리뷰 토픽 모델링
negative_texts = negative_reviews['cleaned_text'].tolist()
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(negative_texts)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


긍정 리뷰 주요 토픽:     Topic  Count                                    Name  \
0      -1    485                   -1_free_get_fun_great   
1       0     60          0_received_product_free_played   
2       1     54             1_grind_grinding_grindy_fun   
3       2     52          2_weapon_mission_warframes_get   
4       3     52                   3_uwu_antipay_lol_win   
5       4     50                    4_good_owo_gamee_sir   
6       5     45                 5_pc_devs_content_steam   
7       6     41              6_space_ninja_robot_weapon   
8       7     39             7_awesome_cool_awsome_great   
9       8     35        8_coop_character_get_multiplayer   
10      9     32                9_hour_seems_played_good   
11     10     31                 10_player_get_time_feel   
12     11     30         11_graphic_story_nice_beautiful   
13     12     30         12_free_anthem_better_excellent   
14     13     28     13_shooter_person_third_thirdperson   
15     14     28           

In [16]:
# 긍정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# 부정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_negative.get_topic_info())):
    print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")


Topic 0: [('received', 0.16323244093185652), ('product', 0.16246694255802344), ('free', 0.08430609281335033), ('played', 0.024457257076038328), ('best', 0.021873688933877956), ('ive', 0.021259029959625024), ('fun', 0.020459050747535144), ('give', 0.019651447829026766), ('good', 0.01891327363944329), ('time', 0.017379659029284813)]
Topic 1: [('grind', 0.5100401981689908), ('grinding', 0.08017773360100601), ('grindy', 0.03745232663626611), ('fun', 0.036388534539078116), ('good', 0.026557271306499125), ('shiny', 0.02471847300659537), ('fest', 0.023728127693038968), ('love', 0.02260751678082329), ('grindfest', 0.020660010348720514), ('great', 0.018363788091615305)]
Topic 2: [('weapon', 0.02318770882764612), ('mission', 0.01721793683229958), ('warframes', 0.01621569077124849), ('get', 0.015906892305297984), ('great', 0.015090145770033092), ('story', 0.01456450572206172), ('space', 0.014091394715671817), ('graphic', 0.013907943657818434), ('rpg', 0.013899827097796488), ('system', 0.013478889

In [19]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [20]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [21]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=10)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [22]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=10)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [23]:
topic_model_positive.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [24]:
topic_model_negative.visualize_heatmap()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed