# 1. 토큰화

### 품사 태깅 이유
+ game, play 등의 의미없는 단어가 너무 많음. => 일일히 걸러주자
+ 품사 태깅 이유 : 키워드 추출 할때 부사와 같은 꾸며주는 단어가 많이 나와서 추출하는데 의미가 없다고 생각이 들었음
+ 동사, 명사와 같은 의미있는 단어만 뽑아주기 위해 품사 태깅 사용

+ 원형, 표제어 추출하면 품사 태깅 안해도 괜찮을 듯

In [59]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import pandas as pd
import re

# 불용어와 어간 추출, 표제어 추출 초기화
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()  # 어간 추출기
lemmatizer = WordNetLemmatizer()  # 표제어 추출기

# 정의한 불용어 리스트
stop_text = ['game', 'warframe', 'destiny', 'destiny2', 'thefirstdescendant', 'gameplay', 'descendant', 'tfd', 'first', 'play']

# 전처리 함수 정의
def preprocess_review(text):
    if isinstance(text,str):
        # 1. 소문자 변환
        text = text.lower()
        # 2. 특수문자 및 숫자 제거
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # 3. 토큰화
        tokens = word_tokenize(text)
        # 4. 품사 태깅
        tagged_tokens = pos_tag(tokens)
        # 5. 명사(NN), 동사(VB), 형용사(JJ)만 필터링하고 표제어 추출
        filtered_tokens = [
            lemmatizer.lemmatize(word)  # 표제어 추출
            for word, tag in tagged_tokens
            if tag.startswith(('NN', 'VB', 'JJ')) and (word not in stop_words) and (word not in stop_text)
        ]
        return filtered_tokens
    else:
        return ""

df = pd.read_csv('The_first_descendant_reviews.csv')

# 전처리 적용
df['cleaned_text'] = df['review_text'].apply(preprocess_review)
print(df[['review_text', 'cleaned_text']].head())


                                         review_text  \
0  Product received for free  In the beta test yo...   
1    Game is extremely intrusive on data collection.   
2  The First Descendant is more predatory than Dr...   
3  The game was fun enough for the time I played ...   
4  I was going to play it but POW! moment you go ...   

                                        cleaned_text  
0  [product, received, free, beta, test, craft, p...  
1                      [intrusive, data, collection]  
2  [predatory, drake, keep, wallet, keep, credit,...  
3  [time, played, fact, kernellevel, anticheats, ...  
4  [going, pow, moment, go, log, asking, consent,...  


In [None]:
# 품사 태깅 테스트 (동사 많이 나온 단어 추출)
# from collections import Counter

# tagged_tokens = nltk.pos_tag(all_tokens)
# nouns = [word for word, pos in tagged_tokens if pos.startswith('VB')]

# counter = Counter(nouns)
# top_30_nn = counter.most_common(30)
# print(top_30_nn)

# 2. 임베딩

In [60]:
# 토큰 리스트를 문자열로 변환
df['cleaned_text'] = df['cleaned_text'].apply(lambda tokens: ' '.join(tokens)) 

# 결과 확인
print(df[['review_text', 'cleaned_text']].head())

                                         review_text  \
0  Product received for free  In the beta test yo...   
1    Game is extremely intrusive on data collection.   
2  The First Descendant is more predatory than Dr...   
3  The game was fun enough for the time I played ...   
4  I was going to play it but POW! moment you go ...   

                                        cleaned_text  
0  product received free beta test craft paint fr...  
1                          intrusive data collection  
2  predatory drake keep wallet keep credit card k...  
3  time played fact kernellevel anticheats eula a...  
4  going pow moment go log asking consent share i...  


In [61]:
# Load model directly
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
model = AutoModel.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")

def get_embeddings_in_batches(texts, tokenizer, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512
        )
        with torch.no_grad():
            outputs = model(**tokens)
            embeddings = outputs.last_hidden_state.mean(dim=1)  # 평균 풀링
            all_embeddings.append(embeddings.cpu().numpy())  # CPU 메모리로 이동
    return np.vstack(all_embeddings)  # 배열 합치기

# 임베딩 실행
texts = df['cleaned_text'].tolist() 
embeddings = get_embeddings_in_batches(texts, tokenizer, model, batch_size=16)

print("임베딩 크기:", embeddings.shape)

임베딩 크기: (4223, 1024)


# 3. 감정 분석

In [62]:
print(tokenizer.model_max_length)


512


In [63]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])
    return sentiments

# 감정 분석 실행
df['sentiment'] = analyze_sentiment(df['cleaned_text'].tolist())

# 결과 확인
print(df[['cleaned_text', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


                                        cleaned_text sentiment
0  product received free beta test craft paint fr...   neutral
1                          intrusive data collection   neutral
2  predatory drake keep wallet keep credit card k...   neutral
3  time played fact kernellevel anticheats eula a...  positive
4  going pow moment go log asking consent share i...   neutral


In [64]:
# 긍정/부정 리뷰 분리
positive_reviews = df[df['sentiment'] == 'positive']
negative_reviews = df[df['sentiment'] == 'negative']
neutral_reviews = df[df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 486
부정 리뷰 개수: 2470
중립 리뷰 개수: 1267


# 4. 토픽 모델링

In [65]:
from bertopic import BERTopic

# 긍정 리뷰 토픽 모델링
positive_texts = positive_reviews['cleaned_text'].tolist()
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(positive_texts)

# 부정 리뷰 토픽 모델링
negative_texts = negative_reviews['cleaned_text'].tolist()
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(negative_texts)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())


긍정 리뷰 주요 토픽:    Topic  Count                  Name  \
0      0    371   0_good_fun_get_time   
1      1    115  1_hahaha_lol_please_   

                                      Representation  \
0  [good, fun, get, time, character, free, grind,...   
1                [hahaha, lol, please, , , , , , , ]   

                                 Representative_Docs  
0  [sort cross ii dash borderland tera good fun s...  
1                              [please, hahaha, lol]  
부정 리뷰 주요 토픽:     Topic  Count                                              Name  \
0      -1   1052                             -1_time_dont_get_feel   
1       0     97                     0_story_boring_feel_character   
2       1     67                              1_rate_drop_item_low   
3       2     66                2_nexon_greedy_nexons_monetization   
4       3     66                        3_grind_grinding_part_hour   
5       4     63                   4_received_product_free_mission   
6       5     61          

In [66]:
# 긍정 리뷰 토픽 내 주요 단어
print(len(topic_model_positive.get_topic_info()))
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# # 부정 리뷰 토픽 내 주요 단어
# for topic in range(len(topic_model_negative.get_topic_info())):
#     print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")


2
Topic 0: [('good', 0.05502277498378641), ('fun', 0.04716654605028752), ('get', 0.04658142183055576), ('time', 0.04390420800848603), ('character', 0.03958398683815743), ('free', 0.03636801630504205), ('grind', 0.036039874959828445), ('make', 0.03336843049332297), ('dont', 0.03060810116603543), ('mission', 0.029191271744208307)]
Topic 1: [('hahaha', 2.8156617654575804), ('lol', 2.4496006693800703), ('please', 2.123014817242203), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]


In [67]:
# 부정적 토픽 
import pandas as pd

# 부정적인 토픽 데이터 저장을 위한 리스트
negative_topics_list = []

for topic in range(len(topic_model_negative.get_topic_info())):
    topic_data = topic_model_negative.get_topic(topic)
    
    # topic_data가 비어있거나 False일 경우 건너뛰기
    if not topic_data or topic_data is False:
        negative_topics_list.append([topic, "False"])
        continue
    
    # 빈 단어 제거 후 단어만 추출
    topic_words = [word for word, _ in topic_data if word.strip()]  # 빈 문자열 제거

    if topic_words:
        negative_topics_list.append([topic, ", ".join(topic_words)])
    else:
        negative_topics_list.append([topic, "(Empty Topic)"])

# DataFrame 생성
positive_topics_df = pd.DataFrame(negative_topics_list, columns=["Topic", "Keywords"])

# CSV 파일로 저장
positive_topics_df.to_csv("(영화 이름)_negative_topics.csv", index=False, encoding="utf-8")

print("부정적인 토픽 데이터를 'negative_topics.csv' 파일로 저장 완료!")
    

부정적인 토픽 데이터를 'negative_topics.csv' 파일로 저장 완료!


In [None]:
# 긍정적 토픽 
import pandas as pd

# 긍정적인 토픽 데이터 저장을 위한 리스트
positive_topics_list = []

for topic in range(len(topic_model_positive.get_topic_info())):
    topic_data = topic_model_positive.get_topic(topic)
    
    # topic_data가 비어있거나 False일 경우 건너뛰기
    if not topic_data or topic_data is False:
        positive_topics_list.append([topic, "False"])
        continue
    
    # 빈 단어 제거 후 단어만 추출
    topic_words = [word for word, _ in topic_data if word.strip()]  # 빈 문자열 제거

    if topic_words:
        positive_topics_list.append([topic, ", ".join(topic_words)])
    else:
        positive_topics_list.append([topic, "(Empty Topic)"])

# DataFrame 생성
positive_topics_df = pd.DataFrame(positive_topics_list, columns=["Topic", "Keywords"])

# CSV 파일로 저장
positive_topics_df.to_csv("(영화 이름)_positive_topics.csv", index=False, encoding="utf-8")

print("긍정적인 토픽 데이터를 'positive_topics.csv' 파일로 저장 완료!")

In [68]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity

In [69]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

In [70]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=10)

In [71]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=10)

In [72]:
topic_model_positive.visualize_heatmap()

In [73]:
topic_model_negative.visualize_heatmap()

In [74]:
positive_df = pd.DataFrame({
    "sentence": positive_texts,
    "topic": topics_positive
})
positive_df.to_csv("The_first_descendant_positive_words_with_topics.csv", index=False)

# 4. 부정 리뷰 각 문장별 토픽 번호 저장
negative_df = pd.DataFrame({
    "sentence": negative_texts,
    "topic": topics_negative
})
negative_df.to_csv("The_first_descendant_negative_words_with_topics.csv", index=False)

print("긍정 및 부정 리뷰 토픽 모델링 결과가 CSV로 저장되었습니다.")

긍정 및 부정 리뷰 토픽 모델링 결과가 CSV로 저장되었습니다.
