# 1. (문장 단위)전처리

In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
import nltk
import re

# nltk.download('punkt')

# 문장 단위 토큰화 함수
def split_into_sentences(text):
    if not isinstance(text, str):  # 문자열이 아닌 경우 빈 문자열 반환
        return []
    return sent_tokenize(text)

def preprocess_sentence(sentence):
    if not isinstance(sentence, str):  # 문자열이 아닌 경우 빈 문자열 반환
        return ""
    sentence = sentence.lower()  # 소문자 변환
    sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)  # 특수문자 제거
    return sentence.strip()

# CSV 파일 로드
df = pd.read_csv('new reviews/Warframe_Negative_reviews2.csv')

# NaN 값을 빈 문자열로 대체 후 문자열 타입 변환
df['review_text'] = df['review_text'].fillna('').astype(str)

# 문장 단위로 분리
df['sentences'] = df['review_text'].apply(split_into_sentences)

# 문장 단위 데이터 펼치기
sentence_df = df.explode('sentences').reset_index(drop=True)
sentence_df.rename(columns={'sentences': 'sentence'}, inplace=True)

# 전처리 적용
sentence_df['cleaned_sentence'] = sentence_df['sentence'].apply(preprocess_sentence)

# 결과 확인
print(sentence_df[['sentence', 'cleaned_sentence']].head())


                                            sentence  \
0  Was banned, and had an account with them since...   
1  Ban reason- suspected cheating, and given a ba...   
2  Just paid 55 dollars into game literally 3 day...   
3  When asked how I cheated they just completely ...   
4  You tell me why I would cheat in a game in whi...   

                                    cleaned_sentence  
0      was banned and had an account with them since  
1  ban reason suspected cheating and given a ban ...  
2  just paid  dollars into game literally  days b...  
3  when asked how i cheated they just completely ...  
4  you tell me why i would cheat in a game in whi...  


# 2. 임베딩

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# 모델 로드
model = SentenceTransformer("intfloat/multilingual-e5-large", trust_remote_code=True)

# 임베딩 함수
def get_embeddings_in_batches(texts, model, batch_size=16):
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            convert_to_numpy=True,  # NumPy 배열로 반환
            show_progress_bar=True  # 진행 상태 표시
        )
        all_embeddings.append(embeddings)
    return np.vstack(all_embeddings)

# 문장 데이터 임베딩
embeddings = get_embeddings_in_batches(sentence_df['cleaned_sentence'].tolist(), model, batch_size=16)

# 임베딩 추가
sentence_df['embedding'] = list(embeddings)
print("임베딩 크기:", embeddings.shape)
print(sentence_df.head())


  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1/1 [00:12<00:00, 12.81s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  4.00s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.57s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.01s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.27s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.03s/it]
Batches: 100%|██████████| 1/1 [00:07<00:00,  7.65s/it]
Batches: 100%|██████████| 1/1 [00:05<00:00,  5.95s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.15s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.35s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]
Batches: 100%|██████████| 1/1 [00:03<00:00,  3.64s/it]
Batches: 100%|██████████| 1/1 [00:04<00:00,  4.28s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]
Batches: 100%|██████████| 1/1 [00:06<00:00,  6.04s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.48s/it]
Batches: 100%|██████████| 1/1 [00:02<00:00,  2.54s/it]
Batches: 100%|█

임베딩 크기: (11094, 1024)
      title        id                                        review_text  \
0  Warframe  Travlink  Was banned, and had an account with them since...   
1  Warframe  Travlink  Was banned, and had an account with them since...   
2  Warframe  Travlink  Was banned, and had an account with them since...   
3  Warframe  Travlink  Was banned, and had an account with them since...   
4  Warframe  Travlink  Was banned, and had an account with them since...   

    recommendation posted_date  playtime  \
0  Not Recommended  2023-02-01    1169.5   
1  Not Recommended  2023-02-01    1169.5   
2  Not Recommended  2023-02-01    1169.5   
3  Not Recommended  2023-02-01    1169.5   
4  Not Recommended  2023-02-01    1169.5   

                                            sentence  \
0  Was banned, and had an account with them since...   
1  Ban reason- suspected cheating, and given a ba...   
2  Just paid 55 dollars into game literally 3 day...   
3  When asked how I cheated they




# 3. 감정 분석

In [4]:
# 데이터프레임을 CSV로 저장


sentence_df.to_csv("warframe_negative_embeddings.csv", index=False)


print("CSV 파일로 저장되었습니다!")


CSV 파일로 저장되었습니다!


In [5]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])  # 감정 레이블 추출
    return sentiments

# 감정 분석 실행: 문장 단위로 감정 분석
sentence_df['sentiment'] = analyze_sentiment(sentence_df['cleaned_sentence'].tolist())

# 결과 확인
print(sentence_df[['cleaned_sentence', 'sentiment']].head())

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


                                    cleaned_sentence sentiment
0      was banned and had an account with them since  negative
1  ban reason suspected cheating and given a ban ...  negative
2  just paid  dollars into game literally  days b...   neutral
3  when asked how i cheated they just completely ...  negative
4  you tell me why i would cheat in a game in whi...  negative


In [6]:
# 긍정/부정 리뷰 분리
positive_reviews = sentence_df[sentence_df['sentiment'] == 'positive']
negative_reviews = sentence_df[sentence_df['sentiment'] == 'negative']
neutral_reviews = sentence_df[sentence_df['sentiment'] == 'neutral']

print(f"긍정 리뷰 개수: {len(positive_reviews)}")
print(f"부정 리뷰 개수: {len(negative_reviews)}")
print(f"중립 리뷰 개수: {len(neutral_reviews)}")

긍정 리뷰 개수: 1502
부정 리뷰 개수: 6258
중립 리뷰 개수: 3334


# 4. 토픽 모델링

In [8]:
from bertopic import BERTopic

# 각 리뷰에 맞는 임베딩을 정확히 필터링
positive_embeddings = embeddings[positive_reviews.index]
negative_embeddings = embeddings[negative_reviews.index]

# 긍정 리뷰 토픽 모델링
topic_model_positive = BERTopic()
topics_positive, probs_positive = topic_model_positive.fit_transform(
    positive_reviews['cleaned_sentence'].tolist(),
    embeddings=positive_embeddings
)

# 부정 리뷰 토픽 모델링
topic_model_negative = BERTopic()
topics_negative, probs_negative = topic_model_negative.fit_transform(
    negative_reviews['cleaned_sentence'].tolist(),
    embeddings=negative_embeddings
)

# 긍정 토픽 확인
print("긍정 리뷰 주요 토픽:", topic_model_positive.get_topic_info())

# 부정 토픽 확인
print("부정 리뷰 주요 토픽:", topic_model_negative.get_topic_info())

긍정 리뷰 주요 토픽:    Topic  Count               Name  \
0      0   1391  0_the_and_game_to   
1      1    111              1____   

                                     Representation  \
0  [the, and, game, to, of, is, you, it, this, for]   
1                              [, , , , , , , , , ]   

                                 Representative_Docs  
0  [i believe that this game is amazing for the p...  
1                                             [, , ]  
부정 리뷰 주요 토픽:     Topic  Count                                 Name  \
0      -1   2941                   -1_the_and_to_game   
1       0    429          0_warframe_warframes_and_to   
2       1    159             1_de_their_has_community   
3       2    155        2_recommend_anyone_this_would   
4       3    137          3_grind_grindy_too_grinding   
..    ...    ...                                  ...   
80     79     11     79_update_worse_remastered_clone   
81     80     11        80_sucks_illogical_this_greed   
82     81     1

In [9]:
# 긍정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_positive.get_topic_info())):
    print(f"Topic {topic}: {topic_model_positive.get_topic(topic)}")

# 부정 리뷰 토픽 내 주요 단어
for topic in range(len(topic_model_negative.get_topic_info())):
    print(f"Topic {topic}: {topic_model_negative.get_topic(topic)}")

Topic 0: [('the', 0.1125848674605349), ('and', 0.09732943408811147), ('game', 0.09040397520501511), ('to', 0.08369838874034817), ('of', 0.06471138313035955), ('is', 0.06440033424668475), ('you', 0.06346203675818239), ('it', 0.06166809979988311), ('this', 0.056216808791439664), ('for', 0.054753426985505796)]
Topic 1: [('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)]
Topic 0: [('warframe', 0.04593237390779273), ('warframes', 0.018473039337940898), ('and', 0.010699876289971549), ('to', 0.010654516829024957), ('the', 0.01061035891166947), ('of', 0.00999333757607698), ('is', 0.009557744335320948), ('new', 0.009512756828191962), ('in', 0.009184713500029607), ('for', 0.008794579748727816)]
Topic 1: [('de', 0.08028152606525507), ('their', 0.01626669106103861), ('has', 0.014040284141325413), ('community', 0.012883750470546157), ('to', 0.012250875412560622), ('players', 0.010691853725835811), ('and', 0.0103518116116

In [10]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_positive.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    data.append([topic_num, ', '.join(keywords)])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords"])

# CSV 저장
csv_filename = "warframe_대충_topics.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: warframe_대충_topics.csv


In [12]:
# 긍정 리뷰 토픽 관계도 시각화
topic_model_positive.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity

In [16]:
# 부정 리뷰 토픽 관계도 시각화
topic_model_negative.visualize_topics()

In [14]:
import pandas as pd
import re
from bertopic import BERTopic

# topic_model_positive 파일을 로드 (예제 코드에서는 이미 메모리에 있다고 가정)
# topic_model_positive = BERTopic.load("path_to_your_model")

# 토픽 정보 추출
topics = topic_model_positive.get_topics()

# 토픽 데이터 정리
data = []
for topic_num, words in topics.items():
    keywords = [re.sub(r'\(.*?\)', '', word[0]).strip() for word in words]  # 숫자 제거
    weights = [word[1] for word in words]  # 숫자(가중치) 추가
    data.append([topic_num, ', '.join(keywords), ', '.join(map(str, weights))])

# DataFrame 생성
df = pd.DataFrame(data, columns=["Topic", "Keywords", "Weights"])

# CSV 저장
csv_filename = "destiny2_positive_topics_withnum.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"CSV 파일이 저장되었습니다: {csv_filename}")


CSV 파일이 저장되었습니다: destiny2_positive_topics_withnum.csv


In [11]:
# 긍정 리뷰
topic_model_positive.visualize_barchart(top_n_topics=20)

In [14]:
# 부정 리뷰
topic_model_negative.visualize_barchart(top_n_topics=25)

In [19]:
topic_model_positive.visualize_heatmap()

In [20]:
topic_model_negative.visualize_heatmap()