# 1. 토큰화
## 1-1. casual_tokenize 토큰화, stopwords로 단어 삭제

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import casual_tokenize
import pandas as pd

#nltk.download('stopwords')

df = pd.read_csv('Warframe_reviews.csv')

stop_words = set(stopwords.words('english'))

processed_tokens_list = []

for text in df['review_text']:
    text = str(text)
    
    # 토큰화
    tokens = casual_tokenize(text, reduce_len=True, strip_handles=True)
    
    # nltk.stop_word로 단어 삭제
    filtered_tokens = [t for t in tokens if len(t) > 1 and t.lower() not in stop_words]
    
    processed_tokens_list.append(filtered_tokens)
    
print(processed_tokens_list)

# 각 리뷰의 토큰들을 모두 모으기
all_tokens = [word for tokens in processed_tokens_list for word in tokens]



## 1-2. 품사 태깅
+ game, play 등의 의미없는 단어가 너무 많음. => 일일히 걸러주자
+ 품사 태깅 이유 : 키워드 추출 할때 부사와 같은 꾸며주는 단어가 많이 나와서 추출하는데 의미가 없다고 생각이 들었음
+ 동사, 명사와 같은 의미있는 단어만 뽑아주기 위해 품사 태깅 사용

In [None]:
# 품사 태깅 테스트 (동사 많이 나온 단어 추출)
# from collections import Counter

# tagged_tokens = nltk.pos_tag(all_tokens)
# nouns = [word for word, pos in tagged_tokens if pos.startswith('VB')]

# counter = Counter(nouns)
# top_30_nn = counter.most_common(30)
# print(top_30_nn)

In [5]:
# 불용어 품사 정의
stopPos = ['IN', 'CC', 'UH', 'TO', 'MD', 'DT', 'VBZ','VBP']

tagged_tokens = nltk.pos_tag(all_tokens)
print(tagged_tokens)

from collections import Counter

# most_common() : Counter 객체에서 빈도수가 높은 순서대로 데이터를 정렬한 리스트 반환
print(Counter(tagged_tokens).most_common(20))

[(('game', 'NN'), 3464), (('Warframe', 'NNP'), 725), (('free', 'JJ'), 651), (('like', 'IN'), 609), (('time', 'NN'), 595), (('play', 'NN'), 574), (('good', 'JJ'), 501), (('playing', 'VBG'), 492), (('grind', 'NN'), 452), (('new', 'JJ'), 450), (('...', ':'), 387), (('one', 'CD'), 376), (('hours', 'NNS'), 365), (('get', 'VB'), 346), (('still', 'RB'), 334), (('games', 'NNS'), 328), (('weapons', 'NNS'), 324), (('really', 'RB'), 316), (('best', 'JJS'), 312), (('great', 'JJ'), 312)]


In [6]:
# 불용어 처리: 특정 품사 태그에 해당하는 단어만 필터링
stop_words = [word for word, tag in tagged_tokens if tag in stopPos]
filtered_tokens = [token for token in all_tokens if token not in stop_words]

## 1-3. 정규화 (원형 복원)
### 1-3-1. 소문자 정규화

In [7]:
from matplotlib.pylab import normal

# lower메소드로 정규화 
normalized_tokens = [x.lower() for x in filtered_tokens]
print(normalized_tokens)



### 1-3-2. 어간 추출 (stemming)

In [8]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
stemmed_tokens = [ps.stem(word) for word in all_tokens]
print(stemmed_tokens)

['deal', 'billion', 'damag', '100', 'health', 'point', 'enemi', 'crucial', 'plot', 'trust', 'welcom', 'warfram', 'play', 'mission', '231', 'time', 'peepeepoopoo', 'blueprint', 'best', 'part', 'game', 'figur', 'want', 'next', "there'", 'much', 'howev', 'worst', 'part', 'game', 'figur', 'want', 'next', "there'", 'much', 'hope', 'guy', 'first', 'descend', 'destini', 'divis', 'read', 'faq', 'warfram', 'raid', 'warfram', 'raid', 'like', 'destini', 'use', 'back', 'hectic', 'hard', 'understand', 'dev', 'decid', 'pull', 'redesign', 'properli', 'mayb', 'futur', 'warfram', 'might', 'raid', 'warfram', 'pay', 'win', 'ye', 'buy', 'way', 'skip', 'build', 'equip', 'upgrad', 'mod', 'end', "there'", 'noth', 'win', 'make', 'strong', 'shorter', 'period', 'time', 'free', 'play', 'player', 'get', 'result', 'play', 'earn', 'part', 'sell', 'part', 'player', 'currenc', 'market', 'system', 'warfram', 'healthi', 'like', 'counter', 'strike', 'skin', 'market', 'warfram', 'easi', 'ye', 'warfram', 'easi', 'get', 'h

### 1-3-3. 표제어 추출

In [9]:
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # 선택: 시소러스(동의어) 관련

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [lemmatizer.lemmatize(word) for word in stemmed_tokens]
print(lemmatized_tokens)

['deal', 'billion', 'damag', '100', 'health', 'point', 'enemi', 'crucial', 'plot', 'trust', 'welcom', 'warfram', 'play', 'mission', '231', 'time', 'peepeepoopoo', 'blueprint', 'best', 'part', 'game', 'figur', 'want', 'next', "there'", 'much', 'howev', 'worst', 'part', 'game', 'figur', 'want', 'next', "there'", 'much', 'hope', 'guy', 'first', 'descend', 'destini', 'divis', 'read', 'faq', 'warfram', 'raid', 'warfram', 'raid', 'like', 'destini', 'use', 'back', 'hectic', 'hard', 'understand', 'dev', 'decid', 'pull', 'redesign', 'properli', 'mayb', 'futur', 'warfram', 'might', 'raid', 'warfram', 'pay', 'win', 'ye', 'buy', 'way', 'skip', 'build', 'equip', 'upgrad', 'mod', 'end', "there'", 'noth', 'win', 'make', 'strong', 'shorter', 'period', 'time', 'free', 'play', 'player', 'get', 'result', 'play', 'earn', 'part', 'sell', 'part', 'player', 'currenc', 'market', 'system', 'warfram', 'healthi', 'like', 'counter', 'strike', 'skin', 'market', 'warfram', 'easi', 'ye', 'warfram', 'easi', 'get', 'h

## 1-4. 불용어 처리

In [10]:
stop_text = ['game', 'warframe', 'destiny', 'destiny2', 'thefirstdescendant', 'gameplay', 'descendant', 'tfd', 'first', 'play']

filtered_tokens = [tokens for tokens in lemmatized_tokens if tokens not in stop_text]

# 2. 임베딩

In [11]:
pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.0-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.48.1-py3-none-any.whl.metadata (44 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.5.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)

In [23]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Jina-Embeddings-v3 모델 로드
model = SentenceTransformer("intfloat/multilingual-e5-large", trust_remote_code=True)

# 데이터 로드 및 전처리
df = pd.read_csv('Warframe_reviews.csv')
df['review_text'] = df["review_text"].fillna("").astype(str)

# review_text 열을 리스트로 변환
sentences = df['review_text'].tolist()

# 문장 임베딩 생성
embeddings = model.encode(
    sentences,
    device="cpu"  
)

# 결과 출력
print(f"임베딩 크기: {embeddings.shape}")  # (문장 수, 임베딩 크기)

embeddings


임베딩 크기: (2780, 1024)


array([[ 0.02736148,  0.00691012, -0.0286948 , ...,  0.01136253,
        -0.03099005,  0.00176563],
       [ 0.02741469, -0.02640538, -0.03989339, ..., -0.01065446,
        -0.05603047,  0.02099335],
       [ 0.02449915, -0.02302465, -0.04070162, ..., -0.028931  ,
        -0.01090803,  0.00420128],
       ...,
       [ 0.00476159, -0.03746838, -0.03921036, ..., -0.02048945,
        -0.01843286, -0.01652845],
       [ 0.02628706, -0.03358242, -0.04440314, ..., -0.00993321,
        -0.04132478, -0.01851745],
       [ 0.02032715, -0.00302691, -0.03567348, ..., -0.02404277,
         0.00091506,  0.00349985]], dtype=float32)

In [17]:
# numpy 배열을 pandas DataFrame으로 변환
embeddings_df = pd.DataFrame(embeddings)

# CSV로 저장
embeddings_df.to_csv('embeddings.csv', index=False)
print("임베딩을 'embeddings.csv'로 저장 완료!")


임베딩을 'embeddings.csv'로 저장 완료!


## 감정 분석

In [None]:
# 임베딩 추가
sentence_df['embedding'] = list(embeddings)
print("임베딩 크기:", embeddings.shape)

KeyError: 'cleaned_text'

In [None]:
from transformers import pipeline

# 감정 분석 파이프라인 로드
pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

# 감정 분석 수행
def analyze_sentiment(texts):
    sentiments = []
    for text in texts:
        # 감정 분석 수행 및 결과 추출
        sentiment_result = pipe(text[:512])  # 길이 초과 방지를 위해 512 토큰 제한
        sentiments.append(sentiment_result[0]['label'])  # 감정 레이블 추출
    return sentiments

# 감정 분석 실행: 문장 단위로 감정 분석
sentence_df['sentiment'] = analyze_sentiment(sentence_df['cleaned_sentence'].tolist())

# 결과 확인
print(sentence_df[['cleaned_sentence', 'sentiment']].head())

In [19]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import pipeline
import pandas as pd

# 모델 초기화
model_name = "siebert/sentiment-roberta-large-english"
classifier = pipeline("sentiment-analysis", model=model_name, truncation=True)

# 데이터 로드
file_path = 'Warframe_reviews.csv'
data = pd.read_csv(file_path)

# 감정 분석 수행 (batch 단위로 처리)
batch_size = 16  # 배치 크기 설정
reviews = data["review_text"].tolist()  # 리뷰 데이터를 리스트로 변환
sentiments = []

# 배치로 처리
for i in range(0, len(reviews), batch_size):
    batch = reviews[i:i+batch_size]
    results = classifier(batch)
    sentiments.extend(results)

# 결과를 데이터프레임에 추가
data["sentiment"] = [result["label"] for result in sentiments]
data["confidence"] = [result["score"] for result in sentiments]

# 결과 저장
output_path = 'analyzed_reviews.csv'
data.to_csv(output_path, index=False)

# 결과 확인
print(data[["review_text", "sentiment", "confidence"]].head())


Device set to use cpu


                                         review_text sentiment  confidence
0  Dealing 5 billion damage to a 100 health point...  POSITIVE    0.992166
1  Welcome to Warframe. Play the same mission 231...  NEGATIVE    0.997277
2  The best part of this game is figuring out wha...  POSITIVE    0.978301
3  I hope you guys from The First Descendant, Des...  POSITIVE    0.998829
4  Once Cross save is available, I will start pla...  POSITIVE    0.998830


In [7]:
analyzed_reviews = pd.read_csv('analyzed_reviews.csv')

In [20]:
# 특정 조건을 만족하는 행을 찾기
for index, row in analyzed_reviews.iterrows():
    if row['confidence'] < 0.9:  # 조건 설정
        print(row)
        print("-" * 50)  # 구분선 추가


title                                                      Warframe
id                                                      ProfileName
review_text       You need to be on crack to understand the game...
recommendation                                          Recommended
posted_date                                              2023.04.27
playtime                                                      748.6
sentiment                                                  NEGATIVE
confidence                                                 0.779638
Name: 19, dtype: object
--------------------------------------------------
title                Warframe
id                      Spasp
review_text          warframe
recommendation    Recommended
posted_date        2024.01.22
playtime                  1.0
sentiment            POSITIVE
confidence           0.789745
Name: 164, dtype: object
--------------------------------------------------
title                                            Warframe
id     

In [15]:
len(analyzed_reviews)

2780

TypeError: string indices must be integers