In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from tqdm import tqdm, tqdm_pandas
from sklearn.metrics import accuracy_score
tqdm.pandas()

np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# 데이터 불러오기
df = pd.read_csv('/content/drive/MyDrive/project4(boardgame_review).csv')
df

Unnamed: 0,rating,comment,ID,name
0,10.0,Hands down my favorite new game of BGG CON 200...,30549,Pandemic
1,10.0,I tend to either love or easily tire of co-op ...,30549,Pandemic
2,10.0,This is an amazing co-op game. I play mostly ...,30549,Pandemic
3,10.0,Hey! I can finally rate this game I've been pl...,30549,Pandemic
4,10.0,"Fun, fun game. Strategy is required, but defin...",30549,Pandemic
...,...,...,...,...
715270,2.0,We sold this game last week. Despite having ow...,12962,Reef Encounter
715271,2.0,Opened the box and read the rules and just can...,12962,Reef Encounter
715272,2.0,There must be a good game in there because so ...,12962,Reef Encounter
715273,2.0,Played with my gaming group. We're experience...,12962,Reef Encounter


In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
# 데이터 전처리 함수

def tokenize(text):

    # 소문자로 치환
    tokens = text.lower()

    # 정규식 적용
    tokens = re.sub("[^a-z ]","", tokens)
    
    return tokens

In [5]:
# 전처리

df['step1']=df['comment'].apply(tokenize)

In [6]:
# 전처리 이후 결측치 제거 및 토큰화

df['step1'].replace('', np.nan, inplace=True)
df.dropna(inplace=True)

df['step2']=df.step1.str.split()
df

Unnamed: 0,rating,comment,ID,name,step1,step2
0,10.0,Hands down my favorite new game of BGG CON 200...,30549,Pandemic,hands down my favorite new game of bgg con w...,"[hands, down, my, favorite, new, game, of, bgg..."
1,10.0,I tend to either love or easily tire of co-op ...,30549,Pandemic,i tend to either love or easily tire of coop g...,"[i, tend, to, either, love, or, easily, tire, ..."
2,10.0,This is an amazing co-op game. I play mostly ...,30549,Pandemic,this is an amazing coop game i play mostly wi...,"[this, is, an, amazing, coop, game, i, play, m..."
3,10.0,Hey! I can finally rate this game I've been pl...,30549,Pandemic,hey i can finally rate this game ive been play...,"[hey, i, can, finally, rate, this, game, ive, ..."
4,10.0,"Fun, fun game. Strategy is required, but defin...",30549,Pandemic,fun fun game strategy is required but definite...,"[fun, fun, game, strategy, is, required, but, ..."
...,...,...,...,...,...,...
715270,2.0,We sold this game last week. Despite having ow...,12962,Reef Encounter,we sold this game last week despite having own...,"[we, sold, this, game, last, week, despite, ha..."
715271,2.0,Opened the box and read the rules and just can...,12962,Reef Encounter,opened the box and read the rules and just can...,"[opened, the, box, and, read, the, rules, and,..."
715272,2.0,There must be a good game in there because so ...,12962,Reef Encounter,there must be a good game in there because so ...,"[there, must, be, a, good, game, in, there, be..."
715273,2.0,Played with my gaming group. We're experience...,12962,Reef Encounter,played with my gaming group were experienced ...,"[played, with, my, gaming, group, were, experi..."


In [7]:
# 불용어 제거

STOP_WORDS = nlp.Defaults.stop_words.union(['board'])

step3 = []

for doc in tqdm(df['step2']):
    doc_tokens = [x for x in doc if x not in list(STOP_WORDS) and 'play' not in x and 'game' not in x]
    
    step3.append(doc_tokens)

df['step3'] = step3

100%|██████████| 714849/714849 [03:42<00:00, 3210.89it/s]


In [8]:
#어간 추출

from nltk.stem import PorterStemmer

ps = PorterStemmer()

tokens = []
for doc in tqdm(df['step3']):
    doc_tokens = []
    for token in doc:
        doc_tokens.append(ps.stem(token))
    tokens.append(doc_tokens)

df['tokens'] = tokens

100%|██████████| 714849/714849 [06:41<00:00, 1779.42it/s]


In [9]:
df

Unnamed: 0,rating,comment,ID,name,step1,step2,step3,tokens
0,10.0,Hands down my favorite new game of BGG CON 200...,30549,Pandemic,hands down my favorite new game of bgg con w...,"[hands, down, my, favorite, new, game, of, bgg...","[hands, favorite, new, bgg, con, times, row, g...","[hand, favorit, new, bgg, con, time, row, good..."
1,10.0,I tend to either love or easily tire of co-op ...,30549,Pandemic,i tend to either love or easily tire of coop g...,"[i, tend, to, either, love, or, easily, tire, ...","[tend, love, easily, tire, coop, pandemic, joi...","[tend, love, easili, tire, coop, pandem, join,..."
2,10.0,This is an amazing co-op game. I play mostly ...,30549,Pandemic,this is an amazing coop game i play mostly wi...,"[this, is, an, amazing, coop, game, i, play, m...","[amazing, coop, wife, cant, imagine, getting, ...","[amaz, coop, wife, cant, imagin, get, tire, wi..."
3,10.0,Hey! I can finally rate this game I've been pl...,30549,Pandemic,hey i can finally rate this game ive been play...,"[hey, i, can, finally, rate, this, game, ive, ...","[hey, finally, rate, ive, couple, years, like,...","[hey, final, rate, ive, coupl, year, like, pan..."
4,10.0,"Fun, fun game. Strategy is required, but defin...",30549,Pandemic,fun fun game strategy is required but definite...,"[fun, fun, game, strategy, is, required, but, ...","[fun, fun, strategy, required, definitely, acc...","[fun, fun, strategi, requir, definit, access, ..."
...,...,...,...,...,...,...,...,...
715270,2.0,We sold this game last week. Despite having ow...,12962,Reef Encounter,we sold this game last week despite having own...,"[we, sold, this, game, last, week, despite, ha...","[sold, week, despite, having, owned, months, m...","[sold, week, despit, have, own, month, manag, ..."
715271,2.0,Opened the box and read the rules and just can...,12962,Reef Encounter,opened the box and read the rules and just can...,"[opened, the, box, and, read, the, rules, and,...","[opened, box, read, rules, people, think, good...","[open, box, read, rule, peopl, think, good, ne..."
715272,2.0,There must be a good game in there because so ...,12962,Reef Encounter,there must be a good game in there because so ...,"[there, must, be, a, good, game, in, there, be...","[good, people, like, cant, wont, find, theme, ...","[good, peopl, like, cant, wont, find, theme, d..."
715273,2.0,Played with my gaming group. We're experience...,12962,Reef Encounter,played with my gaming group were experienced ...,"[played, with, my, gaming, group, were, experi...","[gaming, group, experienced, euro, far, favori...","[game, group, experienc, euro, far, favorit, w..."


In [10]:
df['rating'].mean()

7.225462159322971

In [11]:
# 과정 정리 및 라벨링

df = df[['rating','comment','name','tokens']]

def labeling(rating):
    if rating >= 7.2:
        return 1
    else:
        return 0
    
df['label'] = df['rating'].apply(lambda x: labeling(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [12]:
df

Unnamed: 0,rating,comment,name,tokens,label
0,10.0,Hands down my favorite new game of BGG CON 200...,Pandemic,"[hand, favorit, new, bgg, con, time, row, good...",1
1,10.0,I tend to either love or easily tire of co-op ...,Pandemic,"[tend, love, easili, tire, coop, pandem, join,...",1
2,10.0,This is an amazing co-op game. I play mostly ...,Pandemic,"[amaz, coop, wife, cant, imagin, get, tire, wi...",1
3,10.0,Hey! I can finally rate this game I've been pl...,Pandemic,"[hey, final, rate, ive, coupl, year, like, pan...",1
4,10.0,"Fun, fun game. Strategy is required, but defin...",Pandemic,"[fun, fun, strategi, requir, definit, access, ...",1
...,...,...,...,...,...
715270,2.0,We sold this game last week. Despite having ow...,Reef Encounter,"[sold, week, despit, have, own, month, manag, ...",0
715271,2.0,Opened the box and read the rules and just can...,Reef Encounter,"[open, box, read, rule, peopl, think, good, ne...",0
715272,2.0,There must be a good game in there because so ...,Reef Encounter,"[good, peopl, like, cant, wont, find, theme, d...",0
715273,2.0,Played with my gaming group. We're experience...,Reef Encounter,"[game, group, experienc, euro, far, favorit, w...",0


In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size = 0.2, random_state = 42, stratify = df['name'])

In [14]:
X_train = train['tokens']
y_train = np.array(train['label'])

X_test = test['tokens']
y_test = np.array(test['label'])

In [15]:
from collections import Counter

In [16]:
# lecture note N421 토큰 카운트

def word_count(docs):
    """
    토큰화된 문서들을 입력받아 토큰별로 개수를 카운트 하고 관련된 속성을 가진 데이터프레임을 반환합니다.
    
    Args:
        docs (series or list): 토큰화 된 문서가 들어있는 list
    Returns:
        list: Dataframe
    """
    
    # word_counts : 말뭉치에서 단어의 개수입니다.
    word_counts = Counter()

    # word_in_docs : 단어가 존재하는 문서의 빈도입니다. 해당 단어가 한 번 이상 존재하면 +1
    word_in_docs = Counter()

    # total_docs : 전체 문서의 개수입니다.
    total_docs = len(docs)

    for doc in docs:
        word_counts.update(doc)
        word_in_docs.update(set(doc))

    temp = zip(word_counts.keys(), word_counts.values())

    wc = pd.DataFrame(temp, columns = ['word', 'count'])

    # rank 열에 단어 빈도 순으로 순위를 저장합니다.
    # method='first': 같은 값의 경우 먼저나온 요소를 상위에 배치합니다.
    wc['rank'] = wc['count'].rank(method='first', ascending=False).astype(int)
    total = wc['count'].sum()

    # percent 열에는 말뭉치 내 단어의 비율을 계산합니다.
    wc['percent'] = wc['count'].apply(lambda x: x / total)

    wc = wc.sort_values(by='rank')

    # cul_percent 열에는 누적 비율을 저장합니다.
    wc['cul_percent'] = wc['percent'].cumsum()

    temp2 = zip(word_in_docs.keys(), word_in_docs.values())
    ac = pd.DataFrame(temp2, columns=['word', 'word_in_docs'])
    wc = ac.merge(wc, on='word')
    
    # word_in_docs_percent 열에는 전체 문서 중 해당 단어가 존재하는 문서의 비율을 저장합니다.
    wc['word_in_docs_percent'] = wc['word_in_docs'].apply(lambda x: x / total_docs)

    return wc.sort_values(by='rank')

In [17]:
wc = word_count(X_train)
wc

Unnamed: 0,word,word_in_docs,count,rank,percent,cul_percent,word_in_docs_percent
21,like,145494,197261,1,1.417598e-02,0.014176,0.254414
28,card,97031,164432,2,1.181675e-02,0.025993,0.169671
69,fun,128233,150177,3,1.079233e-02,0.036785,0.224231
125,time,107367,135622,4,9.746348e-03,0.046531,0.187744
298,good,101562,121185,5,8.708846e-03,0.055240,0.177594
...,...,...,...,...,...,...,...
235120,implementieren,1,1,235123,7.186406e-08,1.000000,0.000002
235125,kippen,1,1,235124,7.186406e-08,1.000000,0.000002
235107,siegreichen,1,1,235125,7.186406e-08,1.000000,0.000002
235117,kolonialherrn,1,1,235126,7.186406e-08,1.000000,0.000002


In [18]:
print(len(wc[wc['word_in_docs_percent'] >= 0.001])) #등장 빈도 0.1% 이상 단어 2326개
wc[wc['word_in_docs_percent'] >= 0.001].head(10)

2319


Unnamed: 0,word,word_in_docs,count,rank,percent,cul_percent,word_in_docs_percent
21,like,145494,197261,1,0.014176,0.014176,0.254414
28,card,97031,164432,2,0.011817,0.025993,0.169671
69,fun,128233,150177,3,0.010792,0.036785,0.224231
125,time,107367,135622,4,0.009746,0.046531,0.187744
298,good,101562,121185,5,0.008709,0.05524,0.177594
34,great,93834,108940,6,0.007829,0.063069,0.16408
160,lot,74678,88201,7,0.006338,0.069408,0.130584
16,feel,64750,80458,8,0.005782,0.07519,0.113223
2,enjoy,70372,79152,9,0.005688,0.080878,0.123054
74,love,64658,75888,10,0.005454,0.086331,0.113062


In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [20]:
vocab_size = len(wc[wc['word_in_docs_percent'] >= 0.001]) + 2 # 패딩토큰과 OOV 토큰을 고려하여 +1
tokenizer = Tokenizer(vocab_size, oov_token='OOV')
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [21]:
drop_train = [index for index, sentence in enumerate(X_train) if len(sentence) < 1] # 말뭉치 단어 선정으로 결측치가 된 행의 인덱스
len(drop_train)

1549

In [22]:
# 결측치 제거
X_train = np.delete(X_train, drop_train, axis=0)
y_train = np.delete(y_train, drop_train, axis=0)

  arr = asarray(arr)


In [23]:
# 패딩
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [24]:
desc = pd.Series([len(x) for x in X_train]).describe(percentiles=[0.95]) # 리뷰의 95% 는 길이가 67 이하
desc

count    570330.000000
mean         24.398438
std          30.102313
min           1.000000
50%          16.000000
95%          67.000000
max        2022.000000
dtype: float64

In [25]:
maxlen = int(desc['95%'])
X_train = pad_sequences(X_train, maxlen = maxlen)
X_test = pad_sequences(X_test, maxlen = maxlen)

In [26]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model = Sequential()
model.add(Embedding(vocab_size, 128))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=20, callbacks=[es, mc], batch_size=128, validation_split=0.2)

Epoch 1/20
Epoch 1: val_acc improved from -inf to 0.71033, saving model to best_model.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.71033 to 0.71437, saving model to best_model.h5
Epoch 3/20
Epoch 3: val_acc improved from 0.71437 to 0.71738, saving model to best_model.h5
Epoch 4/20
Epoch 4: val_acc did not improve from 0.71738
Epoch 5/20
Epoch 5: val_acc did not improve from 0.71738
Epoch 6/20
Epoch 6: val_acc did not improve from 0.71738
Epoch 7/20
Epoch 7: val_acc did not improve from 0.71738
Epoch 7: early stopping


<keras.callbacks.History at 0x7efc32c13290>

In [27]:
best_model = load_model('best_model.h5')
print("Chance Level :",round(accuracy_score([1 for i in range(len(y_test))],y_test),4) )
print("모델 정확도 :", round(best_model.evaluate(X_test, y_test)[1],4))

Chance Level : 0.5078
모델 정확도 : 0.7165


In [28]:
# 리뷰 감성 함수
def predict(text):
  step1 = tokenize(text)
  step2 = step1.split(' ')
  step3 = [x for x in step2 if (x not in list(STOP_WORDS) and 'play' not in x and 'game' not in x)]

  tokens = []
  for token in step3:
    tokens.append(ps.stem(token))

  tokens = tokenizer.texts_to_sequences([tokens])
  tokens = pad_sequences(tokens, maxlen = maxlen)

  preference = float(best_model.predict(tokens))

  return preference

In [49]:
# 테스트
test_text1 = 'this game is pretty nice.'
test_text2 = 'this game is nice.'
print(predict(test_text1))
print(predict(test_text2))

0.477692574262619
0.6013678312301636


In [30]:
feedback = test.copy()

In [31]:
feedback['predict'] = best_model.predict(X_test)

In [32]:
feedback.drop(['tokens'],axis=1,inplace=True)

pd.options.display.max_colwidth = max([len(x) for x in df['comment']]) # comment 전문이 표시 되도록

In [33]:
# 부정적인 리뷰 1
Gloomhaven_nagative = feedback[(feedback['name']=='Gloomhaven') & (feedback['predict'] <= 0.2)].sample(n=5, random_state = 42)
Gloomhaven_nagative

Unnamed: 0,rating,comment,name,label,predict
267023,6.0,"This game surely overstayed its welcome - it was fun at start, became less interesting over time and turned into a chore by the end of the campaign. The story isn't bad, but it's kind of unremarkable. The freshness of playing a new character sometimes lasted for a while, other times disappeared within a couple of missions. Oh, and I hated every single scenario that contained swarms of puny enemies. Moving and ""rolling"" attacks for every one of them sucked out the fun of the game for the whole group.",Gloomhaven,0,0.173345
271930,4.0,"Gloomhaven is D&D for eurogamers; it has slow incremental growth, character customization, and of course skull smashing. And Gloomhaven ports a lot of this quite well. The characters are unique and inviting, the world is evocative and new, and the card powers are fun from the start and only grow stronger with time. Most notable is the action selection mechanism which feeds into a palpable feeling of stamina drain as cards slowly leak out your deck and the scenario creeps to its conclusion. The problem is that Gloomhaven isn't fun. Despite all of its strengths, Gloomhaven is an overly protracted slog with unrelenting bookkeeping. I am a fan of heavy euros, but never has a game felt as laborious or as tedious as Gloomhaven. Even with my friend who'd setup the games in advance, and automate enemy damage/effect tracking, the game felt horribly burdensome. After six scenarios we called it quits and returned to the plethora of other games that bring us joy.",Gloomhaven,0,0.127881
271163,4.0,"Gloomhaven was great at first. The character progression was fun, the scenarios were challenging, the card play was interesting. Quickly things went downhill. The randomness of the card draws for damage continually felt punishing. The loss of a great turn due to a null damage draw was demoralizing because of how difficult it was to manage the hand of player cards (I began missing dice rolling immediately). The scenarios began way too hard and stayed way too hard to be fun for us. The setup and takedown were way too cumbersome (even with the Broken Token insert). The bookkeeping was tedious and fiddly. The Monster AI rules were so difficult to keep straight I never felt confident we were playing the game correctly. Any game where fan-made flow charts are needed to get a grasp of the Monster AI rules is a huge negative. This is the major design flaw of the game in my opinion. If you browse the forums you'll constantly see many questions and contradictory answers regarding this aspect. I was mostly excited to experience the progression of my character. Instead I experienced having to replay the same (90 minute plus long) scenarios over and over again to make any progress with the story arc. In the end, I'd rather play Descent 2.0, Heroquest, or Dungeon World.",Gloomhaven,0,0.113285
267578,6.0,"I'll be honest... only played 2 games so far, but I don't get the hype. Yeah it's solid, but nothing more than that. Hugely overpriced in a way since it has genuinely *too much* content for a game this simplistic. UItimately it comes down to pretty low-tactics/no-strategy dungeon crawling, room spawns enemies, throw biggest attacks, mop up, next room. Repeat ~400 times and you're done with your campaign. I'd recommend pen&paper instead, even pretty mediocre RPGs do this better since it isn't the whole game there. It isn't bad though. It's just too long and way, way, way too samey and simplistic.",Gloomhaven,0,0.021693
202947,9.0,"I like the gameplay and the sense of progression in levelling your character. The world building, story and random encounters are also good and produce a fun, rich atmosphere. I do however feel some things could have been done better. Standard scenario difficulty is high, to the point where you need to do some grinding to beat them. Also the exhaustion mechanic, while providing some interesting urgency to the game, adds yet another opportunity to fail a scenario. Finally the scenario requirements are not well explained, especially considering that there are several subtilities you need to take into account. I also feel the personal quests are highly unbalanced. Some taking as little as 3-4 plays while others take far, far longer.",Gloomhaven,1,0.107424


In [34]:
# 부정적인 리뷰 2
Gloomhaven_important = feedback[(feedback['name']=='Gloomhaven') & (feedback['predict'] <= 0.2) & (feedback['label']==1)].sample(n=5, random_state = 42)
Gloomhaven_important

Unnamed: 0,rating,comment,name,label,predict
154457,9.5,"Gloomhaven is an exceptional legacy game which forces extremely tight decision making with incredibly engaging mechanics. The tradeoff between strength and endurance make for a unique experience that I look forward to every week. The class balancing mechanisms ensure that, over the course of the campaign, players have the opportunity to try a variety of play styles. Even a given class can shift between damage mitigation options, damage-dealing, and support roles depending on party compositions and personal mood. While there are certainly situations which make the play styles feel constricted, the strategy generally encourages creativity and rewards players for actively seeking ways to benefit from class synergies. Difficulty is managed very poorly. Increasing monster stats feels un-interesting, but Gloomhaven is not a game that is fun to lose. The threat should be present, as easy sessions are also boring and slow/tedious, but actually losing is rarely a gratifying experience. ""Kill all enemies"" scenarios can get monotonous, but they feel necessary to make the non-extermination scenarios more engaging. Within scenarios, the progression system encourages players to skirt the line of selfishness and group compassion, as loot cannot be shared or transferred and is necessary to increase character power.",Gloomhaven,1,0.186248
153135,9.5,"Set-up, bookkeeping & teardown is tedious. No linear narrative, ""final boss fight"" was anti-climatic as shown by other reviews/feedbacks. It has many branching stories due to taking on missions/quests. I'm definitely okay with missing even with advantage.(null + rolling modifier).",Gloomhaven,1,0.026369
257921,7.5,"Currently in a game. Nice idea, but must like coop...does get a tad repetitive, but I like the character retirement.",Gloomhaven,1,0.14009
202947,9.0,"I like the gameplay and the sense of progression in levelling your character. The world building, story and random encounters are also good and produce a fun, rich atmosphere. I do however feel some things could have been done better. Standard scenario difficulty is high, to the point where you need to do some grinding to beat them. Also the exhaustion mechanic, while providing some interesting urgency to the game, adds yet another opportunity to fail a scenario. Finally the scenario requirements are not well explained, especially considering that there are several subtilities you need to take into account. I also feel the personal quests are highly unbalanced. Some taking as little as 3-4 plays while others take far, far longer.",Gloomhaven,1,0.107424
255474,8.0,"Played with family a bit but was better solo with 4 Got bored though half way through, story didn’t hold my attention, quite choppy, loads of side quests I didn’t even attempt. Far to difficult but feels like cheating to lower the difficulty.",Gloomhaven,1,0.026743
