> ## 진행순서

1. 벡터화 진행 (word2vec)
2. 로지스틱 회귀분석 진행
3. pycaret을 사용해서 적절한 모델을 선정
4. 해당 모델로 다시 진행해 보기

In [1]:
import pandas as pd

In [18]:
df = pd.read_csv('./beer_tok.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [4]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,beer_name,ID,rating,date,review,Aroma,Appearance,Flavor,Mouthfeel,beer_st,alcohol,tokens,tokens_filtered
0,0,Kloud Original Gravity,"vinivini(9,584)",2.9,"December 29, 2022","Straight from the can. Clean, malty, bready & grassy. Light sweet, light to medium bitter. Mouth feel: light to medium body, avera to lively carbonation.",2,4,3,4,Pale Lager - International / Premium,5.0%,"['straight', 'can', 'clean', 'malty', 'bready', 'grassy', 'light', 'sweet', 'light', 'medium', 'bitter', 'mouth', 'feel', 'light', 'medium', 'body', 'avera', 'lively', 'carbonation']","['straight', 'clean', 'malty', 'bready', 'grassy', 'light', 'sweet', 'light', 'medium', 'bitter', 'mouth', 'feel', 'light', 'medium', 'body', 'avera', 'lively', 'carbonation']"


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68567 entries, 0 to 68566
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       68567 non-null  int64  
 1   beer_name        68567 non-null  object 
 2   ID               68567 non-null  object 
 3   rating           68567 non-null  float64
 4   date             68567 non-null  object 
 5   review           68567 non-null  object 
 6   Aroma            68567 non-null  int64  
 7   Appearance       68567 non-null  int64  
 8   Flavor           68567 non-null  int64  
 9   Mouthfeel        68567 non-null  int64  
 10  beer_st          68567 non-null  object 
 11  alcohol          68567 non-null  object 
 12  tokens           68567 non-null  object 
 13  tokens_filtered  68567 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 7.3+ MB


## Word2vec

- Word2Vec의 하이퍼파라미터값<br>
vector_size = 워드 벡터의 특징 값. 즉, 임베딩 된 벡터의 차원<br>
window = 컨텍스트 윈도우 크기<br>
min_count = 단어 최소 빈도 수 제한 (빈도가 적은 단어들은 학습하지 않는다.)<br>
workers = 학습을 위한 프로세스 수<br>
sg = 0은 CBOW, 1은 Skip-gram

In [5]:
import numpy as np

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [19]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

# 전처리된 리뷰 데이터가 담긴 데이터프레임
df = pd.DataFrame(df)  # 리뷰 데이터가 담긴 데이터프레임

# NaN 값 제거
df = df.dropna(subset=['tokens'])

# Word2Vec 모델 학습
sentences = [token.split() for token in df['tokens']]
model = Word2Vec(sentences, min_count=1, vector_size=100, workers=4, window=5, sg=1)

# 모든 리뷰 데이터에 대한 단어 벡터화
word_vectors = model.wv
all_reviews_vectors = []

for review in df['tokens']:
    review_vectors = [word_vectors[word] for word in review.split()]
    all_reviews_vectors.append(review_vectors)

# 결과 확인
for vectors in all_reviews_vectors[:3]:
    for vector in vectors:
        print(vector)


[-0.15594584 -0.0788304   0.00509457  0.08341511  0.19723573 -0.43571678
 -0.0746394   0.2629544  -0.20223895 -0.07520092 -0.18496907 -0.02631396
  0.13969627  0.17247403 -0.08153775 -0.08569057 -0.0610023  -0.08813538
  0.05876917 -0.3905474   0.17114452 -0.02232898  0.06568035 -0.08386966
  0.12500404  0.05853286 -0.2581614   0.07841756 -0.18790713  0.09285856
  0.19969238 -0.02986326 -0.00898096 -0.1567469  -0.05453914  0.40551993
  0.06092166 -0.0526864  -0.16376941 -0.26098368  0.04635953 -0.27962035
 -0.2686687   0.11178658  0.19723903 -0.21295507 -0.13022438 -0.21289098
  0.00734454 -0.13960153  0.13442379 -0.1635759   0.09860782  0.03759647
 -0.04945132  0.00113565  0.02462682 -0.22455047 -0.28137532  0.01673091
  0.19261278 -0.13733575  0.05146943 -0.01408944 -0.17785493  0.07741385
 -0.01209641  0.22247021 -0.26684391  0.34352446 -0.05729835  0.09485663
  0.21432915  0.05163098  0.1097403  -0.06582137 -0.20172918  0.19720116
 -0.1368405   0.04906264 -0.2651569  -0.14696206 -0

In [40]:
len(all_reviews_vectors)

68567

In [34]:
# 회귀분석을 위해 벡터화 진행된 리뷰 행만 살림
# df_subset = df.iloc[:50974, :]

In [35]:
#df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50974 entries, 0 to 50973
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       50974 non-null  int64  
 1   beer_name        50974 non-null  object 
 2   ID               50974 non-null  object 
 3   rating           50974 non-null  float64
 4   date             50974 non-null  object 
 5   review           50974 non-null  object 
 6   Aroma            50974 non-null  int64  
 7   Appearance       50974 non-null  int64  
 8   Flavor           50974 non-null  int64  
 9   Mouthfeel        50974 non-null  int64  
 10  beer_st          50974 non-null  object 
 11  alcohol          50974 non-null  object 
 12  tokens           50974 non-null  object 
 13  tokens_filtered  50974 non-null  object 
dtypes: float64(1), int64(5), object(8)
memory usage: 5.4+ MB


벡터화는 진행됬는데 어떻게 생겼는지 보기위해 그냥 가져오면 token list 가 키로 사용될 때 문제가 발생함. 따라서 str로 변환한 후에 출력하고자 함

## TF-IDF

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

## Rogistic 회귀분석

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [67]:
X = all_reviews_vectors
y = df['Aroma']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [57]:
all_reviews_vectors[1000]

[array([ 0.40571108, -0.21128343,  0.22419575,  0.5265784 , -0.22395441,
        -0.4631749 ,  0.45516312,  0.495964  , -0.74446756,  0.0638322 ,
        -0.14539701,  0.12449594, -0.07830676,  0.20925735, -0.5521927 ,
         0.14406544,  0.8117755 , -0.47146204, -0.19680338, -0.79133904,
         0.18223152,  0.01192395,  0.01596821,  0.6551435 , -0.24958794,
         0.0659494 , -0.18009283, -0.11215715,  0.03262434, -0.08311585,
        -0.02903092, -0.05164417,  0.5059029 , -0.54920864,  0.08543787,
         0.42749792, -0.1603613 , -0.1434424 , -0.593845  , -0.4971732 ,
        -0.02255994, -0.31135315, -0.32469118,  0.28870684,  0.06272899,
        -0.45874757, -0.2129957 ,  0.3301798 ,  0.03084627,  0.1247837 ,
        -0.26701748,  0.01123512,  0.2029417 ,  0.11228437,  0.10078453,
        -0.54105407, -0.24310112,  0.04360054, -0.7457123 , -0.31498322,
        -0.02376834, -0.31025732,  0.31402043,  0.03375589, -0.18511358,
         0.4826214 ,  0.11233974,  0.6667711 , -0.5

In [69]:
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (54853,) + inhomogeneous part.