In [None]:
!pip install konlpy > /dev/null

### 22_영화평감성분석2.ipynb

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import files
# Upload the data files through Colab. The returned mapping is kept in `up`;
# its keys are used further down as the file names passed to pd.read_csv.
up = files.upload()

Saving naver_movie_test_전처리완료.tsv to naver_movie_test_전처리완료.tsv
Saving naver_movie_train_전처리완료.tsv to naver_movie_train_전처리완료.tsv


In [None]:
# Show the second key of `up` (the name the cell below reads as the training file).
# NOTE(review): which file lands at index 1 presumably depends on upload order — confirm.
list(up.keys())[1]


'naver_movie_train_전처리완료.tsv'

In [None]:
# Pick each split by its file name rather than by its position in `up`, so the
# train/test frames cannot be swapped when the files are uploaded in a
# different order. next() raises StopIteration if a split is missing.
train_path = next(name for name in up if 'train' in name)
test_path = next(name for name in up if 'test' in name)

# Both files are tab-separated.
train_df = pd.read_csv(train_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')
train_df.shape, test_df.shape

((145393, 3), (48852, 3))

In [None]:
# Preview the first three training rows.
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠 포스터보고 초딩영화줄 오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


- Tokenizer 함수 정의

In [None]:
from konlpy.tag import Okt
# One shared Okt instance; okt_tokenizer below calls okt.morphs on it.
okt = Okt()

In [None]:
# Tokens that okt_tokenizer removes from the morpheme list.
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도',
    '를', '으로', '자', '에', '와', '한', '하다', '을',
    'ㅋㅋ', 'ㅠㅠ', 'ㅎㅎ',
]

In [None]:
def okt_tokenizer(text):
    """Split `text` into morphemes with Okt and drop stopwords.

    Args:
        text: The string handed to ``okt.morphs``.

    Returns:
        A list of the morphemes, in their original order, that are not
        members of the module-level ``stopwords`` collection.
    """
    return [morph for morph in okt.morphs(text) if morph not in stopwords]

In [None]:
# Compare the raw Okt morphemes (printed) with the stopword-filtered tokens
# returned by okt_tokenizer (cell result) for the same sentence.
print(okt.morphs('열심히 일한 당신 주말엔 여행을 떠나봐요.'))
okt_tokenizer('열심히 일한 당신 주말엔 여행을 떠나봐요.')

['열심히', '일', '한', '당신', '주말', '엔', '여행', '을', '떠나', '봐요', '.']


['열심히', '일', '당신', '주말', '엔', '여행', '떠나', '봐요', '.']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
pipeline = Pipeline([
    ('TFIDF', TfidfVectorizer(tokenizer=okt_tokenizer)),
    # tokenizer로 함수 지정해서 거쳐서 가도록 함.
    ('LR', LogisticRegression(random_state=2022))
])
%time pipeline.fit(train_df.document, train_df.label)

CPU times: user 6min, sys: 6.06 s, total: 6min 6s
Wall time: 6min 20s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(tokenizer=<function okt_tokenizer at 0x7f9d8f0238c0>)),
                ('LR', LogisticRegression(random_state=2022))])

In [None]:
# Score the fitted pipeline on the held-out test documents and labels.
pipeline.score(test_df.document, test_df.label)

0.8477032670105625

- 실제 데이터 테스트

In [None]:
import re
reviews = ['모든 국민이 봤으면 하는 영화입니다.',
           '생각보다 지루하고 별로였네요... 보면서 좀 졸았습니다.']
# Replace every character outside the Hangul syllable range with a space.
# Build a real list: a bare map() object is a one-shot iterator, so it would be
# empty the second time it is consumed (e.g. when the predict cell is re-run).
reviews = [re.sub('[^가-힣]', ' ', review) for review in reviews]

In [None]:
# Predict labels for the two hand-written reviews with the fitted pipeline.
pipeline.predict(reviews)

array([1, 0])

- 최적 파라미터 찾기
 - 매 시행마다 한글 형태소 분석을 다시 수행하기 때문에 시간이 너무 오래 걸림
 - 최적 파라미터를 찾을 때는 미리 형태소 분석을 해 둔 데이터를 사용할 것

In [None]:
from sklearn.model_selection import GridSearchCV
# Search grid; each key is '<pipeline step name>__<parameter name>'.
params = {
    'TFIDF__ngram_range': [(1, 1), (1, 2)],
    'TFIDF__max_df': [0.95, 0.98],
    'LR__C': [1, 5],
}

In [None]:
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy',cv=3)
%time grid_pipe.fit(train_df.document, train_df.label)

CPU times: user 2h 33min 6s, sys: 3min 37s, total: 2h 36min 43s
Wall time: 2h 30min 57s


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('TFIDF',
                                        TfidfVectorizer(tokenizer=<function okt_tokenizer at 0x7f9d8f0238c0>)),
                                       ('LR',
                                        LogisticRegression(random_state=2022))]),
             param_grid={'LR__C': [1, 5], 'TFIDF__max_df': [0.95, 0.98],
                         'TFIDF__ngram_range': [(1, 1), (1, 2)]},
             scoring='accuracy')

- GridSearchCV로 찾은 최적의 파라미터(TF-IDF 파이프라인)로 테스트 데이터 평가

In [None]:
# Print the winning parameter combination and its cross-validation score, then
# score the refit best estimator on the test set.
print(grid_pipe.best_params_,  grid_pipe.best_score_)
best_pipeline = grid_pipe.best_estimator_
best_pipeline.score(test_df.document, test_df.label)

{'LR__C': 5, 'TFIDF__max_df': 0.95, 'TFIDF__ngram_range': (1, 2)} 0.8548829563074446


0.8615614509129615

#### 문서 유사도 - 코사인 유사도

In [None]:
# Three short example sentences used for the similarity demo below.
doc_list = [
    'if you take the blue pill, the story ends',
    'if you take the red pill, you stay in Wonderland',
    'if you take the red pill, I show you how deep the rabbit hole goes',
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fit a default CountVectorizer on the example sentences and show the
# resulting matrix as a dense array (one row per sentence).
cvect = CountVectorizer()
doc_cv = cvect.fit_transform(doc_list)
doc_cv.toarray()

array([[1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 2],
       [0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 2, 0, 2]])

In [None]:
# Unpack the three rows of the count matrix into one vector per sentence.
d1, d2, d3 = doc_cv.toarray()
d1

array([1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 0, 1])

In [None]:
# Same sentences vectorized with a default TfidfVectorizer, shown densely.
tvect = TfidfVectorizer()
doc_tv = tvect.fit_transform(doc_list)
doc_tv.toarray()

array([[0.4155636 , 0.        , 0.4155636 , 0.        , 0.        ,
        0.        , 0.24543856, 0.        , 0.24543856, 0.        ,
        0.        , 0.        , 0.        , 0.4155636 , 0.24543856,
        0.49087711, 0.        , 0.24543856],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.23402865, 0.39624495, 0.23402865, 0.        ,
        0.3013545 , 0.        , 0.39624495, 0.        , 0.23402865,
        0.23402865, 0.39624495, 0.4680573 ],
       [0.        , 0.30985601, 0.        , 0.30985601, 0.30985601,
        0.30985601, 0.18300595, 0.        , 0.18300595, 0.30985601,
        0.23565348, 0.30985601, 0.        , 0.        , 0.18300595,
        0.3660119 , 0.        , 0.3660119 ]])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Wrap each vector in a list so that it is passed as a 2-D, single-row input.
cosine_similarity([d1], [d2])

array([[0.6092718]])

### 사용자 기반 협업 필터링(User-based Collaborative Filtering)

In [None]:
import numpy as np
import pandas as pd

In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 7.3 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1630106 sha256=074482da06984a9573f5772f276746c853e44781fc1f765a540a88bfb68245cc
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
from surprise import Dataset
# Load surprise's built-in 'ml-100k' dataset and wrap its raw_ratings records
# in a DataFrame with one named column per field.
data = Dataset.load_builtin('ml-100k', prompt=False)
df = pd.DataFrame(data.raw_ratings, columns=['user-id','movie-id','rating','timestamp'])
df.head()

Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


Unnamed: 0,user-id,movie-id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [None]:
# Number of rating rows and columns.
df.shape

(100000, 4)

### 1. Adjacency Matrix(인접 행렬) 생성
 - 행은 사용자
 - 열은 영화
 - 내용은 평점

In [None]:
# Convert the rating records to an integer ndarray (one row per rating) and
# show the column-wise minimum of each field.
raw_data = np.array(data.raw_ratings, dtype=int)
np.min(raw_data, axis=0)

array([        1,         1,         1, 874724710])

In [None]:
# Column-wise maximum of each field.
np.max(raw_data, axis=0)


array([      943,      1682,         5, 893286638])

In [None]:
# Rebuild the array from the source ratings before shifting: the in-place `-=`
# on its own is not idempotent, so re-running this cell would otherwise
# subtract 1 from the ids a second time.
raw_data = np.array(data.raw_ratings, dtype=int)
# Shift the first two columns (user id, movie id) down by one so they can be
# used directly as 0-based row/column indices.
raw_data[:, :2] -= 1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

In [None]:
# Size the matrix from the largest id rather than the number of distinct ids.
# The ids start at 1 (see the column minimums above), so the largest id equals
# the length needed once ids are shifted to 0-based. Counting distinct ids
# gives the same number only when there are no gaps in the id sequence.
nrows = int(df['user-id'].astype(int).max())
ncols = int(df['movie-id'].astype(int).max())
nrows, ncols

(943, 1682)

In [None]:
# Binary user x movie matrix: 1 where a rating row exists for the pair, else 0.
adj_matrix = np.zeros((nrows, ncols), dtype=int)
user_idx, movie_idx = raw_data[:, 0], raw_data[:, 1]
# Every assigned value is 1, so repeated (user, movie) pairs cannot change the result.
adj_matrix[user_idx, movie_idx] = 1
adj_matrix[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [None]:
# Treat user 0 as "me"; my_vector is that user's row of the matrix.
my_id, my_vector = 0, adj_matrix[0]

In [None]:
# Dot product with users 10 and 20: on the 0/1 matrix this counts the movies
# that both rows have set to 1.
np.dot(my_vector, adj_matrix[10]), np.dot(my_vector, adj_matrix[20])

(71, 42)

In [None]:
# Find the other user whose row has the largest dot product with my_vector.
# The comparison with oneself is skipped via my_id instead of a hard-coded
# range(1, ...), so the search stays correct if my_id is changed.
# If no other user has a positive score, the result stays (0, my_id).
best_score, best_id = 0, my_id
for other_id, other_vector in enumerate(adj_matrix):
    if other_id == my_id:
        continue
    overlap = np.dot(my_vector, other_vector)
    # Strict '>' keeps the first user encountered among ties.
    if overlap > best_score:
        best_score, best_id = overlap, other_id
best_score, best_id

(183, 275)

In [None]:
# Row of the best-matching user; compare a ten-movie slice of both rows.
best_vector = adj_matrix[best_id]
my_vector[200:210], best_vector[200:210]

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1]))

In [None]:
# Recommend the movie indices where my entry is 0 and the best match's entry is 1.
unseen_by_me = my_vector == 0
seen_by_match = best_vector == 1
# flatnonzero yields the indices in ascending order; tolist() gives plain ints.
rec_list = np.flatnonzero(unseen_by_me & seen_by_match).tolist()
len(rec_list), rec_list[:10]

(335, [272, 273, 275, 280, 281, 283, 287, 288, 289, 290])

In [None]:
# User x movie matrix holding the rating value; pairs with no rating row stay 0.
adj_matrix = np.zeros((nrows, ncols), dtype=int)
# Kept as an explicit loop so rows are applied in file order (a later row for
# the same pair overwrites an earlier one, exactly as written).
for row in raw_data:
    adj_matrix[row[0], row[1]] = row[2]
adj_matrix[:5]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0]])

In [None]:
# 누가 나랑 닮았나?
# Who is most similar to me? Pick the other user whose row has the smallest
# Euclidean distance to mine.
my_vector = adj_matrix[my_id]
# Start from infinity rather than an arbitrary large constant, so the first
# candidate always wins the first comparison regardless of the data's scale.
best_score, best_match_id = np.inf, my_id

for other_id, other_vector in enumerate(adj_matrix):
    # Skip the comparison with oneself (distance 0 would always win).
    if other_id == my_id:
        continue
    euc = np.sqrt(np.sum(np.square(my_vector - other_vector)))
    # Strict '<' keeps the first user encountered among ties.
    if euc < best_score:
        best_score, best_match_id = euc, other_id

best_score, best_match_id

(55.06359959174482, 737)

In [None]:
# Movies where my entry is 0 and the closest user's entry is at least 1.
best_match_vector = adj_matrix[best_match_id]
candidate_mask = (my_vector == 0) & (best_match_vector >= 1)
# flatnonzero yields the indices in ascending order; tolist() gives plain ints.
res_list = np.flatnonzero(candidate_mask).tolist()
len(res_list), res_list[:10]

(48, [297, 312, 317, 342, 356, 366, 379, 384, 392, 402])

In [None]:
# Same selection as before, but keep only movies the closest user rated 4 or higher.
candidate_mask = (my_vector == 0) & (best_match_vector >= 4)
# flatnonzero yields the indices in ascending order; tolist() gives plain ints.
res_list = np.flatnonzero(candidate_mask).tolist()
len(res_list), res_list[:10]

(21, [312, 317, 356, 384, 407, 422, 433, 454, 469, 473])