In [1]:
from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import sqlite3
from Recommend import Recommend as rec
from precleaning import Precleaning  as pc

def keyword_search(keywords):
    model = Word2Vec.load('W2V/MV_Model')
    pre_clean_list = pc.preclean(keywords)
    sentence = []
    for word in pre_clean_list:
        labels = []
        sim_word = model.wv.most_similar(word, topn = 5)
        for label, _ in sim_word:
            labels.append(label)

        for i, word in enumerate(labels):
             sentence += [word] * (9-i)
        
    for keyword in keywords:
        sentence += [keyword] * 10
    sentence = ' '.join(sentence)
        
    # DB 불러오기
    connection = sqlite3.connect("db/data.db")
 
    # 테이블 읽기
    data = pd.read_sql("SELECT * FROM data;", connection)

    # TF-IDF 변환
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data.storyWord)
    sentence_vec = tfidf.transform([sentence])  

    cosine_sim = linear_kernel(sentence_vec, tfidf_matrix) 

    # 각 코사인 유사도 값에 인덱스를 붙임
    simScore = list(enumerate(cosine_sim[-1]))
    # 코사인 유사도가 큰것부터 정렬
    simScore = sorted(simScore, key = lambda x:x[1], reverse = True)
    # 유사한 영화 5개 리스트
    simScore = simScore[1:6]
    movie_indices = [i[0] for i in simScore]
        
    result_df = data.iloc[movie_indices].copy()
    result_df['score'] = [i[1] for i in simScore]

    del result_df['story']
    del result_df['storyWord']
    del result_df['poster']
    del result_df['actor']
    del result_df['directer']
    del result_df['time']
    del result_df['limit_age']
    del result_df['id']
    del result_df['theme']

    return result_df 




In [3]:
keyword_search("코미디 영화")

Unnamed: 0,title,score,keyword1,keyword2,keyword3,keyword4,keyword5
10713,마티아스와 막심,0.25227,세상,오다,지금,뜨겁다,순간
10096,컴 앤 파인드 미,0.232974,벌어지다,미스테리,사라지다,여자친구,흔적
7204,P.S 온리 유,0.230921,죽다,남자,닮다,연인,사랑
15173,스케어리 스토리: 어둠의 속삭임,0.225083,스토리,태우다,죽음,기괴하다,기묘하다
7485,행복의 알리바이: 사진,0.211103,죽음,코믹,에피소드,옴니버스,묶다


In [7]:
keyword_search("컴퓨터")

Unnamed: 0,title,score,keyword1,keyword2,keyword3,keyword4,keyword5
16079,해커스,0.20625,해커,테러,음모,해킹,폭탄
4542,후 엠 아이,0.166869,히틀러,해커,슈퍼히어로,사이트,회사
1916,컨스피러시,0.16384,사이버,해커,공격,경고,위험하다
15931,디버그: 슈퍼컴퓨터 VS 천재해커,0.162803,해커,우주선,해킹,두뇌,공격
15345,어쌔신 걸스,0.16216,해킹,범죄,폭력,정보,사적
