In [1]:
import os
import numpy as np 
import pandas as pd 

import tweepy
from konlpy.tag import Okt, Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
from hanspell import spell_checker

from pprint import pprint
import re
from math import log # IDF 계산을 위해

import warnings
warnings.filterwarnings(action='ignore')

In [2]:

current_dir = os.getcwd()
genre_df = pd.read_csv(current_dir + '/data/genre_df.csv')
genre_df.head()

Unnamed: 0,genre,script
0,액션,"액션 영화(action film)는 영화의 한 갈래로, 등장인물의 육체적인 움직임을..."
1,코미디,"코미디 영화(comedy film)는 유머에 중점을 둔 영화 장르로, 관객들로부터 ..."
2,다큐멘터리,"다큐멘터리 영화(영어: Documentary Film), 또는 기록 영화(記錄映畵,..."
3,판타지,판타지 영화란 판타지적인 내용을 영화로 만든 것이다.
4,공포,"공포 영화(恐怖映畫, 영어: horror film)란 귀신이나 유령 등 무서운 것과..."


In [38]:
genre_df.loc[18, 'script'] = "리얼리티 방송(영어: reality television)은 대본에 기반하지 않는, 멜로드라마틱한 상황이나 유머스러운 상황을 표출하는 텔레비전 프로그램의 한 장르로, 전문 배우 대신 일반인을 주연으로 하는 것이 보통이며 상을 수여하는 콘테스트에서도 이따금 이루어진다.[1] 리얼리티 방송은 1948년 TV 시리즈 캔디드 카메라(Candid Camera)에서 비롯한다.[2] 이 장르는 1999년부터 2000년 즈음 빅 브라더와 서바이벌과 같은 텔레비전 시리즈의 성공과 더불어 하나의 현상으로 급격히 발전하였다.[1] 리얼리티 방송 장르에 속하는 프로그램을 흔히 리얼리티 쇼(reality shows)라 부르며 텔레비전 시리즈로 자주 제작된다. 다큐멘터리, 뉴스, 스포츠는 일반적으로 리얼리티 쇼로 분류하지 않는다."

genre_df.loc[9, 'script'] = "그리스 네덜란드 스페인 영국 프랑스 독일"

In [39]:
genre_list = list(genre_df['genre'])

In [40]:
# 불용어 설정
stop_words = set(genre_df['genre'])
stop_words.add('영화')

In [41]:
def extract_kor(doc):
    pattern = re.compile('[^ㄱ-ㅣ가-힣]') # 한글만 추출
    kor_doc = pattern.sub('', doc)
    result = spell_checker.check(kor_doc).checked

    return result

def tokenizing(text):
    tokenizer = Mecab()
    total_tokens = tokenizer.nouns(text)
    total_tokens = [token for token in total_tokens if token not in stop_words and len(token) > 1]
    return ' '.join(total_tokens)
    
def preprocessing(corpus):
    kor_corpus = [extract_kor(doc) for doc in corpus]
    kor_tokens = [tokenizing(doc) for doc in kor_corpus]
    return kor_tokens

In [42]:
contents = list(genre_df['script'])
kor_tokens = preprocessing(contents)
kor_total_tokens = []
for L in kor_tokens:
    kor_total_tokens += L.split()

In [43]:
vocab = list(set(kor_total_tokens))
vocab.sort()

In [44]:
# 총 문서의 수
N = len(kor_tokens) 

def tf(t, d):
  return d.count(t)

def idf(t):
  df = 0
  for doc in kor_tokens:
    df += t in doc
  return log(N/(df+1))

def tfidf(t, d):
  return tf(t,d)* idf(t)

In [45]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(N):
  result.append([])
  d = kor_tokens[i]
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)
tf_.head()

Unnamed: 0,가상,갈래,감상,개척사,건전,게임,경기,경련,경우,고조,...,현대,현상,현실,혐오감,형식,형태,호러,혼재,활극,활약
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,5,0,0,0


In [46]:
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_.head()

Unnamed: 0,IDF
가상,2.251292
갈래,2.251292
감상,2.251292
개척사,2.251292
건전,2.251292


In [47]:
result = []
for i in range(N):
  result.append([])
  d = kor_tokens[i]
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_.head()

Unnamed: 0,가상,갈래,감상,개척사,건전,게임,경기,경련,경우,고조,...,현대,현상,현실,혐오감,형식,형태,호러,혼재,활극,활약
0,0.0,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,2.251292
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,...,0.0,0.0,0.0,0.0,0.0,0.0,11.256459,0.0,0.0,0.0


In [48]:
df = tfidf_.transpose()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
가상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0
갈래,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
감상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0
개척사,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
건전,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0


In [49]:
df.columns = genre_list

In [52]:
for genre in df.columns:
    print('========= %s =========' % genre)
    print(df[genre].sort_values(ascending=False).head(5))
    print(df[df[genre]>0][genre].sort_values(ascending=False))

활약     2.251292
해결     2.251292
정의     2.251292
주안점    2.251292
의미     2.251292
Name: 액션, dtype: float64
갈래      2.251292
의미      2.251292
활극      2.251292
해결      2.251292
주안점     2.251292
정의      2.251292
재난      2.251292
권선징악    2.251292
활약      2.251292
육체      2.251292
움직임     2.251292
이용      1.845827
등장인물    1.845827
이야기     1.845827
인물      1.558145
서부극     1.558145
등장      1.558145
포함      1.335001
사건      1.152680
Name: 액션, dtype: float64
관객    4.502584
중점    2.251292
웃음    2.251292
방법    2.251292
행동    2.251292
Name: 코미디, dtype: float64
관객     4.502584
과장     2.251292
방법     2.251292
웃음     2.251292
중점     2.251292
행동     2.251292
대부분    1.845827
상황     1.845827
성격     1.845827
유머     1.845827
제작     1.845827
장르     1.093087
Name: 코미디, dtype: float64
기록     4.502584
시각     2.251292
라틴어    2.251292
시도     2.251292
문화     2.251292
Name: 다큐멘터리, dtype: float64
기록     4.502584
논픽션    2.251292
단어     2.251292
라틴어    2.251292
문화     2.251292
시각     2.251292
시도     2.251292
어시     2