In [2]:
import os
import numpy as np 
import pandas as pd 

import tweepy
from konlpy.tag import Okt, Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import nltk
from hanspell import spell_checker

from pprint import pprint
import re
from math import log # IDF 계산을 위해

import warnings
warnings.filterwarnings(action='ignore')

In [3]:

current_dir = os.getcwd()
genre_df = pd.read_csv(current_dir + '/data/genre_df.csv')
genre_df

Unnamed: 0,genre,script
0,액션,"액션 영화(action film)는 영화의 한 갈래로, 등장인물의 육체적인 움직임을..."
1,코미디,"코미디 영화(comedy film)는 유머에 중점을 둔 영화 장르로, 관객들로부터 ..."
2,다큐멘터리,"다큐멘터리 영화(영어: Documentary Film), 또는 기록 영화(記錄映畵,..."
3,판타지,판타지 영화란 판타지적인 내용을 영화로 만든 것이다.
4,공포,"공포 영화(恐怖映畫, 영어: horror film)란 귀신이나 유령 등 무서운 것과..."
5,음악,"뮤지컬 영화(Musical film)는 영화의 장르 중 하나로, 이야기에 혼재하여 ..."
6,로맨스,"로맨스 영화(영어: romance film)는 영화의 장르 중 하나로, 사랑을 주제..."
7,스포츠,"스포츠 영화(-映畵)는 스포츠를 소재로 한 영화로, 스포츠 선수나 팀, 경기 등을 ..."
8,서부,서부극(西部劇)은 미국의 역사 중에서 서부 개척사를 소재로 한 영화나 극 작품이다.
9,Made in Europe,그리스 네덜란드 스페인 영국 프랑스 독일


In [109]:
genre_df.loc[18, 'script'] = "리얼리티 방송(영어: reality television)은 대본에 기반하지 않는, 멜로드라마틱한 상황이나 유머스러운 상황을 표출하는 텔레비전 프로그램의 한 장르로, 전문 배우 대신 일반인을 주연으로 하는 것이 보통이며 상을 수여하는 콘테스트에서도 이따금 이루어진다.[1] 리얼리티 방송은 1948년 TV 시리즈 캔디드 카메라(Candid Camera)에서 비롯한다.[2] 이 장르는 1999년부터 2000년 즈음 빅 브라더와 서바이벌과 같은 텔레비전 시리즈의 성공과 더불어 하나의 현상으로 급격히 발전하였다.[1] 리얼리티 방송 장르에 속하는 프로그램을 흔히 리얼리티 쇼(reality shows)라 부르며 텔레비전 시리즈로 자주 제작된다. 다큐멘터리, 뉴스, 스포츠는 일반적으로 리얼리티 쇼로 분류하지 않는다."

genre_df.loc[9, 'script'] = "그리스 네덜란드 스페인 영국 프랑스 독일"

In [110]:
genre_df.to_csv(current_dir+'/data/genre_df.csv', index=False)

In [57]:
genre_list = list(genre_df['genre'])

In [58]:
# 불용어 설정
stop_words = set(genre_df['genre'])
stop_words.add('영화')

In [59]:
def extract_kor(doc):
    pattern = re.compile('[^ㄱ-ㅣ가-힣]') # 한글만 추출
    kor_doc = pattern.sub('', doc)
    result = spell_checker.check(kor_doc).checked

    return result

def tokenizing(text, ):
    tokenizer = Mecab()
    total_tokens = tokenizer.nouns(text)
    total_tokens = [token for token in total_tokens if token not in stop_words and len(token) > 1]
    return ' '.join(total_tokens)
    
def preprocessing(corpus):
    kor_corpus = [extract_kor(doc) for doc in corpus]
    kor_tokens = [tokenizing(doc) for doc in kor_corpus]
    return kor_tokens

In [60]:
contents = list(genre_df['script'])
kor_tokens = preprocessing(contents)
kor_total_tokens = []
for L in kor_tokens:
    kor_total_tokens += L.split()

In [61]:
vocab = list(set(kor_total_tokens))
vocab.sort()

In [62]:
# 총 문서의 수
N = len(kor_tokens) 

def tf(t, d):
  return d.count(t)

def idf(t):
  df = 0
  for doc in kor_tokens:
    df += t in doc
  return log(N/(df+1))

def tfidf(t, d):
  return tf(t,d)* idf(t)

In [63]:
result = []

# 각 문서에 대해서 아래 연산을 반복
for i in range(N):
  result.append([])
  d = kor_tokens[i]
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tf(t, d))

tf_ = pd.DataFrame(result, columns = vocab)
tf_.head()

Unnamed: 0,가상,갈래,감상,개척사,건전,게임,경기,경련,경우,고조,...,현대,현상,현실,혐오감,형식,형태,호러,혼재,활극,활약
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,5,0,0,0


In [64]:
result = []
for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_.head()

Unnamed: 0,IDF
가상,2.251292
갈래,2.251292
감상,2.251292
개척사,2.251292
건전,2.251292


In [65]:
result = []
for i in range(N):
  result.append([])
  d = kor_tokens[i]
  for j in range(len(vocab)):
    t = vocab[j]
    result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_.head()

Unnamed: 0,가상,갈래,감상,개척사,건전,게임,경기,경련,경우,고조,...,현대,현상,현실,혐오감,형식,형태,호러,혼재,활극,활약
0,0.0,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,2.251292
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,...,0.0,0.0,0.0,0.0,0.0,0.0,11.256459,0.0,0.0,0.0


In [66]:
df = tfidf_.transpose()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
가상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0
갈래,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
감상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0
개척사,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
건전,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0


In [67]:
df.columns = genre_list

In [68]:
genre_list

['액션',
 '코미디',
 '다큐멘터리',
 '판타지',
 '공포',
 '음악',
 '로맨스',
 '스포츠',
 '서부',
 'Made in Europe',
 '애니메이션',
 '범죄',
 '드라마',
 '역사',
 '가족',
 '스릴러',
 'SF',
 '전쟁',
 'Reality TV']

In [73]:
df.head()

Unnamed: 0,액션,코미디,다큐멘터리,판타지,공포,음악,로맨스,스포츠,서부,Made in Europe,애니메이션,범죄,드라마,역사,가족,스릴러,SF,전쟁,Reality TV
가상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0
갈래,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
감상,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0
개척사,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
건전,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.251292,0.0,0.0,0.0,0.0


In [101]:

total_df = pd.DataFrame(columns=['genre', 'keyword'])
for genre in df.columns:
    print('========= {} ========'.format(genre))
    print(df[genre].sort_values(ascending=False).head(10))
    genre_top5 = pd.DataFrame(df[genre].sort_values(ascending=False).head(10)).reset_index().rename(columns={'index':'keyword'})
    genre_top5['genre'] = genre
    genre_top5.drop('{}'.format(genre), axis=1, inplace=True)
    total_df = pd.concat([total_df, genre_top5], axis=0)

활약      2.251292
해결      2.251292
정의      2.251292
주안점     2.251292
의미      2.251292
육체      2.251292
움직임     2.251292
권선징악    2.251292
재난      2.251292
갈래      2.251292
Name: 액션, dtype: float64
관객     4.502584
중점     2.251292
웃음     2.251292
방법     2.251292
행동     2.251292
과장     2.251292
제작     1.845827
상황     1.845827
성격     1.845827
대부분    1.845827
Name: 코미디, dtype: float64
기록     4.502584
시각     2.251292
라틴어    2.251292
시도     2.251292
문화     2.251292
논픽션    2.251292
형태     2.251292
단어     2.251292
어시     2.251292
사실     1.845827
Name: 다큐멘터리, dtype: float64
내용    2.251292
가상    0.000000
인물    0.000000
유래    0.000000
유령    0.000000
유머    0.000000
육체    0.000000
의문    0.000000
의미    0.000000
이상    0.000000
Name: 판타지, dtype: float64
호러      11.256459
자연      11.256459
좀비       4.502584
유령       4.502584
두려움      4.502584
장르       2.732719
소재       2.305359
중첩       2.251292
목표       2.251292
메리셸리     2.251292
Name: 공포, dtype: float64
혼재     2.251292
노래     2.251292
형식     2.251292
뮤지

- 액션: '활약', '해결', '정의', '움직임', '권선징악'
- 코미디: '웃음', '과장', '개그', '풍자', '병맛'
- 다큐멘터리: '기록', '논픽션', '사실', '자연', '사회'
- 판타지: '가상', '의문', '비리얼리즘'
- 공포: '호러', '좀비', '유령', '두려움', '놀람'
- 음악: '노래', '뮤지컬', '연주' ,'뮤지션', '보컬'
- 로맨스: '사랑', '치정', '관계', '로맨틱', '연애', '남녀'
- 스포츠: '선수', '경기', '코치', '기록', '올림픽'
- 서부: '미국', '개척사', '총잡이'
- Made in Europe: '스페인', '독일', '영국', '그리스', '프랑스'
- 애니메이션: '극장판', '캐릭터','만화', '디즈니', '픽사', '더빙'
- 범죄: '범죄자', '사건', '형사', '신문', '교도소'
- 드라마: '정서'
- 역사: '사실', '고려', '전투', '부대', '소련'
- 가족: '오락', '건전', '아이들'
- 스릴러: '대결', '게임', '추적', '살인마', '비밀리'
- SF: '과학', '외계', '우주', '인류', '인공지능', '실험'
- 전쟁: '현대', '중세', '나치', '임무', '암호'
- Reality TV: '리얼리티', '시리즈', '일반',

In [5]:
keyword_dic = {
    '액션': ['활약', '해결', '정의', '움직임', '권선징악'],
    '코미디': ['웃음', '과장', '개그', '풍자', '병맛'],
    '다큐멘터리': ['기록', '논픽션', '사실', '자연', '사회'],
    '판타지': ['가상', '의문', '비리얼리즘', '', ''],
    '공포': ['호러', '좀비', '유령', '두려움', '놀람'],
    '음악': ['노래', '뮤지컬', '연주' ,'뮤지션', '보컬'],
    '로맨스': ['사랑', '치정', '로맨틱', '연애', '남녀'],
    '스포츠': ['선수', '경기', '코치', '기록', '올림픽'],
    '서부': ['미국', '개척사', '총잡이', '', ''],
    'Made in Europe': ['스페인', '독일', '영국', '그리스', '프랑스'],
    '애니메이션': ['극장판', '캐릭터','만화', '디즈니', '더빙'],
    '범죄': ['범죄자', '사건', '형사', '신문', '교도소'],
    '드라마': ['정서', '', '', '', ''],
    '역사': ['사실', '고려', '전투', '부대', '소련'],
    '가족': ['오락', '건전', '아이들', '', ''],
    '스릴러': ['대결', '게임', '추적', '살인마', '비밀리'],
    'SF': ['과학', '외계', '우주', '인류', '인공지능'],
    '전쟁': ['현대', '중세', '나치', '임무', '암호'],
    'Reality TV': ['리얼리티', '시리즈', '일반', '', '']
}

In [7]:
key_df = pd.DataFrame(keyword_dic)
key_df

Unnamed: 0,액션,코미디,다큐멘터리,판타지,공포,음악,로맨스,스포츠,서부,Made in Europe,애니메이션,범죄,드라마,역사,가족,스릴러,SF,전쟁,Reality TV
0,활약,웃음,기록,가상,호러,노래,사랑,선수,미국,스페인,극장판,범죄자,정서,사실,오락,대결,과학,현대,리얼리티
1,해결,과장,논픽션,의문,좀비,뮤지컬,치정,경기,개척사,독일,캐릭터,사건,,고려,건전,게임,외계,중세,시리즈
2,정의,개그,사실,비리얼리즘,유령,연주,로맨틱,코치,총잡이,영국,만화,형사,,전투,아이들,추적,우주,나치,일반
3,움직임,풍자,자연,,두려움,뮤지션,연애,기록,,그리스,디즈니,신문,,부대,,살인마,인류,임무,
4,권선징악,병맛,사회,,놀람,보컬,남녀,올림픽,,프랑스,더빙,교도소,,소련,,비밀리,인공지능,암호,


In [29]:
df = pd.DataFrame(columns=['genre', 'keyword'])
genre_list = []
keyword_list =  []
for col in key_df:
    genre_list.append(col)
    genre_list.append(col)
    genre_list.append(col)
    genre_list.append(col)
    genre_list.append(col)
    keyword_list += list(key_df['{}'.format(col)].values)
    # df = pd.concat(df, key_df['{}'.format(col)], axis=0)


In [30]:
df['keyword'] = keyword_list

In [31]:
df['genre']=genre_list

In [32]:
df

Unnamed: 0,genre,keyword
0,액션,활약
1,액션,해결
2,액션,정의
3,액션,움직임
4,액션,권선징악
...,...,...
90,Reality TV,리얼리티
91,Reality TV,시리즈
92,Reality TV,일반
93,Reality TV,


In [34]:
idx = df[df['keyword']==''].index
df = df.drop(idx, axis=0).reset_index(drop=True)

In [35]:
df

Unnamed: 0,genre,keyword
0,액션,활약
1,액션,해결
2,액션,정의
3,액션,움직임
4,액션,권선징악
...,...,...
78,전쟁,임무
79,전쟁,암호
80,Reality TV,리얼리티
81,Reality TV,시리즈


In [36]:
df.to_csv(current_dir + '/data/genre_keyword.csv', index=False)

In [38]:
genre = pd.read_csv(current_dir + '/data/genre.csv')

In [43]:
genre = genre.rename(columns={'Unnamed: 0':'genre_id'})

In [45]:
genre.to_csv(current_dir + '/data/genre.csv', index=False)

In [48]:
df = df.reset_index().rename(columns={'index':'keyword_id'})

In [51]:
keyword_id.to_csv(current_dir + '/data/keyword_id.csv', index=False)

In [52]:
genre_keyword_id = pd.merge(genre, df, on='genre', how='right')

In [54]:
genre_keyword_id = genre_keyword_id[['genre_id', 'keyword_id']]

In [55]:
genre_keyword_id.to_csv(current_dir + '/data/genre_keyword_id.csv', index=False)

In [56]:
genre = pd.read_csv(current_dir + '/data/genre.csv')
genre_keyword_id = pd.read_csv(current_dir + '/data/genre_keyword_id.csv')
genre_keyword = pd.read_csv(current_dir + '/data/genre_keyword.csv')
keyword_id = pd.read_csv(current_dir + '/data/keyword_id.csv')

In [57]:
genre

Unnamed: 0,genre_id,genre
0,0,액션
1,1,코미디
2,2,다큐멘터리
3,3,판타지
4,4,공포
5,5,음악
6,6,로맨스
7,7,스포츠
8,8,서부
9,9,Made in Europe


In [58]:
genre_keyword_id

Unnamed: 0,genre_id,keyword_id
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
...,...,...
78,17,78
79,17,79
80,18,80
81,18,81


In [59]:
genre_keyword

Unnamed: 0,genre,keyword
0,액션,활약
1,액션,해결
2,액션,정의
3,액션,움직임
4,액션,권선징악
...,...,...
78,전쟁,임무
79,전쟁,암호
80,Reality TV,리얼리티
81,Reality TV,시리즈


In [60]:
keyword_id

Unnamed: 0,keyword_id,keyword
0,0,활약
1,1,해결
2,2,정의
3,3,움직임
4,4,권선징악
...,...,...
78,78,임무
79,79,암호
80,80,리얼리티
81,81,시리즈


In [97]:
total_df.reset_index(inplace=True, drop=True)

In [104]:
list(genre_df['genre'])

['액션',
 '코미디',
 '다큐멘터리',
 '판타지',
 '공포',
 '음악',
 '로맨스',
 '스포츠',
 '서부',
 'Made in Europe',
 '애니메이션',
 '범죄',
 '드라마',
 '역사',
 '가족',
 '스릴러',
 'SF',
 '전쟁',
 'Reality TV']