In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Mecab
import math
import hanja
import re
import string
import operator
import random
import matplotlib.pyplot as plt
import itertools
import cnouns as cn
import check_utils as cu
import deep_utils as du
from sklearn.metrics import adjusted_rand_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from datetime import datetime
from sklearn.decomposition import PCA
from gensim import models
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
import gensim.models.doc2vec
from collections import OrderedDict
from gensim.models.doc2vec import LabeledSentence

import multiprocessing
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import cPickle as pickle
from spherecluster import SphericalKMeans
# Number of CPUs reported by the OS; passed as `workers` to every Doc2Vec model below.
cores = multiprocessing.cpu_count()
# Stop right away when gensim reports FAST_VERSION <= -1
# (presumably meaning its optimized training routines are unavailable — confirm).
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

from random import shuffle

from collections import namedtuple

# Record for one document: its token list (`words`), its tag list (`tags`)
# and the name of the data split it was assigned to (`split`).
Articles = namedtuple('Articles', 'words tags split')

# Load Data

In [2]:
# Dataset selector read by the loading cell (1 -> international.p, 2 -> weekly_2.p);
# it is also formatted into the saved model file names.
# test = 1
test = 2

In [None]:
# Load the article DataFrame selected by `test`.
#   test == 1 -> ../datastore/international.p plus its seven topic labels
#   test == 2 -> ../datastore/weekly_2.p (`topics` / `num_clusters` are not defined)
if test == 1:
    topics = {
        0: u'올림픽',
        1: u'테러',
        2: u'브렉시트',
        3: u'미국 금리',
        4: u'바이러스',
        5: u'미국대선,힐러리,트럼프',
        6: u'시리아 전쟁, 난민'
    }
    train_df = pd.read_pickle("../datastore/international.p")
    num_clusters = len(topics)
elif test == 2:
    train_df = pd.read_pickle("../datastore/weekly_2.p")
else:
    # Fail here with a clear message instead of a NameError on `train_df`
    # in a later cell.
    raise ValueError("unsupported value for test: %r (expected 1 or 2)" % (test,))

# Preprocessing

In [None]:
# Wall-clock timestamp taken before preprocessing starts.
s_t_preprocessing = time()

In [None]:
# Tokenize "<title> <content>" of every article; one tokenized value per row.
train_df['target_str'] = [
    cn.tokenize(title + " " + content)
    for title, content in zip(train_df['title'], train_df['content'])
]

In [None]:
# Rows per split bucket: a quarter of the DataFrame.
# `//` keeps this an integer regardless of the division mode (a float would break
# the list indexing that uses `idx // size % 4` later), and the floor of 1 avoids
# a ZeroDivisionError in that expression when there are fewer than four rows.
size = max(1, len(train_df) // 4)
print("%s %s %s" % (size, len(train_df), size * 4))

In [None]:
# Wrap every article in an `Articles` record tagged with its DataFrame index.
# The split name is chosen by bucket number `idx // size`, taken modulo 4.
split_names = ['train', 'test', 'extra', 'extra']
alldocs = []
for idx, row in train_df.iterrows():
    words = row['target_str'].split(' ')
    alldocs.append(Articles(words, [idx], split_names[idx // size % 4]))
# Separate list object so it can be shuffled without reordering `alldocs`.
doc_list = list(alldocs)

In [None]:
# Wall-clock timestamp taken after preprocessing finishes.
e_t_preprocessing = time()

# Learning

In [None]:
# Wall-clock timestamp taken before model training starts.
s_t_learning = time()

In [None]:
# Three paragraph-vector variants; they share every hyper-parameter below and
# differ only in the architecture switches and the context window.
shared_params = dict(size=100, negative=5, hs=0, min_count=2, workers=cores)
simple_models = [
    # PV-DM (Distributed Memory) with concatenation;
    # window=5 on both sides approximates the paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params),
    # PV-DBOW (Distributed Bag of Words)
    Doc2Vec(dm=0, **shared_params),
    # PV-DM with averaging
    Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params),
]

In [None]:
# Calls load_word2vec_format on the first (PV-DM/concat) model with an existing vector file.
# NOTE(review): the return value is discarded — confirm whether this gensim version
# loads the vectors into the instance in place or returns a new model object.
simple_models[0].load_word2vec_format("../datastore/w-w2v.p")

In [None]:
# Build the vocabulary once on the first model, then reset the others from it.
vocab_owner = simple_models[0]
vocab_owner.build_vocab(alldocs)
print(vocab_owner)
for model in simple_models[1:]:
    model.reset_from(vocab_owner)
    print(model)

# Index the models by their string representation, keeping list order.
models_by_name = OrderedDict()
for d2v in simple_models:
    models_by_name[str(d2v)] = d2v

In [None]:
# Manual learning-rate schedule: within a pass every model trains at one fixed
# rate (alpha == min_alpha), then the rate drops by a constant step.
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

for epoch in range(passes):
    pass_no = epoch + 1
    shuffle(doc_list)  # shuffling gets best results

    for name in models_by_name:
        train_model = models_by_name[name]
        train_model.alpha = alpha
        train_model.min_alpha = alpha
        train_model.train(doc_list)
        print("%i passes : %s" % (pass_no, name))

    print('completed pass %i at alpha %f' % (pass_no, alpha))
    alpha = alpha - alpha_delta

In [None]:
# Wall-clock timestamp taken after model training finishes.
e_t_learning = time()

# Save or Load

In [None]:
# Persist the DataFrame and each trained Doc2Vec model for this `test` run.
train_df.to_pickle("../datastore/deep_df.p")

# The keys of `models_by_name` come from str(model); their trailing "t8" appears to
# be the worker count (workers=cores), so a full-key lookup only works on an 8-core
# machine. Matching on the architecture prefix works for any core count.
for key_prefix, path_template in (
        ('Doc2Vec(dm/c,', "../datastore/d2v-dmc_%d.p"),
        ('Doc2Vec(dbow,', "../datastore/d2v-dbow_%d.p"),
        ('Doc2Vec(dm/m,', "../datastore/d2v-dmm_%d.p")):
    matching = [d2v for key, d2v in models_by_name.items() if key.startswith(key_prefix)]
    if len(matching) != 1:
        raise KeyError("expected exactly one model whose name starts with %r, found %d"
                       % (key_prefix, len(matching)))
    matching[0].save(path_template % test)

In [3]:
# Reload the DataFrame and the three Doc2Vec models saved for this `test` run,
# registering each model under a fixed label.
train_df = pd.read_pickle("../datastore/deep_df.p")

models_by_name = OrderedDict([
    ('Doc2Vec(dm/c,d100,n5,w5,mc2,t8)', Doc2Vec.load("../datastore/d2v-dmc_%d.p" % test)),
    ('Doc2Vec(dbow,d100,n5,mc2,t8)', Doc2Vec.load("../datastore/d2v-dbow_%d.p" % test)),
    ('Doc2Vec(dm/m,d100,n5,w10,mc2,t8)', Doc2Vec.load("../datastore/d2v-dmm_%d.p" % test)),
])

In [4]:
# Register two ConcatenatedDoc2Vec wrappers that pair the DBOW model with each DM variant.
dbow_model = models_by_name['Doc2Vec(dbow,d100,n5,mc2,t8)']
for combo_name, dm_key in (('dbow+dmm', 'Doc2Vec(dm/m,d100,n5,w10,mc2,t8)'),
                           ('dbow+dmc', 'Doc2Vec(dm/c,d100,n5,w5,mc2,t8)')):
    models_by_name[combo_name] = ConcatenatedDoc2Vec([dbow_model, models_by_name[dm_key]])

# Select Model

In [5]:
# Pick the model whose document vectors are used by all later cells.
# The commented lines are the alternative (concatenated) choices.
# dm = models_by_name['dbow+dmm']
# dm = models_by_name['dbow+dmc']
dm = models_by_name['Doc2Vec(dm/c,d100,n5,w5,mc2,t8)']

In [6]:
# Convert the selected model's document vectors into a NumPy array; `inp` is the
# input to every clustering and silhouette call below.
# NOTE(review): presumably one row per document in train_df row order — confirm.
doc_arr = dm.docvecs
inp = np.array(doc_arr)

# Clustering

In [None]:
# Clustering estimator class (not an instance) used by the cells below.
# Note that this rebinds `model`, which the vocabulary cell used as a loop variable.
model = KMeans
# model = SphericalKMeans

# Find Best K

In [None]:
# Trackers for the best silhouette score seen so far and the k that produced it.
best_score, best_k = 0.0, 0

In [None]:
# Scan candidate cluster counts and keep the one with the highest silhouette score.
# The trackers are reset here so that re-running this cell never compares against a
# best value left over from an earlier run. Starting from -inf instead of 0.0 lets
# a run whose silhouette scores are all negative still report its best k.
best_score = float('-inf')
best_k = 0
for k in range(50, 60):
    t_km = model(n_clusters=k, n_jobs=-1).fit(inp)
    score = silhouette_score(inp, t_km.labels_)
    if best_score < score:
        best_score = score
        best_k = k
    # A single pre-formatted string: with the Python 2 print statement,
    # print("a", k, ...) would display a tuple instead of a sentence.
    print("In Clusters = %d , Score is : %0.3f" % (k, score))
print("In Clusters = %d , Best score is : %0.3f" % (best_k, best_score))

# Clustering

In [None]:
# Number of clusters for the final clustering run.
# NOTE(review): this is outside the 50-59 range scanned by the best-k loop — confirm it is intentional.
n_cluster = 200

In [None]:
# Instantiate the chosen clustering estimator and fit it on the document vectors.
kmeans_params = {'n_clusters': n_cluster, 'n_jobs': -1}
d_km = model(**kmeans_params)
d_km.fit(inp)

In [None]:
# Store each document's cluster label as a new DataFrame column (and keep the plain list).
train_df['cluster'] = clusters = d_km.labels_.tolist()

# Scoring

In [None]:
# Report the fitted model's inertia and the silhouette score of its labelling.
print("%s %s" % ("inertia : ", d_km.inertia_))
print("%s %s" % ("silhouette score : ", silhouette_score(inp, d_km.labels_)))

In [None]:
# Pass every cluster id to the project's sort_count helper and display what it returns.
cluster_ids = range(n_cluster)
sorted_cluster = cu.sort_count(train_df, cluster_ids)
sorted_cluster

In [None]:
# Cluster id inspected by the next two cells.
target_cluster_idx = 32

In [None]:
# Look up the center article of the inspected cluster and show its index and title.
center_idx = cu.find_center_article(d_km, target_cluster_idx, inp)
center_title = train_df.loc[center_idx].title
print("%s %s" % (center_idx, center_title))

In [None]:
# Select the rows assigned to the inspected cluster, report how many there are
# and list their titles.
in_target = train_df['cluster'] == target_cluster_idx
target_cluster = train_df[in_target]
print("%s %s" % ("size ", len(target_cluster)))
target_cluster['title']

In [None]:
# Spot-check the document-vector similarity of two hand-picked document pairs.
for first_doc, second_doc in ((2183, 2165), (2267, 2328)):
    print(dm.docvecs.similarity(d1=first_doc, d2=second_doc))

In [None]:
# Run the project's similarity check for document 1 against the document vectors.
# NOTE(review): `threadsold` is the keyword as spelled here — presumably a similarity
# threshold; confirm the parameter name in check_utils before renaming anything.
cu.test_similar(1, dm.docvecs, train_df, threadsold=0.5, is_last = False)

# Similarity Clustering

In [None]:
# Run the project's similarity-based clustering over the document vectors.
# NOTE(review): 0.8 is presumably the similarity threshold, and the call presumably
# updates train_df in place (train_df is pickled right after) — confirm in deep_utils.
centers = du.similarity_clustering(train_df, dm.docvecs, 0.8)

In [None]:
# Persist the clustered DataFrame and the similarity-cluster centers.
train_df.to_pickle("../datastore/deep_result_df.p")
# `with` closes the file even when dump raises; the bare open() call never
# closed its handle explicitly.
with open("../datastore/deep_centers.p", "wb") as centers_file:
    pickle.dump(centers, centers_file)

In [7]:
# Reload the clustered DataFrame and the similarity-cluster centers.
train_df = pd.read_pickle("../datastore/deep_result_df.p")
# Only load pickles written by this notebook: unpickling can execute arbitrary code.
# `with` guarantees the file handle is closed after loading.
with open("../datastore/deep_centers.p", "rb") as centers_file:
    centers = pickle.load(centers_file)

# Similarity Scoring

In [8]:
# Per-cluster score table from the project helper; later cells read its
# `cnt`, `cluster`, `distance`, `variance` and `similarity` columns.
score = du.similiarity_iner_score(centers, train_df, dm.docvecs)

In [9]:
# Summarise the score table: clusters with exactly one member versus clusters
# with more than ten, then aggregate the latter.
size_1 = score[score['cnt'] == 1]
countby = score[score['cnt'] > 10]
print("%s %s %s %s %s %s" % ("total:", len(score), ", size_1:", len(size_1),
                             ", countby:", len(countby)))
# Column-wise totals over the large clusters.
ss = countby.sum(axis=0)
print("%s %s" % ("distance:", ss['distance'] * 100))
print("%s %s" % ("variance:", ss['variance']))
# Total similarity divided by the number of large clusters, as a percentage.
print("%s %s" % ("similarity:", (ss['similarity'] * 100) / len(countby)))

total: 997 , size_1: 808 , countby: 22
distance: 21712.8087121
variance: 4.32924858294
similarity: 80.1201240607


In [None]:
# Ten clusters with the lowest similarity.
score.sort_values(by='similarity', ascending=True).head(10)

In [None]:
# Ten clusters with the highest member count.
score.sort_values(by='cnt', ascending=False).head(10)

In [None]:
# Among the large clusters, the ten with the highest similarity.
countby.sort_values(by='similarity', ascending=False).head(10)

# Get Topics

In [15]:
# Extract topics for the ids in countby's `cluster` column (the clusters with cnt > 10).
# The result is indexed by cluster id further down (topics[cluster_idx]).
topics = du.get_all_topics(train_df, countby.cluster.tolist())

Number of cluster : 22
progress - 0 / 22
progress - 1 / 22
progress - 2 / 22
progress - 3 / 22
progress - 4 / 22
progress - 5 / 22
progress - 6 / 22
progress - 7 / 22
progress - 8 / 22
progress - 9 / 22
progress - 10 / 22
progress - 11 / 22
progress - 12 / 22
progress - 13 / 22
progress - 14 / 22
progress - 15 / 22
progress - 16 / 22
progress - 17 / 22
progress - 18 / 22
progress - 19 / 22
progress - 20 / 22
progress - 21 / 22


In [16]:
# Persist the extracted topics; `with` closes the file handle that the bare
# open() call never closed explicitly.
with open("../datastore/deep_topics.p", "wb") as topics_file:
    pickle.dump(topics, topics_file)

In [None]:
# Reload the extracted topics. Only load pickles written by this notebook:
# unpickling can execute arbitrary code. `with` closes the handle afterwards.
with open("../datastore/deep_topics.p", "rb") as topics_file:
    topics = pickle.load(topics_file)

In [11]:
# Display the `cluster` column of the large-cluster rows (cnt > 10).
countby.cluster

1         2
2       111
3        90
4        66
13      162
17       24
18      688
21       42
43      180
44      817
48       80
54      145
55      361
57      210
68      122
69     2145
70     1159
73     1622
85     1026
104     709
106     580
129     252
Name: cluster, dtype: int64

In [14]:
# Print the topic entry stored for one cluster id, then list the titles of the
# articles assigned to that cluster.
cluster_idx = 111
cluster_topics = topics[cluster_idx]
du.topic_print(cluster_topics)
train_df.loc[train_df['cluster'] == cluster_idx, 'title']

 대표NNG 누리NNG 호남NNG
 대표NNG 경제NNG 추NNG대표NNG
 대표NNG 의원NNG 국회NNG


4          [사설] 경제 앞길 막은 야당이 경제 失政 비판할 자격 있나
68       [플라자] 대한민국ROTC중앙회, 나라 사랑 조찬 포럼 개최 외
111                  증세카드 꺼낸 野대표 "법인세 정상화해야"
112               국감에 부를 증인 4100명 넘어 최대규모 될듯
113                      혹평한 국민의黨 "집권당처럼 행동"
114                     호평해준 새누리 "민생 집중한 연설"
115                          '창조경제센터 동물원' 공방
116                  이정현 "김대중 정부때 반대만 한것 사과"
118                 문재인 "한진해운 일시적 국유화까지 검토를"
132                [사설] 여당 대표 입에서도 나온 "國害의원"
134           [류근일 칼럼] 있지만 없는 '제3의 길' '제3지대'
229               DJ·盧에 사과한 與대표 "호남·새누리 손잡자"
230        김재수, 장관 되니 딴소리… "청문회 때 흙수저라 무시당해"
263          [사설] 나라 사방이 다 막혔는데 大選 경쟁은 벌써 열기
280     [TV조선 주중 하이라이트] '정두언·김유정의 이것이 정치다' 외
372         직접 운전·페이스북 동영상… 與 주자들은 조심스레 '꿈틀'
373               오죽했으면… 文, 지지자들에 "선플 좀 답시다"
374           문재인·김부겸, 팬클럽 勢몰이… 안철수, 창조경제 비판
551      [플라자] 한국문학교육학회 창립 20주년 기념 학술대회 개최 외
583                              속으로 웃는 국민의黨
585                      光州로 달려간 野지도부와 대선주자들
587                    국회 정상화에 서청원 의원이 막후 역할
590       