In [None]:
import numpy as np
import pandas as pd
from konlpy.tag import Mecab
import math
import hanja
import re
import string
import operator
import random
import matplotlib.pyplot as plt
import itertools
import cnouns as cn
import check_utils as cu
import deep_utils as du
from sklearn.metrics import adjusted_rand_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import SpectralClustering
from sklearn.cluster import DBSCAN
from datetime import datetime
from sklearn.decomposition import PCA
from gensim import models
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
import gensim.models.doc2vec
from collections import OrderedDict
from gensim.models.doc2vec import LabeledSentence

import multiprocessing
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
import cPickle as pickle
from spherecluster import SphericalKMeans
# CPU count reported by the OS; passed as `workers` to every Doc2Vec model built below.
cores = multiprocessing.cpu_count()
# Stop right away when gensim reports FAST_VERSION <= -1; per the message, training
# would be very slow in that case (presumably the compiled routines are missing -- confirm).
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

from random import shuffle

from collections import namedtuple

# One training document: its token list (`words`), its document tags (`tags`)
# and the name of the data split it was assigned to (`split`).
Articles = namedtuple('Articles', 'words tags split')

# Load Data

In [None]:
# Dataset selector read by the loading cell:
#   1 -> ../datastore/international.p (plus a literal topic list)
#   2 -> ../datastore/weekly_2.p
test = 2

# Wall-clock bookkeeping: each pipeline stage owns a dict that later receives
# its "start" and "end" timestamps.
times = dict(
    (stage, {})
    for stage in ("preprocessing", "learning", "clustering", "similarity", "topic")
)

In [None]:
# Load the article DataFrame chosen by `test`.
if test == 1:
    # Literal topic names keyed by integer id; in this cell only the number of
    # entries is used (stored as `num_clusters`).
    topics = {
        0: u'올림픽',
        1: u'테러',
        2: u'브렉시트',
        3: u'미국 금리',
        4: u'바이러스',
        5: u'미국대선,힐러리,트럼프',
        6: u'시리아 전쟁, 난민',
    }
    train_df = pd.read_pickle("../datastore/international.p")
    num_clusters = len(topics)
elif test == 2:
    train_df = pd.read_pickle("../datastore/weekly_2.p")
else:
    # Fail here rather than continuing with an undefined `train_df`
    # (or a stale one left behind in the kernel by an earlier run).
    raise ValueError("unknown test id: %r (expected 1 or 2)" % (test,))

# Preprocessing

In [None]:
times["preprocessing"]["start"] = time()

In [None]:
train_df['target_str'] = [cn.tokenize(row.title + " " + row.content) for idx, row in train_df.iterrows()]

In [None]:
size = len(train_df) / 4
print size, len(train_df), size * 4

In [None]:
alldocs = []
for idx, row in train_df.iterrows():
    tokens = row['target_str'].split(' ')
    words = tokens[0:]
    tags = [idx]
    tmp = idx//size % 4
    split = ['train','test','extra','extra'][tmp]  # 25k train, 25k test, 25k extra
    alldocs.append(Articles(words, tags, split))

In [None]:
times["preprocessing"]["end"] = time()

# Learning

In [None]:
times["learning"]["start"] = time()

In [None]:
simple_models = [
    # PV-DM Distributed Momory Model of PV
    # w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW Distributed Bag of Words version of PV
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

In [None]:
# Calls load_word2vec_format on the first model with the file
# ../datastore/sejongcorpus_w2v.p (presumably pre-trained word vectors -- confirm).
# NOTE(review): the return value is discarded. If load_word2vec_format returns a
# new model (as a classmethod would) instead of mutating simple_models[0], this
# call has no effect on the training below -- confirm against the installed gensim.
simple_models[0].load_word2vec_format("../datastore/sejongcorpus_w2v.p")

In [None]:
simple_models[0].build_vocab(alldocs)
print simple_models[0]
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

In [None]:
doc_list = alldocs[:]

alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    for name, train_model in models_by_name.items():
        train_model.alpha, train_model.min_alpha = alpha, alpha
        train_model.train(doc_list)
        print("%i passes : %s" % (epoch + 1, name))

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

In [None]:
times["learning"]["end"] = time()

# Save

In [None]:
# Persist the tokenized frame.
train_df.to_pickle("../datastore/deep_df.p")

# Persist the three trained models, addressed by their position in
# `simple_models` (0: PV-DM/concat, 1: PV-DBOW, 2: PV-DM/mean).
# The str() keys of `models_by_name` end in "t8", which presumably reflects
# workers=cores; looking them up by literal key would raise KeyError on a host
# whose cpu_count() is not 8.
simple_models[0].save("../datastore/d2v-dmc_%d.p" % test)
simple_models[1].save("../datastore/d2v-dbow_%d.p" % test)
simple_models[2].save("../datastore/d2v-dmm_%d.p" % test)

In [None]:
# Register two concatenated models: PV-DBOW paired with each PV-DM variant.
# The component models are taken from `simple_models` by position
# (0: PV-DM/concat, 1: PV-DBOW, 2: PV-DM/mean) because their str() keys embed a
# "t8" suffix that presumably depends on the local CPU count.
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

# Select Model

In [None]:
# Model whose document vectors feed the clustering below. Uncomment one of the
# alternatives to use a concatenated model instead.
# dm = models_by_name['dbow+dmm']
# dm = models_by_name['dbow+dmc']
# PV-DM/concat is the first entry of `simple_models`; selecting it by position
# avoids the str() key, whose "t8" suffix presumably depends on the CPU count.
dm = simple_models[0]

In [None]:
# Document vectors of the selected model, converted to a NumPy array that is
# used as the clustering input below.
# assumes one row per tagged document, in tag order -- TODO confirm
doc_arr = dm.docvecs
inp = np.array(doc_arr)

# Clustering Algorithm

In [None]:
# Clustering estimator class (not an instance); it is instantiated later as
# model(n_clusters=..., n_jobs=-1). This rebinds `model`, which was previously
# used as a loop variable over the Doc2Vec models.
model = KMeans
# Alternative estimator: uncomment to use SphericalKMeans instead.
# model = SphericalKMeans

# Find Best K

In [None]:
# Running best silhouette score and the k that produced it. With the search
# below commented out, both keep these initial values.
best_score = 0.0
best_k = 0

In [None]:
# Disabled search: fits the estimator for k in 50..59, scores each fit with
# silhouette_score and keeps the best-scoring k.
# for k in range(50, 60):
#     t_km = model(n_clusters=k, n_jobs=-1).fit(inp)
#     score = silhouette_score(inp, t_km.labels_)
#     if best_score < score:
#         best_score = score
#         best_k = k
#     print("In Clusters =", k, ", Score is : %0.3f" % score)
# print("In Clusters =", best_k, ", Best score is : %0.3f" % best_score)

# Clustering

In [None]:
# Clustering stage begins.
times["clustering"]["start"] = time()

In [None]:
# Number of clusters requested from the estimator (hard-coded; not derived
# from best_k).
n_cluster = 200

In [None]:
# Instantiate the selected estimator class and fit it on the document vectors.
# NOTE(review): no random_state is passed, so cluster ids may differ between
# runs -- confirm whether the hard-coded cluster/document ids used further down
# are still meaningful after a re-run.
d_km = model(n_clusters=n_cluster, n_jobs=-1)
d_km.fit(inp)

In [None]:
# Attach each document's cluster label to the frame.
# assumes row order of train_df matches row order of inp -- TODO confirm
clusters = d_km.labels_.tolist()
train_df['cluster'] = clusters

In [None]:
# Clustering stage ends.
times["clustering"]["end"] = time()

# Scoring

In [None]:
# Report the fitted model's inertia and the silhouette score of its labelling.
print "inertia : ", d_km.inertia_
print "silhouette score : ", silhouette_score(inp, d_km.labels_)

In [None]:
# cu.sort_count is a project helper called with the frame and all cluster ids;
# presumably it returns the clusters ordered by size -- confirm in check_utils.
sorted_cluster = cu.sort_count(train_df, range(n_cluster))
sorted_cluster

In [None]:
# Cluster picked for manual inspection (hard-coded id).
target_cluster_idx = 32

In [None]:
# cu.find_center_article is a project helper; presumably it returns the index
# label of the article closest to the cluster centre -- confirm in check_utils.
center_idx = cu.find_center_article(d_km, target_cluster_idx, inp)
print center_idx, train_df.loc[center_idx].title

In [None]:
# Show how many articles fall in the inspected cluster, then list their titles.
target_cluster = train_df[train_df.cluster==target_cluster_idx]
print "size ", len(target_cluster)
target_cluster.title

In [None]:
# Spot checks: similarity between two hard-coded pairs of document tags.
print dm.docvecs.similarity(d1=2183, d2=2165)
print dm.docvecs.similarity(d1=2267, d2=2328)

In [None]:
# cu.test_similar is a project helper; `threadsold` is the keyword exactly as
# spelled at this call site (presumably a similarity threshold -- confirm in
# check_utils).
cu.test_similar(1, dm.docvecs, train_df, threadsold=0.5, is_last = False)

# Similarity Clustering

In [None]:
# Similarity-clustering stage begins.
times["similarity"]["start"] = time()

In [None]:
# du.similarity_clustering is a project helper called with the frame, the
# document vectors and 0.8 (presumably a similarity threshold -- confirm in
# deep_utils). Its result is kept as `centers`.
centers = du.similarity_clustering(train_df, dm.docvecs, 0.8)

In [None]:
# Similarity-clustering stage ends.
times["similarity"]["end"] = time()

In [None]:
# Persist the frame and the similarity-clustering result.
train_df.to_pickle("../datastore/deep_result_df.p")
# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
with open("../datastore/deep_centers.p", "wb") as centers_file:
    pickle.dump(centers, centers_file)

# Similarity Scoring

In [None]:
scores = du.similarity_iner_score(centers, train_df, dm.docvecs)

In [None]:
size_1 = scores[scores.cnt==1]
countby = scores[scores.cnt>10]
print "total:", len(scores), ", size_1:",len(size_1), ", countby:", len(countby)
ss = countby.sum(axis=0)
print "distance:", ss['distance'] * 100
print "variance:", ss['variance']
print "similarity:", (ss['similarity'] * 100)/len(countby)

# Get Topics

In [None]:
# Topic-extraction stage begins.
times["topic"]["start"] = time()

In [None]:
# du.get_all_topics is a project helper called with the frame and the cluster
# ids of the cnt > 10 rows. This rebinds `topics`, replacing the literal topic
# dict that the loading cell defines when test == 1.
topics = du.get_all_topics(train_df, countby.cluster.tolist())

In [None]:
# Topic-extraction stage ends.
times["topic"]["end"] = time()

In [None]:
# Persist the extracted topics; the context manager guarantees the file handle
# is flushed and closed even if pickling fails.
with open("../datastore/deep_topics.p", "wb") as topics_file:
    pickle.dump(topics, topics_file)

In [None]:
# Derive each stage's duration from its recorded "start" and "end" timestamps.
for stage_timing in times.values():
    stage_timing["elapsed"] = stage_timing["end"] - stage_timing["start"]

In [None]:
# Persist the per-stage timings; the context manager guarantees the file handle
# is flushed and closed even if pickling fails.
with open("../datastore/deep_times.p", "wb") as times_file:
    pickle.dump(times, times_file)