In [None]:
# coding: utf-8

import datetime
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from pymongo import MongoClient

import articles_data
import cnouns as cn
from SimilarityClustering import SimilarityClustering

# Connect to the `somanews` MongoDB and grab the collections used below.
#
# SECURITY: the URI, user name and password used to be hard-coded here.
# They can now be supplied through the environment:
#   SOMANEWS_MONGO_URI, SOMANEWS_MONGO_USER, SOMANEWS_MONGO_PASSWORD
# The previous literals are kept only as fallbacks so existing setups keep
# working unchanged.
# TODO: rotate the password and remove the fallbacks from the notebook.
mongo_uri = os.environ.get('SOMANEWS_MONGO_URI', 'mongodb://localhost:27017/somanews')
mongo_user = os.environ.get('SOMANEWS_MONGO_USER', 'ssomanews')
mongo_password = os.environ.get('SOMANEWS_MONGO_PASSWORD', 'ssomanews1029')

client = MongoClient(mongo_uri)
# NOTE(review): Database.authenticate() is deprecated in newer PyMongo 3.x and
# removed in PyMongo 4.0; when upgrading the driver, pass username/password to
# MongoClient instead.
client.somanews.authenticate(mongo_user, mongo_password)
db = client.get_database('somanews')

# Source articles are read from `crawledArticles`; the two b* collections are
# the ones handed to sc.save_to_db.
crawled_collection = db.get_collection('crawledArticles')
clusters_collection = db.get_collection('bclusters')
articles_collection = db.get_collection('barticles')

# All on-disk artefacts live under one directory; the trailing slash is part
# of the prefix, so file names are appended directly.
datastore_dir = "../datastore/"
catelist_path = "%scategory2.p" % datastore_dir
w2v_src_dir = "%sw2v_src4" % datastore_dir
w2v_path = "%ssejongcorpus_w2v4_2.p" % datastore_dir
nnp_dict_path = "%snnps2.p" % datastore_dir
corpus_path = "%scorpus2.p" % datastore_dir

# Reference time of this run; its month and day tag everything saved below.
target_time = datetime.datetime.now()
# To reproduce a past run, pin the date instead, e.g.:
# target_time = datetime.datetime(2016, 11, 19)

# Integer MMDD tag (1119 for 19 November). Being an int, it carries no
# leading zero for January-September (5 January -> 105).
prefix = target_time.month * 100 + target_time.day
prefix_str = "{}_00".format(prefix)

# Build the word set handed to the tokenizer.
# The pickled object is filtered to values greater than 10 and the surviving
# index labels become dictionary words (presumably proper nouns with their
# occurrence counts - confirm against whatever produced nnps2.p).
# NOTE(review): read_pickle executes arbitrary pickled code; only load
# trusted files.
nnp_dict_df = pd.read_pickle(nnp_dict_path)
nnp_dict_df = nnp_dict_df[nnp_dict_df > 10]
nnp_dict = nnp_dict_df.index.tolist()

# Hand-maintained terms added on top of the pickled dictionary.
custom_dict = [
    u'새누리',
    u'새누리당',
    u'더민주',
    u'더민주당',
    u'최순실',
    u'박대통령',
    u'국회의장',
    u'야권의요구',
    u'정기국회',
    u'참여정부',
]
dicts = set(nnp_dict).union(custom_dict)

def tokenizer(inp_str):
    """Tokenize ``inp_str`` via ``cn.custom_pos_tags`` with the module-level ``dicts`` word set.

    This is the callable passed as ``tokenizer=`` to ``SimilarityClustering.train``.
    The return value is whatever ``cn.custom_pos_tags`` produces.
    """
    tagged = cn.custom_pos_tags(inp_str, dicts)
    return tagged

In [None]:
# # Model
# Fetch the training articles relative to `target_time`.
# NOTE(review): the trailing 7 is presumably a look-back window in days - confirm
# against articles_data.find_recent_articles.
train_df = articles_data.find_recent_articles(
    crawled_collection, catelist_path, target_time, 7)

# Keyword settings for training, gathered in one place so they are easy to tune.
train_options = dict(
    path=datastore_dir,
    prefix=prefix_str,
    tokenizer=tokenizer,
    threshold=0.65,
    cnt_threshold=10,
    repeat=3,
    model_name='dbow+dmm'
)
sc = SimilarityClustering()
sc.train("cate", w2v_path, train_df, **train_options)

# # Save
# Persist the trained model under the datastore, then write the clusters and
# articles to their collections.
# sc.iner_score(threshold=0.7, cnt_threshold=8)
sc.save(path=datastore_dir, prefix=prefix_str)
calced_clusters, sort_cdf = sc.save_to_db(
    prefix, clusters_collection, articles_collection, target_time)

In [None]:
# Select the inline matplotlib backend, then call the model's error-rate
# report (presumably draws a figure, hence the backend line - confirm).
%matplotlib inline
sc.print_error_rate()

# Load

In [None]:
# Restore the model saved under today's prefix instead of retraining.
# Run this cell in place of the training cell above; it rebinds `sc`.
sc = SimilarityClustering.load(
    only_d2v=True,
    path=datastore_dir,
    prefix=prefix_str,
    model_name='dbow+dmm')

In [None]:
# Per-cluster similarity for each (threshold, iter) setting, evaluated in the
# same order as before: 0.65/1, 0.65/3, 0.70/1, 0.70/3.
vec651, vec653, vec701, vec703 = [
    sc.get_cluster_similarity(_thr, _iters, datastore_dir, prefix_str)
    for _thr, _iters in [(0.65, 1), (0.65, 3), (0.70, 1), (0.70, 3)]
]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Compare the per-cluster similarity curves of the four (threshold, iter)
# settings, then print a summary table.
similarity_runs = [
    # (plot colour, table name, threshold, iter, per-cluster similarities)
    ('r', "Red Line", 0.65, 1, vec651),
    ('g', "Green Line", 0.65, 3, vec653),
    ('b', "Blue Line", 0.70, 1, vec701),
    ('y', "Yellow Line", 0.70, 3, vec703),
]

plt.xlabel("Cluster")
plt.ylabel("Similarity")
for _color, _name, _threshold, _iters, _vec in similarity_runs:
    plt.plot(range(len(_vec)), _vec, _color,
             label='Threshold: %.2f, iter: %d' % (_threshold, _iters))
# Without this call the labels given to plt.plot are never drawn.
plt.legend()
plt.show()

row_format = "%-13s%-11s%-6s%-17s%s"
print(row_format % ("Color", "Threshold", "iter", "Avg(Similarity)", "Number of clusters"))
print("-" * 65)
for _color, _name, _threshold, _iters, _vec in similarity_runs:
    _count = len(_vec)
    # A setting that produced no clusters has no mean; report "n/a" instead of
    # failing with ZeroDivisionError.
    _avg = ("%.3f" % (sum(_vec) / float(_count))) if _count else "n/a"
    print(row_format % (_name, "%.2f" % _threshold, _iters, _avg, _count))

In [None]:
# Re-evaluate the (threshold 0.65, iter 3) setting.
# NOTE(review): this repeats a call made in the comparison cell and its result
# is not read again in the cells below; presumably it is re-run so that `sc`
# holds this clustering before ranking - confirm whether
# get_cluster_similarity mutates `sc`.
vec653 = sc.get_cluster_similarity(0.65, 3, datastore_dir, prefix_str)

In [None]:
# Settings handed to sc.save_to_db, one entry per ranking column and its
# *Rank counterpart.
# NOTE(review): presumably weights for the final cluster score, with 0
# disabling cohesionRank - confirm against SimilarityClustering.save_to_db.
factor = dict(
    portionRank=1,
    deltaTimeRank=1.5,
    cohesionRank=0,
    portion=10,
    deltaTime=10,
    cohesion=1
)

In [None]:
# Recompute the cluster ranking with the custom `factor` settings.
# NOTE(review): test=True presumably makes this a dry run that skips the
# database writes - confirm in SimilarityClustering.save_to_db.
calced_clusters, sort_cdf = sc.save_to_db(
    prefix, clusters_collection, articles_collection, target_time,
    factor, test=True)

In [None]:
# sc.df_[sc.df_.new_cluster==1119024].title

In [None]:
# Ranking table without the `count` column.
# `axis` is passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
ranks = sort_cdf.drop(['count'], axis=1)
ranks

In [None]:
# Print the ranking for the clusters returned by the save_to_db call above.
sc.print_cluster_rank(calced_clusters)

In [None]:
# Select the inline matplotlib backend, then show clusters via
# sc.print_clusters with top=15, sortby='similarity' and threshold=0.65
# (exact meaning of these options is defined in SimilarityClustering).
%matplotlib inline
sc.print_clusters(top=15, sortby='similarity', threshold=0.65)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Sort every ranking column independently so each curve rises monotonically.
# The x axis is therefore the position within that column's own ordering,
# not a cluster id shared between the curves.
portion = ranks.sort_values('portion')['portion']
deltaTime = ranks.sort_values('deltaTime')['deltaTime']
cohesion = ranks.sort_values('cohesion')['cohesion']
portionRank = ranks.sort_values('portionRank')['portionRank']
deltaTimeRank = ranks.sort_values('deltaTimeRank')['deltaTimeRank']
cohesionRank = ranks.sort_values('cohesionRank')['cohesionRank']

# One figure for the raw values, one for their *Rank counterparts.
# Legend labels match the column names (the deltaTime curve was previously
# labelled 'timeDelta').
for _curves in (
    [(portion, 'r', 'portion'),
     (deltaTime, 'g', 'deltaTime'),
     (cohesion, 'b', 'cohesion')],
    [(portionRank, 'r', 'portionRank'),
     (deltaTimeRank, 'g', 'deltaTimeRank'),
     (cohesionRank, 'b', 'cohesionRank')],
):
    plt.xlabel("Cluster")
    plt.ylabel("Value")
    for _series, _color, _label in _curves:
        plt.plot(range(len(_series)), _series, _color, label=_label)
    plt.legend(bbox_to_anchor=(1, 0.7))
    plt.show()