# News Clustering using KMeans Algorithm

In [1]:
import numpy as np
import pandas as pd
from time import time
from pymongo import MongoClient

In [2]:
# Wall-clock start time in seconds since the epoch (time.time()).
# NOTE(review): t0 is not read in the visible cells -- presumably used later to
# report elapsed time; confirm.
t0 = time()

## Load data from MongoDB 

In [73]:
# Connect with the credentials embedded in the URI; the database named in the
# URI path ('somanews') is the one the user is authenticated against.
# Database.authenticate() is deprecated and was removed in PyMongo 4, so the
# separate authenticate() call is replaced by URI-based authentication.
# NOTE(review): the username/password are hard-coded in the source -- consider
# loading them from the environment or a config file kept out of version control.
client = MongoClient('mongodb://ssomanews:ssomanews1029@localhost:27017/somanews')
db = client.get_database('somanews')

# Pull every document of the 'articles' collection into a DataFrame.
articles = db.get_collection('articles')
train = pd.DataFrame(list(articles.find()))

## Group by cluster

In [4]:
import datetime
import functools
import operator

In [5]:
def avg_datetime(series):
    """Return the mean of the datetime values in ``series``.

    Datetimes cannot be added to each other, so the offsets from the earliest
    value are summed and averaged, and that mean offset is added back to the
    earliest value.
    """
    earliest = series.min()
    offsets = []
    for stamp in series:
        offsets.append(stamp - earliest)
    total_offset = functools.reduce(operator.add, offsets)
    mean_offset = total_offset / len(offsets)
    return earliest + mean_offset

In [6]:
# Print one line per cluster: label, group.size, and mean publication time.
# NOTE(review): for a DataFrame, .size is rows x columns (total cells), so the
# middle number is not the article count -- use len(group) if a row count is
# what is wanted.
for name, group in train.groupby('cluster'):
    print(name, group.size, avg_datetime(group.publishedAt))

3 238 2016-11-09 12:03:49.529411764
4 70 2016-11-09 04:38:36
12 56 2016-11-08 22:44:45.250000
13 182 2016-11-10 04:30:18.461538461
16 154 2016-11-10 00:36:38.181818181
17 224 2016-11-08 13:28:52.625000
20 196 2016-11-10 15:43:38.571428571
21 140 2016-11-09 18:19:30.200000
22 42 2016-11-08 01:46:40.333333333
23 168 2016-11-11 11:32:50
24 392 2016-11-09 16:25:36.642857142
25 70 2016-11-08 21:56:36


In [31]:
# Hard-coded list of 12 float scores; it is passed to save_to_db() below as the
# `cohesions` argument, where entry i is attached to the i-th cluster.
test = [0.71706071666708804,
 0.3529872074112238,
 0.68909809900922037,
 0.48477673033979296,
 0.59294147267543074,
 0.63600523810705734,
 0.64057083464699427,
 0.60478776979334714,
 0.89184765048088543,
 0.37862157423502785,
 0.42959035257638323,
 0.4260801989923223]

In [8]:
import ntc_rank

In [9]:
def get_target_cate():
    """Return a fresh list of the category labels counted for each cluster."""
    target_categories = [
        u"정치",
        u"사회",
        u"과학",
        u"경제",
    ]
    return target_categories

In [71]:
def save_to_db(train, prefix, collections, cohesions):
    """Build one summary document per cluster and insert them all at once.

    Args:
        train: DataFrame of articles. The 'cluster', 'cate', 'imageURL' and
            'publishedAt' columns are read.
        prefix: Number multiplied by 1000 and added to each cluster label to
            form the stored cluster id.
        collections: Object exposing ``insert_many`` (presumably a pymongo
            collection -- confirm against the caller).
        cohesions: Per-cluster cohesion values, looked up by position in the
            order in which ``train.groupby('cluster')`` yields the groups.
            A sequence shorter than the number of clusters makes the lookup
            fail.
    """
    # Named clustered_at so the local does not shadow the imported time().
    clustered_at = datetime.datetime.now()
    id_offset = prefix * 1000
    target_cates = get_target_cate()

    clusters = []
    for i, (name, group) in enumerate(train.groupby('cluster')):
        articles = [row.to_dict() for _, row in group.iterrows()]

        # Number of articles per target category (0 when none match).
        cates = {
            cate: sum(1 for article in articles if article['cate'] == cate)
            for cate in target_cates
        }

        # Start from the first article, then switch to any article that has
        # an image and was published strictly earlier than the current pick.
        leading = articles[0]
        for article in articles:
            if article['imageURL'] != '':
                if (leading['publishedAt'] - article['publishedAt']).total_seconds() > 0:
                    leading = article

        clusters.append({
            "cluster": str(id_offset + name),
            "cohesion": cohesions[i],
            # Count rows (articles). DataFrame.size would be rows x columns
            # and overstate the count by the number of columns.
            "count": len(articles),
            "cate": cates,
            "leading": leading,
            "clusteredAt": clustered_at,
            "articles": articles,
        })

    # Whatever calc_issue_rank returns is what gets inserted.
    clusters = ntc_rank.calc_issue_rank(clusters)
    collections.insert_many(clusters)

In [72]:
# Store the clusters with prefix 1108 (cluster ids become 1108000 + label) and
# the hard-coded `test` scores as cohesions.
# NOTE(review): `collections` is not defined in the visible cells -- presumably
# a MongoDB collection handle created in a cell that is not shown; confirm.
save_to_db(train, 1108, collections, test)

In [57]:
# Close the MongoClient created in the data-loading cell.
client.close()