# Baseline script of SomaNews Clustering

In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Mecab
import hanja
import re
import string
import operator
import random
import matplotlib.pyplot as plt
import itertools
import cnouns
from sklearn.metrics import adjusted_rand_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

## Load Data
Load Data from database

In [2]:
df = pd.read_pickle("../datastore/whole_articles.p")
df = df.drop(['author', 'link', 'imageURL'], axis=1)
df.shape

(120835, 8)

## Pick test set

In [3]:
topics = {0: u'올림픽',
          1: u'테러', 
          2: u'브렉시트', 
#           3: u'남중국해, 사드, 북핵',
          3: u'미국 금리',
          4: u'바이러스', 
          5: u'미국대선,힐러리,트럼프', 
          6: u'시리아 전쟁, 난민', 
#           7: u'미국 금리'
         }
test_set = {}

In [None]:
ts0_1 = df.loc[df.title.str.match(u"(?=.*올림픽)(?!.*장애인)(?!.*평창).*")]
test_set[0] = pd.concat([ts0_1], axis=0)

In [None]:
test_set[1] = pd.concat([df.loc[df.title.str.match(u"(?=.*테러)(?!.*방지법).*")]], axis=0)

In [None]:
ts2_1 = df[df.title.str.match(u"(?=.*브렉시트).*")]
ts2_2 = df[df.title.str.match(u"(?=.*EU)(?=.*탈퇴).*")]
test_set[2] = pd.concat([ts2_1, ts2_2], axis=0)

In [None]:
# ts9_1 = df[df.title.str.match(u"(?=.*남중국해).*")]
# ts9_2 = df[df.title.str.match(u"(?=.*사드)(?=.*일본)(?!.*아사드).*")]
# ts9_3 = df[df.title.str.match(u"(?=.*사드)(?=.*중국)(?!.*아사드).*")]
# ts9_4 = df[df.title.str.match(u"(?=.*사드)(?=.*미국)(?!.*아사드).*")]
# ts9_5 = df[df.title.str.match(u"(?=.*사드)(?=.*시진핑)(?!.*아사드).*")]
# ts9_6 = df[df.title.str.match(u"(?=.*사드)(?=.*오바마)(?!.*아사드).*")]
# ts9_7 = df[df.title.str.match(u"(?=.*북핵).*")]
# test_set[3] = pd.concat([ts9_1, ts9_2, ts9_3, ts9_4, ts9_5, ts9_6, ts9_7], axis=0)

In [None]:
ts4_1 = df[df.title.str.match(u"(?=.*지카).*")]
ts4_2 = df[df.title.str.match(u"(?=.*메르스).*")]
ts4_3 = df[df.title.str.match(u"(?=.*에볼라).*")]
ts4_4 = df[df.title.str.match(u"(?=.*바이러스).*")]
test_set[4] = pd.concat([ts4_1, ts4_2, ts4_3, ts4_4], axis=0)

In [None]:
ts5_1 = df[df.title.str.match(u"(?=.*힐러리).*")]
ts5_2 = df[df.title.str.match(u"(?=.*트럼프).*")]
ts5_3 = df[df.title.str.match(u"(?=.*클린턴).*")]
ts5_4 = df[df.title.str.match(u"(?=.*도날드)(?!.*맥도날드).*")]
test_set[5] = pd.concat([ts5_1, ts5_2, ts5_3, ts5_4], axis=0)

In [None]:
ts6_1 = df[df.title.str.match(u"(?=.*시리아).*")]
ts6_2 = df[df.title.str.match(u"(?=.*알레포).*")]
ts6_3 = df[df.title.str.match(u"(?=.*아사드).*")]
ts6_4 = df[df.title.str.match(u"(?=.*정부군).*")]
ts6_5 = df[df.title.str.match(u"(?=.*반군).*")]
ts3_6 = df[df.title.str.match(u"(?=.*난민).*")]
test_set[6] = pd.concat([ts6_1, ts6_2, ts6_3, ts6_4, ts6_5, ts3_6], axis=0)

In [None]:
ts8_1 = df[df.title.str.match(u"(?=.*연준).*")]
# ts8_2 = df[df.title.str.match(u"(?=.*양적완화)(?!.*한국).*")]
ts8_3 = df[df.title.str.match(u"(?=.*금리)(?=.*인상).*")]
ts8_4 = df[df.title.str.match(u"(?=.*기준)(?=.*금리).*")]
ts8_5 = df[df.title.str.match(u"(?=.*옐런).*")]
ts8_6 = df[df.title.str.match(u"(?=.*금리)(?=.*동결).*")]
ts8_7 = df[df.title.str.match(u"(?=.*금리)(?=.*경제).*")]
test_set[3] = pd.concat([ts8_1, ts8_3, ts8_4, ts8_5, ts8_6, ts8_7], axis=0)

In [None]:
# ts7_1 = df[df.title.str.match(u"(?=.*IS).*")]
# ts7_2 = df[df.title.str.match(u"(?=.*극단주의).*")]
# ts7_3 = df[df.title.str.match(u"(?=.*외로운 늑대).*")]
# test_set[7] = pd.concat([ts7_1, ts7_2, ts7_3], axis=0)

In [None]:
# ts9_1 = df[df.title.str.match(u"(?=.*남중국해).*")]
# ts9_2 = df[df.title.str.match(u"(?=.*사드)(?=.*일본)(?!.*아사드).*")]
# ts9_3 = df[df.title.str.match(u"(?=.*사드)(?=.*중국)(?!.*아사드).*")]
# ts9_4 = df[df.title.str.match(u"(?=.*사드)(?=.*미국)(?!.*아사드).*")]
# ts9_5 = df[df.title.str.match(u"(?=.*사드)(?=.*시진핑)(?!.*아사드).*")]
# ts9_6 = df[df.title.str.match(u"(?=.*사드)(?=.*오바마)(?!.*아사드).*")]
# ts9_7 = df[df.title.str.match(u"(?=.*북핵).*")]
# test_set[9] = pd.concat([ts9_1, ts9_2, ts9_3, ts9_4, ts9_5, ts9_6, ts9_7], axis=0)

In [None]:
for i in range(0, len(topics)):
    test_set[i]['topic'] = topics[i]
    test_set[i]['topic_idx'] = i
#     test_set[i] = test_set[i].sample(n=150)
    
test_set[5] = test_set[5].sample(n=500)

train_df = pd.concat([ts for ts in test_set.values()], axis=0)
# train_df

## Preprocessing
1. Remove stopwords (regex, hanja)
2. POS Tagging with KoNLPy, Mecab

In [None]:
def text_cleaning(text):
    text = hanja.translate(text, 'substitution')
    text = re.sub(u'(\[.*\]|\(.*\))', '', text)
    text = re.sub(u'(\(|\)|\[|\])', '', text)
    return text

In [None]:
train_df['clean_title'] = [text_cleaning(t) for t in train_df.title]
train_df['clean_content'] = [text_cleaning(t) for t in train_df.content]

In [None]:
train_df['tokenized_title'] = [cnouns.tokenize(t) for t in train_df.clean_title]
train_df['tokenized_content'] = [cnouns.tokenize(t) for t in train_df.clean_content]

## Save/Load Preprocessed data

In [None]:
train_df.to_pickle("../datastore/preprocesse.p")

In [4]:
train_df = pd.read_pickle("../datastore/w-preprocesse.p")

In [5]:
train_df.head

<bound method DataFrame.head of                              _id                                category  \
0       57e2c716149c2181df5b8b95                               사회 > 카드뉴스   
1       57e2c716149c2181df5b8b96                               사회 > 카드뉴스   
2       57e2c716149c2181df5b8b97                               사회 > 카드뉴스   
3       57e2c716149c2181df5b8b98                             문화 > 오늘의 운세   
4       57e2c723149c2181df5b8bad                       스포츠ㆍ연예 > 스포츠 > 종합   
5       57e2c716149c2181df5b8b99                         nativeAD > 기업뉴스   
6       57e2c717149c2181df5b8b9a                       사설ㆍ칼럼 > 내부칼럼 > 사설   
7       57e2c717149c2181df5b8b9b                       사설ㆍ칼럼 > 내부칼럼 > 사설   
8       57e2c717149c2181df5b8b9c                       사설ㆍ칼럼 > 내부칼럼 > 사설   
9       57e2c717149c2181df5b8b9d  사설ㆍ칼럼 > 외부칼럼 > 시론ㆍ기고 > 朝鮮칼럼 The Column   
10      57e2c717149c2181df5b8b9e                      사설ㆍ칼럼 > 내부칼럼 > 만물상   
11      57e2c721149c2181df5b8b9f                 사설ㆍ칼럼 >

## Training

In [None]:
vectorizer = TfidfVectorizer(lowercase=False, 
                             max_features=None,
                             max_df=1.0, 
                             min_df=1,
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)

In [None]:
x_list = vectorizer.fit_transform(train_df.tokenized_title + train_df.tokenized_content)

In [None]:
print x_list.shape
print train_df.shape

In [None]:
x_list[0]

In [None]:
len(x_list.toarray())
print(dict(zip(vectorizer.get_feature_names(), vectorizer._tfidf.idf_)))
# x_list.stop_words()

### Basic Models
1. Tf-idf and Cosine similarity
2. K-Means Algorithm

In [None]:
def cosine_distance(x_list):
    dist = 1 - cosine_similarity(x_list)
    return dist

In [None]:
num_clusters = len(topics)
num_clusters

In [None]:
t0 = time()
km = KMeans(n_clusters=num_clusters, n_jobs=-1)
km.fit(x_list)
clusters = km.labels_.tolist()
print("Done in %0.3fs." % (time() - t0))

In [None]:
print(len(clusters), len(train_df))

In [None]:
train_df['cluster'] = clusters

In [None]:
def match_cluster_topic(is_cluster):
    if(is_cluster):
        print("Cluster -> Topic")
        criteria = 'cluster'
        target = 'topic_idx'
    else:
        print("Topic -> Cluster")
        criteria = 'topic_idx'
        target = 'cluster'
        
    total_doc = 0    
    total_accuracy = 0    
    for i in range(0, num_clusters):
        criteria_set = train_df[train_df[criteria]==i]
        target_count = {}
        for j in range(0, num_clusters):
            target_set = criteria_set[criteria_set[target]==j]
            target_count[j] = len(target_set)
        max_target_idx = max(target_count.iteritems(), key=operator.itemgetter(1))[0]
        accuracy = 100*target_count[max_target_idx]/float(len(criteria_set))
        total_accuracy = total_accuracy + accuracy
        if(is_cluster):
            topic_str = topics[max_target_idx]
        else:
            topic_str = topics[i]
        print("#%d -> #%d Accuracy is %.4d/%.4d = %.10f \t %s" % (i, max_target_idx, target_count[max_target_idx], len(criteria_set), accuracy, topic_str))
        total_doc = total_doc + target_count[max_target_idx]
        
    print("%.4f" % (total_accuracy/num_clusters))
    print("%.4f" % (100 * total_doc/len(train_df)))

In [None]:
for idx in topics:
    topic = topics[idx]
    print("%.4d - %s" % (len(train_df[train_df.topic==topic]), topic)) 

In [None]:
match_cluster_topic(True)

In [None]:
match_cluster_topic(False)

In [None]:
# print_top_words(km, vectorizer.get_feature_names(), 20)

In [None]:
adjusted_rand_score(train_df.topic, km.labels_)

In [None]:
skf = StratifiedKFold(train_df.topic, n_folds=3)
cross_val_score(km, x_list, cv=skf)

In [None]:
df = train_df.drop(['_id', 'content', 'description', 'provider', 'providerNewsID', 'publishedAt'], axis=1)
two = df[df.cluster==3]
# two[two.topic_idx==4]
for idx in topics:
    print topics[idx], len(two[two.topic_idx==idx])
# two

In [None]:
two

In [None]:
cartesian = itertools.product(range(num_clusters), range(num_clusters))

temp = {
    'cluster': [],
    'topic_idx': [],
    'counts': []
}
for c, t in cartesian:
    clusters = df[df.cluster==c]
    topics = clusters[clusters.topic_idx==t]
    temp['cluster'].append(c)
    temp['topic_idx'].append(t)
    temp['counts'].append(len(topics))
    
results = pd.DataFrame(temp)
results = results[results.counts!=0]

In [None]:
# results[results.counts!=0]

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(results.topic_idx, results.cluster, 'ro')
for index, row in results.iterrows():
    x = row['topic_idx']
    y = row['cluster']
    ax.annotate('  %d' % row['counts'], xy=(x,y), textcoords='data')
plt.axis([-1, 7, -1, 7])
plt.xlabel('Topic')
plt.ylabel('Cluster')

plt.grid()
plt.show()

## Deep Learning

In [6]:
from gensim import models
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

from random import shuffle

from collections import namedtuple

Articles = namedtuple('Articles', 'words tags split')

In [8]:
size = len(train_df) / 4
print size, len(train_df), size * 4

30208 120835 120832


In [9]:
alldocs = []
for idx, row in train_df.iterrows():
    tokens = row['tokenized_title'].split(' ') + row['tokenized_content'].split(' ')
    words = tokens[1:] # why start from 1?
    tags = [idx] # `tags = [tokens[0]]` would also work at extra memory cost
    tmp = idx//size % 4
    split = ['train','test','extra','extra'][tmp]  # 25k train, 25k test, 25k extra
#     sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][tmp] # [12.5K pos, 12.5K neg]*2 then unknown
    alldocs.append(Articles(words, tags, split))

In [10]:
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # for reshuffling per pass

print('%d docs: %d train, %d test' % (len(doc_list), len(train_docs), len(test_docs)))

120835 docs: 30211 train, 30208 test


In [11]:
words = [x.words for x in alldocs]
w2v = Word2Vec(words, size=100, window=5, min_count=5, workers=4)
w2v.save_word2vec_format("../datastore/w-w2v.p")

In [None]:
simple_models = [
    # PV-DM Distributed Momory Model of PV
    # w/concatenation - window=5 (both sides) approximates paper's 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DBOW Distributed Bag of Words version of PV
    Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=2, workers=cores),
    # PV-DM w/average
    Doc2Vec(dm=1, dm_mean=1, size=100, window=10, negative=5, hs=0, min_count=2, workers=cores),
]

In [None]:
simple_models[0].load_word2vec_format("../datastore/w-w2v.p")

In [None]:
# speed setup by sharing results of 1st model's vocabulary scan
simple_models[0].build_vocab(alldocs)  # PV-DM/concat requires one special NULL word so it serves as template

In [None]:
print simple_models[0]
for model in simple_models[1:]:
    model.reset_from(simple_models[0])
    print(model)

models_by_name = OrderedDict((str(model), model) for model in simple_models)

In [None]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[2]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[1], simple_models[0]])

In [None]:
alpha, min_alpha, passes = (0.025, 0.001, 20)
alpha_delta = (alpha - min_alpha) / passes

for epoch in range(passes):
    shuffle(doc_list)  # shuffling gets best results

    for name, train_model in models_by_name.items():
        train_model.alpha, train_model.min_alpha = alpha, alpha
        train_model.train(doc_list)
        print("%i passes : %s" % (epoch + 1), name)

    print('completed pass %i at alpha %f' % (epoch + 1, alpha))
    alpha -= alpha_delta

In [None]:
for name, train_model in models_by_name.items():
    train_model.save("../datastore/" + name)
    print "saved " + name