# Baseline script of SomaNews Clustering

In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Mecab
import hanja
import re
import string
import operator
import random
import matplotlib.pyplot as plt
import itertools
import cnouns
from sklearn.metrics import adjusted_rand_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score

## Load Data
Load the article data from the pickled DataFrame under `../datastore`.

In [2]:
# Read the pickled articles and drop the author, link and imageURL columns.
df = pd.read_pickle("../datastore/whole_articles.p").drop(
    ['author', 'link', 'imageURL'], axis=1)
# Cell output: (rows, columns) of the remaining frame.
df.shape

(120835, 8)

In [3]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# columns added to train_df in later cells also appear on df.
train_df = df

## Preprocessing
1. Clean the text (convert hanja, strip bracketed text with regexes)
2. POS Tagging with KoNLPy, Mecab

In [4]:
# One innermost bracket group: "[...]" or "(...)" with no bracket of the same
# kind (and no newline) inside it.
_BRACKET_GROUP = re.compile(r'\[[^\[\]\n]*\]|\([^()\n]*\)')


def text_cleaning(text):
    """Normalise raw article text before tokenisation.

    The text is first passed through ``hanja.translate`` in 'substitution'
    mode. Every ``[...]`` / ``(...)`` group is then removed together with its
    contents, and finally any unpaired bracket characters are dropped.

    Groups are removed innermost-first and one at a time. A single greedy
    ``\\[.*\\]`` would instead match from the first opening bracket to the
    last closing bracket on a line, deleting the ordinary text that sits
    between two separate groups (e.g. "[A] headline [B]" would become "").

    Args:
        text: the raw title or article body.

    Returns:
        The cleaned text.
    """
    text = hanja.translate(text, 'substitution')
    # Repeat until no group is left so nested groups such as "(a (b) c)" are
    # removed completely; each pass strictly shortens the text.
    while True:
        text, removed = _BRACKET_GROUP.subn('', text)
        if not removed:
            break
    # Drop bracket characters that had no matching partner.
    text = re.sub(r'[()\[\]]', '', text)
    return text

In [5]:
# Run text_cleaning over the raw title and content, storing each result in a
# new "clean_*" column.
for raw_col, clean_col in (('title', 'clean_title'), ('content', 'clean_content')):
    train_df[clean_col] = [text_cleaning(t) for t in train_df[raw_col]]

In [6]:
# Tokenize the cleaned title and content with cnouns, storing each result in
# a new "tokenized_*" column.
for clean_col, token_col in (('clean_title', 'tokenized_title'),
                             ('clean_content', 'tokenized_content')):
    train_df[token_col] = [cnouns.tokenize(t) for t in train_df[clean_col]]

## Save/Load Preprocessed data

In [7]:
# Save the DataFrame, including the cleaned and tokenized columns, to disk.
train_df.to_pickle("../datastore/w-preprocesse.p")

In [2]:
# Reload a preprocessed DataFrame from disk.
# NOTE(review): this reads "preprocesse.p", while the save cell writes
# "w-preprocesse.p" -- confirm which file is intended.
train_df = pd.read_pickle("../datastore/preprocesse.p")

## Training

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from time import time
from gensim import models

In [4]:
# Build TF-IDF features from each article's tokenized title + content.
# lowercase=False leaves the token casing untouched.
vectorizer = TfidfVectorizer(lowercase=False)
# title_flat = [item for sublist in titles for item in sublist]
# NOTE(review): `+` combines the two columns element-wise; if both hold
# strings, no separator is inserted between title and content -- confirm the
# last title token and the first content token are not fused together.
x_list = vectorizer.fit_transform(train_df.tokenized_title + train_df.tokenized_content)

In [None]:
# Show the fitted TF-IDF matrix as the cell output.
x_list

In [None]:
# Number of documents (rows). Read it from the shape: converting the TF-IDF
# matrix to a dense array just to count rows would allocate
# n_documents x n_features values.
x_list.shape[0]
# Map every vocabulary term to its IDF weight, using the vectorizer's public
# `idf_` attribute rather than reaching into the private `_tfidf` member.
print(dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)))
# x_list.stop_words()

### Basic Models
1. Tf-idf and Cosine similarity
2. K-Means Algorithm

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [None]:
def cosine_distance(x_list):
    """Return the pairwise cosine distance, i.e. 1 - cosine_similarity(x_list)."""
    similarity = cosine_similarity(x_list)
    return 1 - similarity

In [None]:
# Use as many clusters as there are topics.
# NOTE(review): `topics` is not defined in any cell shown here -- confirm
# where it is created before running this cell.
num_clusters = len(topics)
num_clusters

In [None]:
# Fit K-Means on the TF-IDF matrix, keep the per-document labels as a plain
# list, and report the wall-clock time taken.
t0 = time()
km = KMeans(n_clusters=num_clusters, n_jobs=-1).fit(x_list)
clusters = km.labels_.tolist()
elapsed = time() - t0
print("Done in %0.3fs." % elapsed)

In [None]:
# Sanity check: the number of cluster labels next to the number of rows.
print(len(clusters), len(train_df))

In [None]:
# Attach each document's K-Means label as a new "cluster" column.
train_df['cluster'] = clusters

In [None]:
def match_cluster_topic(is_cluster):
    """Print how well clusters and topics line up with each other.

    For every value ``i`` in ``range(num_clusters)`` of the *criteria* column,
    find the value of the *target* column that occurs most often among those
    rows and report its share of the group as a percentage.

    Args:
        is_cluster: if truthy, group by ``cluster`` and look for the dominant
            ``topic_idx``; otherwise group by ``topic_idx`` and look for the
            dominant ``cluster``.

    Reads the notebook globals ``train_df``, ``topics`` and ``num_clusters``.
    Prints one line per group, then the mean of the per-group percentages,
    then the overall percentage of documents that fall in their group's
    dominant target. Returns ``None``.
    """
    if is_cluster:
        print("Cluster -> Topic")
        criteria, target = 'cluster', 'topic_idx'
    else:
        print("Topic -> Cluster")
        criteria, target = 'topic_idx', 'cluster'

    total_doc = 0
    total_accuracy = 0.0
    for i in range(num_clusters):
        criteria_set = train_df[train_df[criteria] == i]
        target_count = {
            j: len(criteria_set[criteria_set[target] == j])
            for j in range(num_clusters)
        }
        # Works on Python 2 and 3 alike (dict.iteritems() is Python 2 only).
        max_target_idx = max(target_count, key=target_count.get)
        best_count = target_count[max_target_idx]
        group_size = len(criteria_set)
        # An empty group would otherwise raise ZeroDivisionError.
        accuracy = 100.0 * best_count / group_size if group_size else 0.0
        total_accuracy += accuracy

        topic_str = topics[max_target_idx] if is_cluster else topics[i]
        print("#%d -> #%d Accuracy is %.4d/%.4d = %.10f \t %s" % (i, max_target_idx, best_count, group_size, accuracy, topic_str))
        total_doc += best_count

    print("%.4f" % (total_accuracy / num_clusters))
    # 100.0 forces true division; with ints Python 2 would truncate the result.
    print("%.4f" % (100.0 * total_doc / len(train_df)))

In [None]:
# Print the number of documents carrying each topic label.
for idx in topics:
    topic = topics[idx]
    topic_rows = train_df[train_df.topic == topic]
    doc_count = len(topic_rows)
    print("%.4d - %s" % (doc_count, topic))

In [None]:
# For each cluster, report its most frequent topic.
match_cluster_topic(True)

In [None]:
# For each topic, report its most frequent cluster.
match_cluster_topic(False)

In [None]:
# print_top_words(km, vectorizer.get_feature_names(), 20)

In [None]:
# Adjusted Rand score between the topic labels and the K-Means labels.
adjusted_rand_score(train_df.topic, km.labels_)

In [None]:
# 3-fold split stratified on the topic labels, then score K-Means per fold.
# NOTE(review): no y and no scoring argument are passed to cross_val_score,
# so the metric is presumably the estimator's own score -- confirm this is
# the intended measure.
skf = StratifiedKFold(train_df.topic, n_folds=3)
cross_val_score(km, x_list, cv=skf)

In [None]:
# Drop the bulky columns before inspecting cluster membership.
df = train_df.drop(['_id', 'content', 'description', 'provider', 'providerNewsID', 'publishedAt'], axis=1)
# Rows assigned to cluster 3 (the name `two` is kept because the next cell
# displays it).
two = df[df.cluster==3]
# two[two.topic_idx==4]
# Per-topic document counts inside that cluster. print() is called with one
# pre-formatted string so the cell runs under both Python 2 and Python 3; the
# output is the same "<topic> <count>" line the print statement produced.
for idx in topics:
    print("%s %d" % (topics[idx], len(two[two.topic_idx==idx])))
# two

In [None]:
# Display the rows selected into `two` as the cell output.
two

In [None]:
# Count the documents for every (cluster, topic_idx) pair.
cartesian = itertools.product(range(num_clusters), range(num_clusters))

temp = {
    'cluster': [],
    'topic_idx': [],
    'counts': []
}
for c, t in cartesian:
    # The intermediate frame gets its own name: rebinding the notebook globals
    # `clusters` (label list) or `topics` (topic lookup) here would break
    # match_cluster_topic and the other cells that read them if they are
    # re-run after this cell.
    pair_rows = df[(df.cluster == c) & (df.topic_idx == t)]
    temp['cluster'].append(c)
    temp['topic_idx'].append(t)
    temp['counts'].append(len(pair_rows))

results = pd.DataFrame(temp)
# Keep only the pairs that actually occur.
results = results[results.counts!=0]

In [None]:
# results[results.counts!=0]

In [None]:
# Scatter the non-empty (topic, cluster) pairs and label each point with its
# document count.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(results.topic_idx, results.cluster, 'ro')
for index, row in results.iterrows():
    ax.annotate('  %d' % row['counts'],
                xy=(row['topic_idx'], row['cluster']),
                textcoords='data')
ax.axis([-1, 7, -1, 7])
ax.set_xlabel('Topic')
ax.set_ylabel('Cluster')

ax.grid()
plt.show()