In [26]:
import os
import pandas as pd
import numpy as np

In [27]:
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cluster import KMeans, AgglomerativeClustering

In [28]:
import re

In [29]:
# from nltk.corpus import stopwords
# stops = set(stopwords.words("english"))

In [30]:
CLUSTER_COUNT = 28

In [31]:
n_components = 20
n_features = 10000
verbose = False

In [32]:
METHOD = "bow_svg_kmeans"
DATA_FOLDER = "data/"
OUTPUT_FOLDER = "output/"
TITLE_FILE = DATA_FOLDER + "title_StackOverflow.txt"
CHECK_INDEX_FILE = DATA_FOLDER + "check_index.csv"

In [33]:
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

In [34]:

# from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

# import logging
# from optparse import OptionParser
# import sys
# from time import time

# import numpy as np


# Read and cleaning data

In [35]:
# # substitute symbols with whitespaces, to lowercase, remove stopwords
# def cleanTitle(title):
#     return ' '.join([w for w in re.sub("[^a-zA-Z]", " ", title).lower().split() if not w in stops])

In [36]:
# substitute symbols with whitespaces, to lowercase
def cleanTitle(title):
    return ' '.join(re.sub("[^a-zA-Z]", " ", title).lower().split())

In [37]:
with open(TITLE_FILE) as f:
    titles = f.read().splitlines()

In [38]:
clean_titles = [cleanTitle(title) for title in titles]

In [39]:
clean_titles[:5]

['how do i fill a dataset or a datatable from a linq query resultset',
 'how do you page a collection with linq',
 'best subversion clients for windows vista bit',
 'best practice collaborative environment bin directory svn',
 'visual studio setup project per user registry settings']

# Extract Features

In [40]:

# # Perform an IDF normalization on the output of HashingVectorizer
# hasher = HashingVectorizer(n_features=n_features,
#                            stop_words='english', non_negative=True,
#                            norm=None, binary=False)
# vectorizer = make_pipeline(hasher, TfidfTransformer())

vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = "english",   \
                             max_features = 1000) 

X = vectorizer.fit_transform(clean_titles)

svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

# Clustering

In [41]:

km = KMeans(n_clusters=CLUSTER_COUNT, init='k-means++', max_iter=100, n_init=1,
            verbose=verbose)

km.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=28, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=False)

In [42]:
title_clusters = np.array(km.labels_.tolist())

In [43]:
from scipy.stats import entropy

unique, counts = np.unique(title_clusters, return_counts=True)
print(entropy(counts))
print(np.asarray((unique, counts)).T)

3.27046660749
[[   0  842]
 [   1  561]
 [   2 1144]
 [   3  873]
 [   4  793]
 [   5  823]
 [   6  607]
 [   7  197]
 [   8  820]
 [   9  890]
 [  10  832]
 [  11  753]
 [  12  705]
 [  13  831]
 [  14  820]
 [  15  831]
 [  16  797]
 [  17  856]
 [  18  781]
 [  19  235]
 [  20  759]
 [  21  857]
 [  22  901]
 [  23  665]
 [  24  164]
 [  25  584]
 [  26  753]
 [  27  326]]


# Visualize

In [44]:
# from sklearn.manifold import TSNE
# from sklearn.decomposition import TruncatedSVD, PCA
# import matplotlib.pyplot as plt

In [45]:
# pca = PCA(n_components=2)
# pca.fit(X)
# Y = pca.transform(X)

In [46]:
# plt.scatter(Y[:,0], Y[:,1])
# plt.show()

# output

In [47]:
checkIndexDF = pd.read_csv(CHECK_INDEX_FILE)

In [48]:
checkIndexDF["Ans"] = (title_clusters[checkIndexDF["x_ID"]] == title_clusters[checkIndexDF["y_ID"]]).astype(int)

In [49]:
# checkIndexDF.head(20)

In [50]:
checkIndexDF.to_csv(OUTPUT_FOLDER + METHOD + "_comp25_cluster30_feature1000.csv", columns=["ID", "Ans"], index=False)