In [89]:
import os
import pandas as pd
import numpy as np

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [9]:
import re

In [14]:
# from nltk.corpus import stopwords
# stops = set(stopwords.words("english"))

In [46]:
CLUSTER_COUNT = 20

In [88]:
METHOD = "tfidf_kmeans"
DATA_FOLDER = "data/"
OUTPUT_FOLDER = "output/"
TITLE_FILE = DATA_FOLDER + "title_StackOverflow.txt"
CHECK_INDEX_FILE = DATA_FOLDER + "check_index.csv"

In [90]:
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

# Read and cleaning data

In [22]:
# # substitute symbols with whitespaces, to lowercase, remove stopwords
# def cleanTitle(title):
#     return ' '.join([w for w in re.sub("[^a-zA-Z]", " ", title).lower().split() if not w in stops])

In [None]:
# substitute symbols with whitespaces, to lowercase
def cleanTitle(title):
    return re.sub("[^a-zA-Z]", " ", title).lower()

In [18]:
with open(TITLE_FILE) as f:
    titles = f.read().splitlines()

In [33]:
len(titles)

20000

In [19]:
titles[:5]

['How do I fill a DataSet or a DataTable from a LINQ query resultset ?',
 'How do you page a collection with LINQ?',
 'Best Subversion clients for Windows Vista (64bit)',
 'Best Practice: Collaborative Environment, Bin Directory, SVN',
 'Visual Studio Setup Project - Per User Registry Settings']

In [23]:
title_words = [titleToMeaningfulWords(title) for title in titles]

In [24]:
title_words[:5]

[['fill', 'dataset', 'datatable', 'linq', 'query', 'resultset'],
 ['page', 'collection', 'linq'],
 ['best', 'subversion', 'clients', 'windows', 'vista', 'bit'],
 ['best',
  'practice',
  'collaborative',
  'environment',
  'bin',
  'directory',
  'svn'],
 ['visual',
  'studio',
  'setup',
  'project',
  'per',
  'user',
  'registry',
  'settings']]

In [37]:
clean_titles = [' '.join(s) for s in title_words]

In [38]:
clean_titles[:5]

['fill dataset datatable linq query resultset',
 'page collection linq',
 'best subversion clients windows vista bit',
 'best practice collaborative environment bin directory svn',
 'visual studio setup project per user registry settings']

# Clustering

In [96]:
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = "english",   \
                             max_features = 5000) 

In [30]:
train_data_features = vectorizer.fit_transform(title_words_flat)

In [32]:
train_data_features.shape

(111095, 5000)

In [39]:
vectorizer = TfidfVectorizer(analyzer="word", stop_words='english')

In [97]:
title_vecs = vectorizer.fit_transform(clean_titles)

In [98]:
title_vecs.shape

(20000, 5000)

In [99]:
model = KMeans(n_clusters=CLUSTER_COUNT, init='k-means++', max_iter=100, n_init=1)
model.fit(title_vecs)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [100]:
title_clusters = np.array(model.labels_.tolist())

In [101]:
title_clusters.shape

(20000,)

# output

In [54]:
checkIndexDF = pd.read_csv(CHECK_INDEX_FILE)

In [57]:
checkIndexDF.head()

Unnamed: 0,ID,x_ID,y_ID,same_tag
0,0,11726,1565,0
1,1,16528,7523,0
2,2,16683,19401,0
3,3,6506,5239,0
4,4,10279,19928,0


In [102]:
checkIndexDF["Ans"] = (title_clusters[checkIndexDF["x_ID"]] == title_clusters[checkIndexDF["y_ID"]]).astype(int)

In [103]:
checkIndexDF.head(20)

Unnamed: 0,ID,x_ID,y_ID,same_tag,x_ID_tag,Ans
0,0,11726,1565,0,0,0
1,1,16528,7523,0,14,0
2,2,16683,19401,0,0,0
3,3,6506,5239,0,19,0
4,4,10279,19928,0,0,0
5,5,19846,2428,0,7,0
6,6,3532,18102,0,8,0
7,7,11502,9753,1,0,1
8,8,13470,18420,0,0,0
9,9,4867,4793,0,0,0


In [104]:
checkIndexDF.to_csv(OUTPUT_FOLDER + METHOD + ".csv", columns=["ID", "Ans"], index=False)