In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [3]:
import re

In [4]:
from gensim.models import word2vec

In [5]:
# from nltk.corpus import stopwords
# stops = set(stopwords.words("english"))

In [6]:
# Target number of clusters. Presumably consumed by a KMeans step later in the
# notebook; it is not referenced in the cells visible here (TODO confirm).
CLUSTER_COUNT = 20

In [7]:
# Label for the embedding method (not referenced in the cells visible here).
METHOD = "word2vec"

# Working directories, relative to the notebook. The trailing slash is kept so
# the values also work with plain string concatenation.
DATA_FOLDER = "data/"
OUTPUT_FOLDER = "output/"
MODEL_FOLDER = "model/"

# Input files located inside DATA_FOLDER.
TITLE_FILE = os.path.join(DATA_FOLDER, "title_StackOverflow.txt")
CHECK_INDEX_FILE = os.path.join(DATA_FOLDER, "check_index.csv")
DOCS_FILE = os.path.join(DATA_FOLDER, "docs.txt")

In [8]:
# Create the output and model directories on first run; existing ones are left alone.
for folder in (OUTPUT_FOLDER, MODEL_FOLDER):
    if not os.path.exists(folder):
        os.makedirs(folder)

# Read and clean data

In [9]:
import nltk
# Load NLTK's pre-trained English Punkt model; lineToCleanWordSeqs uses it to
# split each input line into sentences.
# NOTE(review): presumably requires the 'punkt' resource to be installed
# beforehand (e.g. via nltk.download('punkt')) — confirm for the NLTK version in use.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def isUselessLine(line):
    """Return True when a raw text line should be dropped from the corpus.

    A line is considered useless when any of the following holds:
      * it has fewer than 10 characters,
      * it has fewer than 3 whitespace-separated words,
      * it starts with a whitespace character (indented lines are usually code).

    The indentation check accepts any leading whitespace (space, tab, ...),
    not just a literal space, so tab-indented code lines are filtered as well.
    """
    if len(line) < 10:
        return True
    if len(line.split()) < 3:
        return True
    # Safe to index: the line is known to have at least 10 characters here.
    return line[0].isspace()

def lineToCleanWordSeqs(line):
    """Turn one raw text line into a list of cleaned word sequences.

    Steps:
      1. Strip noisy tokens: any whitespace-delimited token containing ":/"
         (URL-like) or a backslash (Windows-path-like).
      2. Discard the whole line if isUselessLine() rejects what is left.
      3. Split the line into sentences with the Punkt tokenizer.
      4. For every sentence, replace non-letters with spaces, lowercase it and
         split it into words; keep only sentences with at least 3 words.

    Returns a list of word lists, e.g.
    [["i", "have", "an", "apple"], ["i", "have", "a", "pen"]],
    or an empty list when nothing usable remains.
    """
    for noise_pattern in (r"\S+:/\S+", r"\S+\\\S+"):
        line = re.sub(noise_pattern, "", line)

    if isUselessLine(line):
        return []

    candidate_seqs = (
        re.sub(r"[^a-zA-Z]", " ", sentence).lower().split()
        for sentence in tokenizer.tokenize(line)
    )
    return [words for words in candidate_seqs if len(words) >= 3]

In [10]:
# Build the training corpus from the docs file: each raw line contributes
# zero or more cleaned word sequences.
with open(DOCS_FILE, 'r') as docs_fh:
    raw_doc_lines = docs_fh.read().splitlines()

clean_word_seqs = [
    word_seq
    for raw_line in raw_doc_lines
    for word_seq in lineToCleanWordSeqs(raw_line)
]

In [11]:
# Number of cleaned word sequences extracted from DOCS_FILE.
len(clean_word_seqs)

115258

In [12]:
# Peek at 30 consecutive cleaned word sequences, starting at lineIdx.
lineIdx = 0
clean_word_seqs[lineIdx:lineIdx+30]

[['how',
  'do',
  'you',
  'expose',
  'a',
  'linq',
  'query',
  'as',
  'an',
  'asmx',
  'web',
  'service'],
 ['usually',
  'from',
  'the',
  'business',
  'tier',
  'i',
  'can',
  'return',
  'a',
  'typed',
  'dataset',
  'or',
  'datatable',
  'which',
  'can',
  'be',
  'serialized',
  'for',
  'transport',
  'over',
  'asmx'],
 ['how', 'can', 'i', 'do', 'the', 'same', 'for', 'a', 'linq', 'query'],
 ['is',
  'there',
  'a',
  'way',
  'to',
  'populate',
  'a',
  'typed',
  'dataset',
  'or',
  'datatable',
  'via',
  'a',
  'linq',
  'query'],
 ['how',
  'can',
  'i',
  'get',
  'the',
  'resultset',
  'of',
  'a',
  'linq',
  'query',
  'into',
  'a',
  'dataset',
  'or',
  'datatable'],
 ['alternatively',
  'is',
  'the',
  'linq',
  'query',
  'serializeable',
  'so',
  'that',
  'i',
  'can',
  'expose',
  'it',
  'as',
  'an',
  'asmx',
  'web',
  'service'],
 ['how',
  'do',
  'you',
  'page',
  'through',
  'a',
  'collection',
  'in',
  'linq',
  'given',
  'that',

## Read Title Data and add to training data

In [13]:
def sentenceToCleanWordSeqs(s):
    """Convert a single sentence into one flat list of lowercase words.

    Every character that is not an ASCII letter is treated as a separator.
    Despite the plural name, the result is a single word sequence
    (e.g. ["how", "do", "i"]), not a list of sequences.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", s)
    return letters_only.lower().split()

In [14]:
# Read the title file into a list of lines (line terminators stripped).
with open(TITLE_FILE) as f:
    titles = f.read().splitlines()

In [15]:
# One cleaned word sequence per title, in the same order as `titles`.
clean_title_word_seqs = [sentenceToCleanWordSeqs(title) for title in titles]

In [16]:
# Append the title word sequences to the training corpus (mutates clean_word_seqs in place).
# NOTE(review): not idempotent — re-running this cell without first re-running
# the cell that rebuilds clean_word_seqs from DOCS_FILE appends the titles again.
clean_word_seqs.extend(clean_title_word_seqs)

In [17]:
# Corpus size after the title sequences have been appended.
len(clean_word_seqs)

135258

# Training word2vec model

In [18]:
# Set values for various parameters
num_features = 1000    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words
num_iter = 10

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(clean_word_seqs, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling, iter=num_iter)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = 'w2v_with-title_{}feature_{}minwords_{}iter'.format(num_features, min_word_count, num_iter)
model.save(MODEL_FOLDER + model_name)

print("Done")

Training model...
Done


In [19]:
# Shape of the model's syn0 array. The second dimension in the recorded output
# equals num_features; the first is presumably the vocabulary size (TODO confirm).
model.syn0.shape

(6344, 1000)

In [22]:
# Sanity check: words the model ranks as most similar to "linq" (word, score pairs).
model.most_similar("linq")

[('hql', 0.6585182547569275),
 ('predicate', 0.6009401082992554),
 ('criteria', 0.5546219348907471),
 ('caml', 0.5519425868988037),
 ('hibernate', 0.5207352638244629),
 ('linqtosql', 0.5176195502281189),
 ('orderby', 0.5139288902282715),
 ('subquery', 0.48453181982040405),
 ('iqueryable', 0.48244035243988037),
 ('lambda', 0.47810477018356323)]

In [27]:
# Analogy-style query: words similar to "sql" and "macro" but dissimilar to "database".
model.most_similar(positive=["sql", "macro"], negative=["database"])

[('vba', 0.5704120397567749),
 ('automation', 0.5156331062316895),
 ('macros', 0.5053470134735107),
 ('excel', 0.4900357127189636),
 ('conditional', 0.47260475158691406),
 ('udf', 0.4654103219509125),
 ('formula', 0.46455878019332886),
 ('vsto', 0.4503313899040222),
 ('word', 0.41569530963897705),
 ('vb', 0.4061427116394043)]