In [22]:
import sys
sys.path.append("../src/")
from tqdm import tqdm


import multiprocessing
from gensim.models import word2vec
import pandas as pd
from textDataset import *
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Thread budget used later for Word2Vec training: twice the CPU count reported
# by the OS, minus one. The message below prints this derived value, not the
# raw CPU count.
cpu_count = multiprocessing.cpu_count() * 2 - 1
print('Number of CPUs: {}'.format(cpu_count))

Number of CPUs: 7


In [17]:
# Location handed to TextDataset together with the '.csv' extension and ','
# separator (presumably a folder of CSV files - confirm in textDataset).
path2data = '../data/news_headlines/'

# Datasets are stored in a dict keyed by split name; only 'train' is built here.
text = dict(
    train=TextDataset(path2data, extension='.csv', sep=',', is_train=True),
)

In [18]:
# Column holding the raw headline strings.
col = 'headline_text'

# Pre-process that column with stop-word and tag removal switched on. Judging
# by the preview in the next cell, the result lands in a `<col>_data` column.
text['train'].process_data(
    col=col,
    remove_stopw=True,
    remove_tags=True,
)

In [19]:
# Preview the first rows of the training frame; the processed tokens show up in
# the `headline_text_data` column next to the original `headline_text`.
text['train'].data.head()

Unnamed: 0,publish_date,headline_text,subject,headline_text_data
0,20030303,unhooked brakes to blame for taiwan train disa...,news_headlines,"{blame, train, taiwan, disaster, brakes, unhoo..."
1,20030918,oldest prisoner in tas released citing health,news_headlines,"{prisoner, released, tas, oldest, citing, health}"
2,20030913,nine reportedly dead in portuguese plane crash,news_headlines,"{crash, portuguese, reportedly, dead, nine, pl..."
3,20031031,nurses welcome medicare rebate plan,news_headlines,"{plan, welcome, medicare, nurses, rebate}"
4,20030930,un cuts its iraq staff,news_headlines,"{iraq, staff, cuts, un}"


In [24]:
# Pull the processed token collections out of the frame as an array, and keep
# the same array reachable through a dict keyed by split name.
X_train = text['train'].data[col + '_data'].values
sentences = {'train': X_train}

In [None]:
# Word2Vec hyper-parameters
num_features = 300       # word vector dimensionality
min_word_count = 50      # minimum word count
num_workers = cpu_count  # number of threads to run in parallel
context = 10             # context window size
downsampling = 1e-3      # downsample setting for frequent words

# Build an untrained model; vocabulary and weights are filled in below.
W2Vmodel = word2vec.Word2Vec(
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# Each call receives a fresh list copy of the corpus. The tqdm bars therefore
# only time that copy (it finishes almost instantly), not the vocabulary scan
# or the training itself.
W2Vmodel.build_vocab(list(tqdm(X_train)))
W2Vmodel.train(
    list(tqdm(X_train)),
    total_examples=W2Vmodel.corpus_count,
    epochs=W2Vmodel.epochs,
)



100%|██████████| 1000001/1000001 [00:00<00:00, 2504641.78it/s]
100%|██████████| 1000001/1000001 [00:00<00:00, 2483610.03it/s]


In [None]:
print ('Building tf-idf matrix ...')
# The documents are already collections of tokens, so the analyzer simply
# passes each one through; tokens appearing in fewer than 10 documents are
# ignored (min_df=10).
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(list(X_train))

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides so the cell
# runs on both old and new versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Map every retained token to its inverse-document-frequency weight.
tfidf = dict(zip(feature_names, vectorizer.idf_))
print ('vocab size : {}'.format(len(tfidf)))

In [None]:
def buildWordVector(model, tfidf, tokens, size):
    """Combine the word vectors of ``tokens`` into one TF-IDF-weighted average.

    Parameters
    ----------
    model : object
        Source of word vectors. Either something exposing them through a
        ``wv`` attribute (e.g. a trained gensim ``Word2Vec``) or any object
        that can be indexed by word directly (``KeyedVectors``, a dict of
        arrays, ...). Lookups must raise ``KeyError`` for unknown words.
    tfidf : mapping
        Word -> weight. Words missing from it are skipped.
    tokens : iterable of str
        The words of a single document.
    size : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, size)``: the sum of ``vector * weight`` over all
        words known to both ``model`` and ``tfidf``, divided by the number of
        such words. All zeros when no word is known to both.
    """
    # Indexing a Word2Vec object directly is deprecated in gensim 3.x and is a
    # TypeError in gensim 4; the vectors live on ``model.wv``. Falling back to
    # ``model`` keeps plain mappings and KeyedVectors working unchanged.
    vectors = getattr(model, 'wv', model)

    vec = np.zeros((1, size))
    count = 0.
    for word in tokens:
        try:
            weighted = vectors[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Word unknown to the embedding model or to the tf-idf vocabulary.
            continue
        vec += weighted
        count += 1.
    if count != 0:
        vec /= count
    return vec

In [None]:
# One (1, num_features) vector per document, stacked row-wise into a single
# 2-D feature matrix.
f_train = np.concatenate(
    [
        buildWordVector(W2Vmodel, tfidf, tokens, num_features)
        for tokens in X_train
    ]
)

In [None]:
# Sanity check: report the dimensions of the stacked feature matrix.
print('Shape of train features:: {}'.format(f_train.shape))

In [None]:
def elbow_rule(f_train, max_nb_cluster = 10, distortions_method='euclidean', plot=True):
    """Fit K-means for a range of cluster counts and measure each fit's distortion.

    For every ``k`` in ``1 .. max_nb_cluster - 1`` a K-means model is fitted on
    ``f_train``. Its distortion is the mean distance from each sample to its
    closest cluster centre.

    Parameters
    ----------
    f_train : numpy.ndarray
        2-D feature matrix, one row per sample.
    max_nb_cluster : int, default 10
        Exclusive upper bound on the number of clusters tried.
    distortions_method : str, default 'euclidean'
        Distance metric name passed to ``scipy.spatial.distance.cdist`` when
        measuring sample-to-centre distances.
    plot : bool, default True
        When true, draw distortion against ``k`` so the elbow can be read off.

    Returns
    -------
    list of float
        One distortion per tried ``k``, in increasing order of ``k``.

    NOTE(review): ``np``, ``KMeans``, ``cdist`` and ``plt`` are not imported in
    the visible cells - presumably they arrive through
    ``from textDataset import *``; confirm.
    """
    cluster_counts = range(1, max_nb_cluster)
    distortions = []

    for k in cluster_counts:
        print('Training K-means models for {} cluster/s...'.format(k))
        # Fit exactly once per k; a second fit would only redo the clustering.
        kmeanModel = KMeans(n_clusters=k).fit(f_train)
        # Distance from every sample to its nearest centre, then the average.
        nearest = np.min(
            cdist(f_train, kmeanModel.cluster_centers_, distortions_method),
            axis=1,
        )
        distortions.append(nearest.mean())

    if plot:
        # Plot the elbow
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(list(cluster_counts), distortions, 'bx-')
        ax.set_xlabel('k')
        ax.set_ylabel('Distortion')
        ax.set_title('Elbow Method')
        plt.show()

    return distortions

In [None]:
# Run the elbow analysis on the training features with the default settings
# (k from 1 to 9, euclidean distortion, plot shown).
elbow_rule(f_train)