In [1]:
from eval import *
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics.cluster import *
from sklearn.datasets import fetch_20newsgroups
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# get data 
newsgroups_train = fetch_20newsgroups(data_home="~/workspace/scikit_learn_data", subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(data_home="~/workspace/scikit_learn_data", subset='test', remove=('headers', 'footers', 'quotes'))

In [3]:
text, labels = newsgroups_test.data, newsgroups_test.target
test_batch_size=1000
size = len(labels)
print("Loaded dataset {} with total lines: {}".format("20 NEWS", size))

top_level_labels = np.copy(labels)
top_categories = dict((name, i) for (i, name) in enumerate(set(map(lambda x: x.split('.')[0], newsgroups_test.target_names))))
print(top_categories)
for i, name in enumerate(newsgroups_test.target_names):
    print(i, name)
    top = name.split('.')[0]
    top_level_labels[labels == i ] = top_categories[top]

print(top_level_labels[:15])
print(labels[:15])

Loaded dataset 20 NEWS with total lines: 7532
{'misc': 0, 'talk': 1, 'sci': 2, 'rec': 3, 'comp': 4, 'alt': 5, 'soc': 6}
0 alt.atheism
1 comp.graphics
2 comp.os.ms-windows.misc
3 comp.sys.ibm.pc.hardware
4 comp.sys.mac.hardware
5 comp.windows.x
6 misc.forsale
7 rec.autos
8 rec.motorcycles
9 rec.sport.baseball
10 rec.sport.hockey
11 sci.crypt
12 sci.electronics
13 sci.med
14 sci.space
15 soc.religion.christian
16 talk.politics.guns
17 talk.politics.mideast
18 talk.politics.misc
19 talk.religion.misc
[3 4 5 1 1 2 6 6 4 4 4 4 1 3 5]
[ 7  5  0 17 19 13 15 15  5  1  2  5 17  8  0]


In [4]:
#train d2v model
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(newsgroups_train.data)]
d2v = Doc2Vec(documents, vector_size=1000, window=2, min_count=1, workers=8)

2019-07-16 13:47:03 INFO     collecting all words and their counts
2019-07-16 13:47:03 INFO     PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-07-16 13:47:04 INFO     PROGRESS: at example #10000, processed 12357648 words (10047583/s), 121 word types, 10000 tags
2019-07-16 13:47:04 INFO     collected 121 word types and 11314 unique tags from a corpus of 11314 examples and 13781985 words
2019-07-16 13:47:04 INFO     Loading a fresh vocabulary
2019-07-16 13:47:04 INFO     min_count=1 retains 121 unique words (100% of original 121, drops 0)
2019-07-16 13:47:04 INFO     min_count=1 leaves 13781985 word corpus (100% of original 13781985, drops 0)
2019-07-16 13:47:04 INFO     deleting the raw counts dictionary of 121 items
2019-07-16 13:47:04 INFO     sample=0.001 downsamples 44 most-common words
2019-07-16 13:47:04 INFO     downsampling leaves estimated 4083000 word corpus (29.6% of prior 13781985)
2019-07-16 13:47:04 INFO     estimated required memory for 121 wo

2019-07-16 13:47:33 INFO     training on a 68909925 raw words (16779378 effective words) took 29.0s, 578219 effective words/s


In [5]:
#load qt model
checkpoint_dir = '/home/jcjessecai/workspace/taboola/quickthoughts/checkpoints'
with open("{}/config.json".format(checkpoint_dir)) as fp:
    CONFIG = json.load(fp)

WV_MODEL = api.load(CONFIG['embedding'])
qt = QuickThoughts(WV_MODEL, hidden_size=CONFIG['hidden_size'])
trained_params = torch.load("{}/checkpoint_latest.pth".format(checkpoint_dir))
qt.load_state_dict(trained_params['state_dict'])
qt = qt.cuda()
qt.eval()
print("Restored successfully from {}".format(checkpoint_dir))

2019-07-16 13:48:36 INFO     loading projection weights from /home/jcjessecai/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2019-07-16 13:50:42 INFO     loaded (400000, 300) matrix from /home/jcjessecai/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
  "num_layers={}".format(dropout, num_layers))


Restored successfully from /home/jcjessecai/workspace/taboola/quickthoughts/checkpoints


In [6]:
#encode data
def make_batch(j):
    """Processes one test batch of the test datset"""
    stop_idx = min(size, j+test_batch_size)
    batch_text, batch_labels  = text[j:stop_idx], labels[j:stop_idx]
    data = list(map(lambda x: torch.LongTensor(prepare_sequence(x, WV_MODEL.vocab, no_zeros=True)), batch_text))
    for i in data:
        if len(i) == 0:
            print(i)
            input()
    packed = safe_pack_sequence(data).cuda()
    return qt(packed).cpu().detach().numpy()

feature_list = [make_batch(i) for i in range(0, size, test_batch_size)]
print("Processing {:5d} batches of size {:5d}".format(len(feature_list), test_batch_size))
qt_features = np.concatenate(feature_list)
print("Test feature matrix of shape: {}".format(qt_features.shape))

Processing     8 batches of size  1000
Test feature matrix of shape: (7532, 1000)


In [7]:

d2v_features = np.vstack([d2v.infer_vector(doc) for doc in newsgroups_test.data])
print(d2v_features.shape)

(7532, 1000)


In [8]:
#first we compare embedding performance by fitting binary classifier on top
s=1
X_train, X_test, y_train, y_test = train_test_split(d2v_features, top_level_labels)
clf = LogisticRegression(solver='sag', C=s)
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)

print("Fit logistic model with s: {:3d} and acc: {:.2%}".format(s, acc))

X_train, X_test, y_train, y_test = train_test_split(qt_features, top_level_labels)
clf = LogisticRegression(solver='sag', C=s)
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)

print("Fit logistic model with s: {:3d} and acc: {:.2%}".format(s, acc))



Fit logistic model with s:   1 and acc: 46.36%




Fit logistic model with s:   1 and acc: 64.90%


In [11]:
qt_predicted  = KMeans(n_clusters=7, n_jobs=20).fit_predict(qt_features)
d2v_predicted  = KMeans(n_clusters=7, n_jobs=20).fit_predict(d2v_features)

In [14]:
print(adjusted_rand_score(top_level_labels, qt_predicted))
print(adjusted_rand_score(top_level_labels, d2v_predicted))

print(adjusted_mutual_info_score(top_level_labels, qt_predicted))
print(adjusted_mutual_info_score(top_level_labels, d2v_predicted))

0.10898983664881542
0.045095497905550064
0.15014304339815002
0.0669735747454644




In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

feature_embedded = TSNE(n_components=2, verbose=1).fit_transform(features[:5000])

fig, axs = plt.subplots(4, 5, figsize=(20, 15))

fixed_xlim, fixed_ylim = (-85.6129455010451, 110.84618372125996), (-90.00594782812799, 80.691349744632)

for i in range(20):
    selected = feature_embedded[labels[:5000] == i]
    ax = axs[i//5, i%5]
    ax.scatter(selected[:, 0], selected[:, 1])
    ax.set_ylim(fixed_ylim)
    ax.set_xlim(fixed_xlim)
    
plt.show()

fig, axs = plt.subplots(4, 5, figsize=(20, 15))

fixed_xlim, fixed_ylim = (-85.6129455010451, 110.84618372125996), (-90.00594782812799, 80.691349744632)

for i in range(20):
    selected = feature_embedded[predicted_spectral[:5000] == i]
    ax = axs[i//5, i%5]
    ax.scatter(selected[:, 0], selected[:, 1])
    ax.set_ylim(fixed_ylim)
    ax.set_xlim(fixed_xlim)
    
plt.show()

fig, axs = plt.subplots(4, 5, figsize=(20, 15))

fixed_xlim, fixed_ylim = (-85.6129455010451, 110.84618372125996), (-90.00594782812799, 80.691349744632)

for i in range(20):
    selected = feature_embedded[predicted_kmeans[:5000] == i]
    ax = axs[i//5, i%5]
    ax.scatter(selected[:, 0], selected[:, 1])
    ax.set_ylim(fixed_ylim)
    ax.set_xlim(fixed_xlim)
    
plt.show()