# 1 Load data

### the data/GoogleNews-vectors-negative300.bin can be downloaded here
(https://github.com/mmihaltz/word2vec-GoogleNews-vectors)

In [10]:
import os

from gensim.models.keyedvectors import KeyedVectors
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from utils import cluster_quality
EMBEDDING_FILE = 'data/GoogleNews-vectors-negative300.bin'

In [11]:
text_path = 'data/Biomedical.txt'
label_path = 'data/Biomedical_gnd.txt'

with open(text_path) as f:
    data = [text.strip() for text in f]

with open(label_path) as f:
    target = f.readlines()
target = [int(label.rstrip('\n')) for label in target]

print("Total: %s short texts" % format(len(data), ","))

Total: 20,000 short texts


# 2 Word embedding

In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [13]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
sequences_full = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 18888 unique tokens.


In [14]:
MAX_NB_WORDS = len(word_index)

seq_lens = [len(s) for s in sequences_full]
print("Average length: %d" % np.mean(seq_lens))
print("Max length: %d" % max(seq_lens))
print("Max length: %d" % min(seq_lens))

Average length: 12
Max length: 53
Max length: 1


In [15]:
MAX_SEQUENCE_LENGTH = max(seq_lens)

X = pad_sequences(sequences_full, maxlen=MAX_SEQUENCE_LENGTH)
y = target
print(X)
print(y)

[[    0     0     0 ...     1     2  9676]
 [    0     0     0 ...     4  3647    23]
 [    0     0     0 ...   390    25   176]
 ...
 [    0     0     0 ...   103   137    59]
 [    0     0     0 ...   132     1 18886]
 [    0     0     0 ...     9    82   412]]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [8]:
X.shape

(20000, 53)

In [16]:
############################
# Preparing embedding matrix
############################


print('Preparing embedding matrix')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

EMBEDDING_DIM = 300
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        print(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
of
and
a
to
escherichia
transl
12
ribonucleic
rna
deoxyribonucleic
fibres
saccharomyces
geniculate
hodgkin
histochemical
adenylate
k12
phytohemagglutinin
tumours
concanavalin
deferens
tumour
14c
14
10
purkinje
colliculus
hydroxytryptamine
leukaemia
rauscher
igg
17
11
labelled
actinomycin
trna
15
hydroxydopamine
triphosphatase
taenia
behaviour
balb
acth
foetal
20
ca2
catabolite
atpase
streptomyces
isoprenaline
adrenoceptors
autoradiographic
sarcoplasmic
choriomeningitis
gnotobiotic
40
megaterium
colicin
marek
hla
sendai
motoneurones
bcg
16
isocitrate
aplysia
eeg
cytochemical
c57bl
glucuronidase
igm
intrarenal
renovascular
hexokinase
b12
anaemia
nzb
thymineless
ischaemic
21
fibre
somatomedin
golgi
lipopolysaccharides
stearothermophilus
glutamic
cytochalasin
guanethidine
saralasin
aminoacyl
24
100
decerebrate
phosphofructokinase
neurospora
labelling
luteinizing
125i
gmp
sartorius
phosphoenolpyruvate
po2
18
pgo
chymotrypsin
radioautographic
tolbutamide
langerhans

cerebello
organisation
thourgh
fimbria
tectal
arborizations
lucus
entorhinal
subiculum
grnerated
microembolization
ergotoxine
axotomized
subcoeruleus
methylhistamine
activiy
rhombencephalon
prostaglanding
f2beta
f1beta
neuroanatomic
microdetermination
alphaamino
retine
inbread
leucoanthocyanins
chemosterilants
thrombokinase
mollusca
134
deoxycytidylate
thiochrome
cytonucleoproteins
carbodiimides
dinucleotidase
heparinic
leptotrichia
maltigenes
spectrochemical
hypoferremia
mitomycins
porfiromycin
anhydrides
denitrificans
glycosaminopeptides
coproporphyrinogenase
heartwoods
peroxidatically
hemerythrin
photoperiodically
noninduced
xanthium
myrothecium
verrucaria
parvula
brinaldix
corrinoids
sclerotium
bataticola
thermostability
chloropseudomonas
ethylicum
azaguanine
polynoxylin
noxythiolin
sulphomethyl
polybenzenoid
ovoglycoprotein
polytoma
uvella
transhydrogenating
dextrorotatory
diastereoisomeric
trihydroxyflavan
sulphadimethoxine
dicyandiamide
aminopyrimidine
lolium
perenne
phosphoglyc

99mtechnetium
urophyses
urotensin
indoramin
remak
parasympathectomy
eicosa
tetraynoic
delta1
crystaloid
normetanephrine
pithing
hydroxykynurenamine
histofluorescence
thermoneutrality
dyskinesis
calices
anoccygeus
adrenolytic
glucopenia
trinitrate
microiontophoretically
uterotonic
metoestrus
catechlamine
hydroxydropamine
nigro
neostriatal
occlucion
ergometrine
38407
nephrotensin
dimethoxy
anorexigenic
cathecholamines
stimuation
isoindole
mazindol
548
erytrho
benzodioxan
piperidyl
benzimidazolinone
28935
jugulare
prodigiozan
dopamin
cyclasses
piribedil
methylhistidines
ohda
septi
ajamline
antogonist
venomotor
esthesioneuroma
erinaceus
europaeus
trihydroxyindole
methylnorepinephrine
unreleated
sciated
asympathicotonic
ganglionectomy
enterohepatic
cardiodepressor
anilide
tryptamines
norepinephrinergic
tegmentalarea
clopimozide
764
diphenylbutylpiperidine
postjunctional
chloroamphetamine
602
vasoconstrictors
iprindole
2985
branchial
dioxepino
antilipidemic
intracisternally
dihydroergocristi

In [17]:
#################################################
# Preparing target using Average embeddings (AE)
#################################################
Y = {}
tfidf = tokenizer.sequences_to_matrix(sequences_full, mode='tfidf')
denom = 1 + np.sum(tfidf, axis=1)[:, None]
normed_tfidf = tfidf/denom
average_embeddings = np.dot(normed_tfidf, embedding_matrix)
Y["ae"] = average_embeddings
print("Shape of average embedding: ", Y['ae'].shape)

# binary Y
from utils import binarize
reduction_name = "ae"
B = binarize(Y[reduction_name])

# Last dimension in the CNN
TARGET_DIM = B.shape[1]

# Example of binarized target vector
print(B.shape)
print(B[0])

Shape of average embedding:  (20000, 300)
(20000, 300)
[0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1.
 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1.
 1. 1. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0.
 1. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0.
 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1.
 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 0. 1. 1.
 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 0.
 1. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1.
 1. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0.
 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 1.
 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 1. 1.
 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0.]


# 3 Training

In [18]:
################################################
# train model
################################################

from keras.layers import Input, Embedding, Flatten, Reshape
from keras.layers import Dense, Conv1D, Dropout, merge
from keras.layers import MaxPooling1D, GlobalMaxPooling1D, ZeroPadding1D
from keras.models import Model


In [19]:
def get_model():
    embedding_matrix_copy = embedding_matrix.copy()
    trainable_embedding = False
    # Embedding layer
    pretrained_embedding_layer = Embedding(
        input_dim=nb_words,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
    )

    # Input
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = pretrained_embedding_layer(sequence_input)
    
    # 1st Layer
    x = Conv1D(100, 5, activation='tanh', padding='same')(embedded_sequences)
    x = GlobalMaxPooling1D()(x)

    # Output
    x = Dropout(0.5)(x)
    predictions = Dense(TARGET_DIM, activation='sigmoid')(x)
    model = Model(sequence_input, predictions)

    model.layers[1].trainable=trainable_embedding

    adam = Adam(lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # Loss and Optimizer
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['mae'])
    # Fine-tune embeddings or not
    model.summary()
    return model

In [20]:
if __name__ == '__main__':
    nb_epoch = 50
    checkpoint = ModelCheckpoint('models/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
    model = get_model()
    model.fit(X, B, validation_split=0.2,
              epochs=nb_epoch, batch_size=100, verbose=1, shuffle=True)

    # create model that gives penultimate layer
    input = model.layers[0].input
    output = model.layers[-2].output
    model_penultimate = Model(input, output)

    # inference of penultimate layer
    H = model_penultimate.predict(X)
    print("Sample shape: {}".format(H.shape))




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 53)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 53, 300)           5666700   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 53, 100)           150100    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               30300     
Total params: 5,847,100
Trainable params: 180,400
Non-trainable params: 5,666,700
____________________________________________________________

Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Sample shape: (20000, 100)


# 4 Test

In [21]:
    from sklearn.preprocessing import normalize
    from sklearn.cluster import KMeans

    true_labels = y
    n_clusters = len(np.unique(y))
    print("Number of classes: %d" % n_clusters)
    km = KMeans(n_clusters=n_clusters, n_jobs=10)
    result = dict()
    V = normalize(H, norm='l2')
    km.fit(V)
    pred = km.labels_
    print(pred)
    a = {'deep': cluster_quality(true_labels, pred)}
    np.save("pred.npy", pred)
    model.save_weights("model.plk")

Number of classes: 20
[ 7  5  0 ... 10 16 11]
Homogeneity: 0.250
Completeness: 0.252
V-measure: 0.251
NMI: 0.251
Rand score: 0.140
Accuracy: 0.309




# Exercise 1 on the StackOverflow dataset

## data can be obtained in the filefold "data"

## step 1 load data

In [None]:
##############################
#####  Input your codes  #####
##############################

In [None]:
text_path = 'data/StackOverflow.txt'
label_path = 'data/StackOverflow_gnd.txt'

with open(text_path) as f:
    data = [text.strip() for text in f]

with open(label_path) as f:
    target = f.readlines()
target = [int(label.rstrip('\n')) for label in target]

print("Total: %s short texts" % format(len(data), ","))

## step 2 word embedding

In [None]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
sequences_full = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
MAX_NB_WORDS = len(word_index)

seq_lens = [len(s) for s in sequences_full]
print("Average length: %d" % np.mean(seq_lens))
print("Max length: %d" % max(seq_lens))
print("Max length: %d" % min(seq_lens))

In [None]:
MAX_SEQUENCE_LENGTH = max(seq_lens)

X = pad_sequences(sequences_full, maxlen=MAX_SEQUENCE_LENGTH)
y = target
print(X)
print(y)

In [None]:
############################
# Preparing embedding matrix
############################


print('Preparing embedding matrix')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

EMBEDDING_DIM = 300
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        print(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

In [None]:
#################################################
# Preparing target using Average embeddings (AE)
#################################################
Y = {}
tfidf = tokenizer.sequences_to_matrix(sequences_full, mode='tfidf')
denom = 1 + np.sum(tfidf, axis=1)[:, None]
normed_tfidf = tfidf/denom
average_embeddings = np.dot(normed_tfidf, embedding_matrix)
Y["ae"] = average_embeddings
print("Shape of average embedding: ", Y['ae'].shape)

# binary Y
from utils import binarize
reduction_name = "ae"
B = binarize(Y[reduction_name])

# Last dimension in the CNN
TARGET_DIM = B.shape[1]

# Example of binarized target vector
print(B.shape)
print(B[0])

## step 3 training 

In [None]:
################################################
# train model
################################################

from keras.layers import Input, Embedding, Flatten, Reshape
from keras.layers import Dense, Conv1D, Dropout, merge
from keras.layers import MaxPooling1D, GlobalMaxPooling1D, ZeroPadding1D
from keras.models import Model

In [None]:
if __name__ == '__main__':
    nb_epoch = 50
    checkpoint = ModelCheckpoint('models/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
    model = get_model()
    model.fit(X, B, validation_split=0.2,
              epochs=nb_epoch, batch_size=100, verbose=1, shuffle=True)

    # create model that gives penultimate layer
    input = model.layers[0].input
    output = model.layers[-2].output
    model_penultimate = Model(input, output)

    # inference of penultimate layer
    H = model_penultimate.predict(X)
    print("Sample shape: {}".format(H.shape))


## step 4 test 

In [None]:
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

true_labels = y
n_clusters = len(np.unique(y))
print("Number of classes: %d" % n_clusters)
km = KMeans(n_clusters=n_clusters, n_jobs=10)
result = dict()
V = normalize(H, norm='l2')
km.fit(V)
pred = km.labels_
print(pred)
a = {'deep': cluster_quality(true_labels, pred)}
np.save("pred.npy", pred)
model.save_weights("model.plk")

# Exercise 2 on the SearchSnippets dataset

## step 1 load data

In [None]:
##############################
#####  Input your codes  #####
##############################

In [None]:
text_path = 'data/SearchSnippets.txt'
label_path = 'data/SearchSnippets_gnd.txt'

with open(text_path) as f:
    data = [text.strip() for text in f]

with open(label_path) as f:
    target = f.readlines()
target = [int(label.rstrip('\n')) for label in target]

print("Total: %s short texts" % format(len(data), ","))

## step 2 word embedding

In [None]:
tokenizer = Tokenizer(char_level=False)
tokenizer.fit_on_texts(data)
sequences_full = tokenizer.texts_to_sequences(data)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
MAX_NB_WORDS = len(word_index)

seq_lens = [len(s) for s in sequences_full]
print("Average length: %d" % np.mean(seq_lens))
print("Max length: %d" % max(seq_lens))
print("Max length: %d" % min(seq_lens))

In [None]:
MAX_SEQUENCE_LENGTH = max(seq_lens)

X = pad_sequences(sequences_full, maxlen=MAX_SEQUENCE_LENGTH)
y = target
print(X)
print(y)

In [None]:
############################
# Preparing embedding matrix
############################


print('Preparing embedding matrix')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

EMBEDDING_DIM = 300
nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        print(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

In [None]:
#################################################
# Preparing target using Average embeddings (AE)
#################################################
Y = {}
tfidf = tokenizer.sequences_to_matrix(sequences_full, mode='tfidf')
denom = 1 + np.sum(tfidf, axis=1)[:, None]
normed_tfidf = tfidf/denom
average_embeddings = np.dot(normed_tfidf, embedding_matrix)
Y["ae"] = average_embeddings
print("Shape of average embedding: ", Y['ae'].shape)

# binary Y
from utils import binarize
reduction_name = "ae"
B = binarize(Y[reduction_name])

# Last dimension in the CNN
TARGET_DIM = B.shape[1]

# Example of binarized target vector
print(B.shape)
print(B[0])

## step 3 training 

In [None]:
################################################
# train model
################################################

from keras.layers import Input, Embedding, Flatten, Reshape
from keras.layers import Dense, Conv1D, Dropout, merge
from keras.layers import MaxPooling1D, GlobalMaxPooling1D, ZeroPadding1D
from keras.models import Model

In [None]:
if __name__ == '__main__':
    nb_epoch = 50
    checkpoint = ModelCheckpoint('models/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
    model = get_model()
    model.fit(X, B, validation_split=0.2,
              epochs=nb_epoch, batch_size=100, verbose=1, shuffle=True)

    # create model that gives penultimate layer
    input = model.layers[0].input
    output = model.layers[-2].output
    model_penultimate = Model(input, output)

    # inference of penultimate layer
    H = model_penultimate.predict(X)
    print("Sample shape: {}".format(H.shape))


## step 4 test 

In [None]:
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans

true_labels = y
n_clusters = len(np.unique(y))
print("Number of classes: %d" % n_clusters)
km = KMeans(n_clusters=n_clusters, n_jobs=10)
result = dict()
V = normalize(H, norm='l2')
km.fit(V)
pred = km.labels_
print(pred)
a = {'deep': cluster_quality(true_labels, pred)}
np.save("pred.npy", pred)
model.save_weights("model.plk")

In [None]:
https://github.com/jacoxu/STC2
https://github.com/zqhZY/short_text_cnn_cluster
https://datawarrior.wordpress.com/2016/10/12/short-text-categorization-using-deep-neural-networks-and-word-embedding-models/