In [4]:
import numpy as np
np.random.seed(13)

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils.data_utils import get_file
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import gensim
import pandas as pd

In [5]:
# NOTE: in the cells visible here, these four imports are referenced only by
# the disabled download code held in the string literal below.
from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm
import zipfile

# Disabled code: the triple-quoted string below is a bare string literal, so
# nothing inside it executes. It holds a helper that would download
# 'text8.zip' from mattmahoney.net with a tqdm progress bar (DLProgress.hook is
# written as a urlretrieve report hook) and extract it into the 'data' folder.
"""dataset_folder_path = 'data'
dataset_filename = 'text8.zip'
dataset_name = 'Text8 Dataset'

class DLProgress(tqdm):
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

if not isfile(dataset_filename):
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'http://mattmahoney.net/dc/text8.zip',
            dataset_filename,
            pbar.hook)

if not isdir(dataset_folder_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)"""
        
# Also disabled: reading 'clos_clean.txt' into `text`. Because this string
# literal is the last expression of the cell, its value is echoed as the
# cell's output; no file is opened and `text` is never defined.
"""with open('clos_clean.txt') as f:
    text = f.read()"""

"with open('clos_clean.txt') as f:\n    text = f.read()"

In [6]:
# Load the CSV into `df` and preview its 'Text' column, which is the column
# the corpus-building cell iterates over.
df = pd.read_csv("../CLOS.csv")
print(df.loc[:, 'Text'])

0      Be able to articulate the difference between d...
1      Demonstrate an understanding of scientific met...
2      Interpret evidence from an experiment and how ...
3      Critically read, evaluate claims made by both ...
4      Distinguish differences between science and ot...
5      Apply scepticism, the scientific method, evide...
6      correctly recall the applicable data type to s...
7      correctly write basic programming constructs: ...
8      correctly perform calls to a function includin...
9      correctly recall the use of basic data structu...
10     given a moderately complex problem, use the ab...
11     Given a problem, correctly choose a suitable d...
12     given a tree, correctly apply breadth first an...
13     correctly employ LIFO and/or FIFO structures t...
14     correctly employ graphs to model applicable pr...
15     write working programs to correctly solve prob...
16     utilize debugging tools in order to debug prog...
17     given an algorithm, anal

In [7]:
# Keep only usable sentences from the 'Text' column: real strings containing
# at least two spaces. The isinstance() guard matters because pandas reads
# empty CSV cells as float NaN, and NaN has no .count() method -- without the
# guard a single blank row aborts this cell with AttributeError.
sentences = [
    sentence
    for sentence in df['Text']
    if isinstance(sentence, str) and sentence.count(' ') >= 2
]

# Fit the word -> integer-index vocabulary, then encode every sentence as a
# list of word indices. `corpus` holds the encoded form consumed by training.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
corpus = tokenizer.texts_to_sequences(sentences)

# Total number of tokens across all encoded sentences.
nb_samples = sum(len(s) for s in corpus)

# Vocabulary size plus one: Keras Tokenizer indices start at 1 and index 0 is
# reserved, so arrays indexed by word id need V rows/columns.
V = len(tokenizer.word_index) + 1

# Hyper-parameters: embedding dimensionality and the number of context words
# taken on EACH side of the target word.
dim = 100
window_size = 5

In [8]:
def generate_data(corpus, window_size, V):
    """Yield one (context, target) training pair per token of the corpus.

    Args:
        corpus: iterable of sentences, each an indexable sequence of integer
            word ids.
        window_size: number of neighbouring positions taken on each side of
            the target word.
        V: number of classes used for the one-hot target encoding.

    Yields:
        Tuples ``(x, y)`` where ``x`` is the target's context word ids passed
        through ``sequence.pad_sequences`` with ``maxlen=2 * window_size`` and
        ``y`` is the target word id passed through
        ``np_utils.to_categorical`` with ``V`` classes. Each pair describes a
        single sample.
    """
    context_len = 2 * window_size
    for sentence in corpus:
        n_tokens = len(sentence)
        for center, target in enumerate(sentence):
            # Clamp the window to the sentence boundaries, then skip the
            # target position itself.
            first = max(0, center - window_size)
            stop = min(n_tokens, center + window_size + 1)
            neighbours = [sentence[pos] for pos in range(first, stop) if pos != center]

            x = sequence.pad_sequences([neighbours], maxlen=context_len)
            y = np_utils.to_categorical([target], V)
            yield (x, y)

In [9]:
# CBOW network: embed the 2*window_size context word ids, average the
# embeddings over the context axis, and predict the target word with a
# softmax over the V vocabulary slots.
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=window_size*2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])

In [10]:

# Targets are one-hot vectors, hence categorical cross-entropy.
cbow.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
)

In [11]:
# Ten passes over the corpus. Every (context, target) pair is its own batch,
# and the printed figure is the SUM of the per-batch losses for that pass
# (not a mean).
for ite in range(10):
    loss = 0.
    for x, y in generate_data(corpus, window_size, V):
        loss = loss + cbow.train_on_batch(x, y)
    print(ite, loss)

0 12025.620184659958
1 11368.931551933289
2 11161.525293588638
3 11091.765606880188
4 11051.3975263834
5 11003.50007045269
6 10958.66885304451
7 10910.168879806995
8 10852.820275485516
9 10787.426206409931


In [12]:

# Start the word2vec text-format export. The first line of that format is
# "<number of vectors> <dimension>"; V-1 is the number of words in
# tokenizer.word_index (V includes the extra index 0).
# UTF-8 is requested explicitly: gensim decodes this file as UTF-8 by default,
# whereas open() would otherwise fall back to the platform's locale encoding
# and could write non-ASCII tokens in a form gensim cannot read back.
# The handle stays open in `f` after this cell.
f = open('vectors.txt', 'w', encoding='utf-8')
f.write('{} {}\n'.format(V-1, dim))

8

In [13]:
# Export the learned embeddings in word2vec text format.
# The first weight array of the model belongs to its first layer, the
# Embedding, so row i of `vectors` is the embedding of word index i.
vectors = cbow.get_weights()[0]

# This cell opens the file itself instead of depending on a handle created in
# another cell: it can be re-run on its own, and the `with` block closes the
# file even if a write fails. Mode 'w' truncates the file, so a header written
# earlier is simply replaced by the identical one below. UTF-8 matches the
# default decoding used by gensim's loader.
with open('vectors.txt', 'w', encoding='utf-8') as f:
    # Header line: "<number of vectors> <dimension>".
    f.write('{} {}\n'.format(V-1, dim))
    # One line per vocabulary word: the word followed by its vector components.
    for word, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(word, str_vec))

In [14]:
# Read the exported text-format vectors back in as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    './vectors.txt',
    binary=False,
)

In [15]:

# Sanity check of the embeddings: the ten nearest neighbours of 'knowledge',
# returned as (word, similarity) pairs.
w2v.most_similar(positive=['knowledge'], topn=10)

[('the', 0.5963079929351807),
 ('an', 0.5930779576301575),
 ('random', 0.5675033926963806),
 ('understanding', 0.5253812670707703),
 ('circuits', 0.5224876999855042),
 ('systems', 0.4931008517742157),
 ('such', 0.47586336731910706),
 ('have', 0.47550955414772034),
 ('electronics', 0.4716382622718811),
 ('experiment', 0.4682241380214691)]