In [32]:
import os 
import pickle
import random
import numpy as np

In [35]:
from gensim.models import Word2Vec

In [54]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras import backend as k
from tensorflow.keras.layers import Lambda
from tensorflow import expand_dims
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Flatten
import tensorflow

In [62]:
data_dir = os.path.join(os.getcwd(), 'glassdoor_problem')


def _load_pickle(filename):
    """Unpickle and return the object stored in `filename` inside `data_dir`."""
    # NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    with open(os.path.join(data_dir, filename), 'rb') as handle:
        return pickle.load(handle)


training_data = _load_pickle('training_data.pkl')
validation_data = _load_pickle('validation_data.pkl')
test_data = _load_pickle('test_data.pkl')

In [64]:
# To keep training fast, it is better to feed batches of data during training than just one data point at a time
def cluster_sentences(data, batch_size):
    """
    Group sentences into batches in which every sentence has the same number of words.

    Given:
    data: [("word11 word12 ..", label_vector), ("word21 word22 ..", label_vector), ....] - List of sentences with labels
    batch_size : size of the batch to be used during training (must be a positive integer)

    Return:
    A cluster of TOKENIZED sentences, which is a list of lists, where each entry in the output is a list of
    length batch_size containing tokenized sentences with the same number of words, as shown below
    [[([word11, word12], label_vector), ([word21, word22], label_vector), ...],
    [([word31, word32, word33], label_vector), ([word41, word42, word43], label_vector), ...],
    ]
    Batches are ordered by increasing sentence length.

    Notes:
    - Sentences are tokenized by splitting on a single space.
    - A sentence length is used only if it has strictly more than batch_size sentences.
    - Sentences that do not fill a complete batch are dropped.
    - Sentences of the same length are shuffled with the `random` module before batching, so batch
      membership changes between calls unless the `random` seed is fixed.
    - Sentences of any length are supported (there is no upper limit on the number of words).

    Raises:
    ValueError: if batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    # Bucket the tokenized sentences by their number of words.
    buckets = {}
    for entry in data:
        tokens = entry[0].split(" ")
        buckets.setdefault(len(tokens), []).append((tokens, entry[1]))

    batched_sentences = []
    for length in sorted(buckets):
        same_length_sentences = buckets[length]
        # take only those sentence lengths which have number of sentences greater than batch_size
        if len(same_length_sentences) <= batch_size:
            continue
        random.shuffle(same_length_sentences)
        number_of_batches = len(same_length_sentences) // batch_size
        batched_sentences.extend(
            same_length_sentences[(i * batch_size):((i + 1) * batch_size)]
            for i in range(number_of_batches)
        )

    return batched_sentences

In [65]:
# Build the same-length batches for each split (training first, then validation, then test).
batch_size = 8
training_clusters, validation_clusters, test_clusters = (
    cluster_sentences(split, batch_size)
    for split in (training_data, validation_data, test_data)
)

In [66]:
# Load the saved word2vec model from the project folder under the current working directory.
wordvec_model_path = os.path.join(os.getcwd(), 'glassdoor_problem/wordvecmodel')
wordvec_model = Word2Vec.load(wordvec_model_path)

def get_word_vector(word, word2vecmodel):
    """Return the entry stored for `word` in the model's `wv` lookup."""
    keyed_vectors = word2vecmodel.wv
    return keyed_vectors[word]

# a function to create the input matrix given the tokenized sentences using the word2vec model
def get_word2vec_input_matrix(list_of_words, word2vecmodel=None):
    """
    Convert tokenized sentences into a numpy array of word vectors.

    list_of_words: list of lists of words. Each entry is a list of words and all entries
        are expected to have the same length, so the per-sentence arrays stack into one array.
    word2vecmodel: model used to look up each word through get_word_vector. When omitted,
        the module-level `wordvec_model` is used.

    Returns a numpy array with one sub-array per sentence, each holding the vectors of that
    sentence's words in order.
    """
    if word2vecmodel is None:
        # Fall back to the model loaded at notebook level.
        word2vecmodel = wordvec_model
    return np.array([
        np.array([get_word_vector(word, word2vecmodel) for word in word_group])
        for word_group in list_of_words
    ])

In [67]:
"""
This cell defines:
- a generator that loads the data batch by batch,
- a helper (used inside a Lambda layer) that extracts the last element of every sequence in a batch, to be sent to the CNN part,
- the LSTM + CNN model itself.
"""
def generator(data):
    """
    The generator to load data for training.

    data: list of batches, where each batch is a list of (list_of_words, label_vector)
        entries (the structure returned by cluster_sentences).

    Yields (x_values, y_values) for one batch per iteration:
    x_values is get_word2vec_input_matrix applied to the batch's word lists and
    y_values is a numpy array of the batch's label vectors.
    After the last batch it starts again from the first one, so it never terminates.

    Raises:
    ValueError: on the first iteration if `data` contains no batches (there would be
        nothing to yield, and looping forever without yielding would hang the caller).
    """
    if len(data) == 0:
        raise ValueError("generator() needs at least one batch, got empty data")
    while True:
        for batch in data:
            x_values = get_word2vec_input_matrix([sen[0] for sen in batch])
            y_values = np.array([sen[1] for sen in batch])
            yield x_values, y_values

def get_last_elements(tensor):
    """
    Keep the last element of every sequence in a batch and append a trailing axis of size 1.

    tensor: rank-3 tensor indexed as (batch, position, features), e.g. the output of an
        LSTM layer created with return_sequences=True.

    Returns a tensor of shape (batch, features, 1) holding, for each batch entry, the
    feature vector found at the last position. Works for any number of features and
    does not need the batch size to be known in advance.
    """
    # [:, -1, :] selects the last position; None adds the trailing axis.
    return tensor[:, -1, :, None]

# batch_size to train
# (same value as the batch_size given to cluster_sentences earlier in the notebook; it is used
# below as the fixed batch dimension of the model input)
batch_size = 8 
# size of the word vectors to be passed in the neural network model
# NOTE(review): presumably this must equal the vector size of the loaded word2vec model - confirm
word_vec_size = 100

# Input with a fixed batch dimension, a variable number of words (None) and one vector per word
inp = Input(batch_shape=(batch_size, None, word_vec_size))
# LSTM with 30 units; return_sequences=True makes it return an output for every word
encoded1 = LSTM(30, return_sequences=True, activation='tanh')(inp)
# keep only the LSTM output at the last word of each sentence (see get_last_elements)
encoded = Lambda(lambda x: get_last_elements(x))(encoded1)
# 1D convolution with 32 filters and kernel size 2 over the last-word representation
# NOTE(review): input_shape=(1, 30) does not match the (30, 1) slices built by get_last_elements;
# it looks like this argument has no effect because the layer is called on an existing tensor - confirm
convolved = Conv1D(32, 2, input_shape=(1, 30), activation='relu')(encoded)
# max pooling with window 3 and stride 3, then flatten for the dense layer
pooled = MaxPooling1D(3, strides=3)(convolved)
flattened = Flatten()(pooled)
# 8 sigmoid outputs, each between 0 and 1
output_probabilities = Dense(8, activation='sigmoid')(flattened)
# scale the outputs by 8
# NOTE(review): presumably to match label vectors whose non-zero entries are 8 - confirm
output_vector = Lambda(lambda x: x*8)(output_probabilities)
model = Model(inp, output_vector)
# trained as a regression on the scaled outputs: mean squared error loss, plain SGD optimizer
model.compile(loss='mean_squared_error', optimizer='sgd')

In [68]:
# Callback to save checkpoints: weights only, and only when val_loss improves (mode='min').
checkpoint_dir = os.path.join(os.getcwd(), 'glassdoor_problem/glassdoor_assignment_models/')
# Create the folder up front so the first save cannot fail because the directory is missing.
os.makedirs(checkpoint_dir, exist_ok=True)
# The file name embeds the epoch number and the validation loss of the saved weights.
# NOTE(review): '{val_loss:02f}' prints six decimals (e.g. 6.836670); if two decimals were
# intended the format should be '{val_loss:.2f}' - confirm before changing file names.
filepath = checkpoint_dir + "weights-improvement-{epoch:02d}-{val_loss:02f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_weights_only=True, save_best_only=True, mode='min')

In [69]:
# Train on the pre-built batches: one step per batch, so each epoch goes through every batch once.
# Model.fit accepts Python generators directly; Model.fit_generator is deprecated in its favour.
model.fit(generator(training_clusters), steps_per_epoch=len(training_clusters), epochs=400,
          validation_data=generator(validation_clusters),
          validation_steps=len(validation_clusters), verbose=1, callbacks=[checkpoint])

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 588 steps, validate for 61 steps
Epoch 1/400
Epoch 00001: val_loss improved from inf to 6.83667, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-01-6.836670.h5
Epoch 2/400
Epoch 00002: val_loss improved from 6.83667 to 4.07904, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-02-4.079044.h5
Epoch 3/400
Epoch 00003: val_loss improved from 4.07904 to 3.67924, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-03-3.679240.h5
Epoch 4/400
Epoch 00004: val_loss improved from 3.67924 to 3.54963, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-04-3.549627.h5
Epoch 5/400
Epoch 00005: val_loss improved from 3.54963 to 2.77723, saving model to /Users/purushottamsinha/Desktop/glassdo

Epoch 30/400
Epoch 00030: val_loss did not improve from 1.63836
Epoch 31/400
Epoch 00031: val_loss did not improve from 1.63836
Epoch 32/400
Epoch 00032: val_loss did not improve from 1.63836
Epoch 33/400
Epoch 00033: val_loss did not improve from 1.63836
Epoch 34/400
Epoch 00034: val_loss did not improve from 1.63836
Epoch 35/400
Epoch 00035: val_loss did not improve from 1.63836
Epoch 36/400
Epoch 00036: val_loss did not improve from 1.63836
Epoch 37/400
Epoch 00037: val_loss did not improve from 1.63836
Epoch 38/400
Epoch 00038: val_loss did not improve from 1.63836
Epoch 39/400
Epoch 00039: val_loss did not improve from 1.63836
Epoch 40/400
Epoch 00040: val_loss did not improve from 1.63836
Epoch 41/400
Epoch 00041: val_loss improved from 1.63836 to 1.62964, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-41-1.629638.h5
Epoch 42/400
Epoch 00042: val_loss did not improve from 1.62964
Epoch 43/400
Epoch 00043: val_loss

Epoch 64/400
Epoch 00064: val_loss did not improve from 1.45730
Epoch 65/400
Epoch 00065: val_loss did not improve from 1.45730
Epoch 66/400
Epoch 00066: val_loss did not improve from 1.45730
Epoch 67/400
Epoch 00067: val_loss did not improve from 1.45730
Epoch 68/400
Epoch 00068: val_loss did not improve from 1.45730
Epoch 69/400
Epoch 00069: val_loss did not improve from 1.45730
Epoch 70/400
Epoch 00070: val_loss did not improve from 1.45730
Epoch 71/400
Epoch 00071: val_loss improved from 1.45730 to 1.45085, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-71-1.450853.h5
Epoch 72/400
Epoch 00072: val_loss improved from 1.45085 to 1.42885, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-72-1.428851.h5
Epoch 73/400
Epoch 00073: val_loss did not improve from 1.42885
Epoch 74/400
Epoch 00074: val_loss did not improve from 1.42885
Epoch 75/400
Epoch 00075: va

Epoch 99/400
Epoch 00099: val_loss did not improve from 1.40518
Epoch 100/400
Epoch 00100: val_loss did not improve from 1.40518
Epoch 101/400
Epoch 00101: val_loss did not improve from 1.40518
Epoch 102/400
Epoch 00102: val_loss did not improve from 1.40518
Epoch 103/400
Epoch 00103: val_loss did not improve from 1.40518
Epoch 104/400
Epoch 00104: val_loss did not improve from 1.40518
Epoch 105/400
Epoch 00105: val_loss did not improve from 1.40518
Epoch 106/400
Epoch 00106: val_loss did not improve from 1.40518
Epoch 107/400
Epoch 00107: val_loss did not improve from 1.40518
Epoch 108/400
Epoch 00108: val_loss did not improve from 1.40518
Epoch 109/400
Epoch 00109: val_loss did not improve from 1.40518
Epoch 110/400
Epoch 00110: val_loss did not improve from 1.40518
Epoch 111/400
Epoch 00111: val_loss did not improve from 1.40518
Epoch 112/400
Epoch 00112: val_loss did not improve from 1.40518
Epoch 113/400
Epoch 00113: val_loss improved from 1.40518 to 1.35614, saving model to /User

Epoch 136/400
Epoch 00136: val_loss did not improve from 1.35614
Epoch 137/400
Epoch 00137: val_loss did not improve from 1.35614
Epoch 138/400
Epoch 00138: val_loss did not improve from 1.35614
Epoch 139/400
Epoch 00139: val_loss did not improve from 1.35614
Epoch 140/400
Epoch 00140: val_loss improved from 1.35614 to 1.35089, saving model to /Users/purushottamsinha/Desktop/glassdoor_problem/glassdoor_assignment_models/weights-improvement-140-1.350889.h5
Epoch 141/400
Epoch 00141: val_loss did not improve from 1.35089
Epoch 142/400
Epoch 00142: val_loss did not improve from 1.35089
Epoch 143/400
Epoch 00143: val_loss did not improve from 1.35089
Epoch 144/400
Epoch 00144: val_loss did not improve from 1.35089
Epoch 145/400
Epoch 00145: val_loss did not improve from 1.35089
Epoch 146/400
Epoch 00146: val_loss did not improve from 1.35089
Epoch 147/400
Epoch 00147: val_loss did not improve from 1.35089
Epoch 148/400
Epoch 00148: val_loss did not improve from 1.35089
Epoch 149/400
Epoch 

Epoch 173/400
Epoch 00173: val_loss did not improve from 1.35089
Epoch 174/400
Epoch 00174: val_loss did not improve from 1.35089
Epoch 175/400
Epoch 00175: val_loss did not improve from 1.35089
Epoch 176/400
Epoch 00176: val_loss did not improve from 1.35089
Epoch 177/400
Epoch 00177: val_loss did not improve from 1.35089
Epoch 178/400
Epoch 00178: val_loss did not improve from 1.35089
Epoch 179/400
Epoch 00179: val_loss did not improve from 1.35089
Epoch 180/400
Epoch 00180: val_loss did not improve from 1.35089
Epoch 181/400
Epoch 00181: val_loss did not improve from 1.35089
Epoch 182/400
Epoch 00182: val_loss did not improve from 1.35089
Epoch 183/400
Epoch 00183: val_loss did not improve from 1.35089
Epoch 184/400
Epoch 00184: val_loss did not improve from 1.35089
Epoch 185/400
Epoch 00185: val_loss did not improve from 1.35089
Epoch 186/400
Epoch 00186: val_loss did not improve from 1.35089
Epoch 187/400
Epoch 00187: val_loss did not improve from 1.35089
Epoch 188/400
Epoch 00188

Epoch 00210: val_loss did not improve from 1.35089
Epoch 211/400
Epoch 00211: val_loss did not improve from 1.35089
Epoch 212/400
Epoch 00212: val_loss did not improve from 1.35089
Epoch 213/400
Epoch 00213: val_loss did not improve from 1.35089
Epoch 214/400
Epoch 00214: val_loss did not improve from 1.35089
Epoch 215/400
Epoch 00215: val_loss did not improve from 1.35089
Epoch 216/400
Epoch 00216: val_loss did not improve from 1.35089
Epoch 217/400
Epoch 00217: val_loss did not improve from 1.35089
Epoch 218/400
Epoch 00218: val_loss did not improve from 1.35089
Epoch 219/400
Epoch 00219: val_loss did not improve from 1.35089
Epoch 220/400
Epoch 00220: val_loss did not improve from 1.35089
Epoch 221/400
Epoch 00221: val_loss did not improve from 1.35089
Epoch 222/400
Epoch 00222: val_loss did not improve from 1.35089
Epoch 223/400
Epoch 00223: val_loss did not improve from 1.35089
Epoch 224/400
Epoch 00224: val_loss did not improve from 1.35089
Epoch 225/400
Epoch 00225: val_loss did

Epoch 00247: val_loss did not improve from 1.35089
Epoch 248/400
Epoch 00248: val_loss did not improve from 1.35089
Epoch 249/400
Epoch 00249: val_loss did not improve from 1.35089
Epoch 250/400
Epoch 00250: val_loss did not improve from 1.35089
Epoch 251/400
Epoch 00251: val_loss did not improve from 1.35089
Epoch 252/400
Epoch 00252: val_loss did not improve from 1.35089
Epoch 253/400
Epoch 00253: val_loss did not improve from 1.35089
Epoch 254/400
Epoch 00254: val_loss did not improve from 1.35089
Epoch 255/400
Epoch 00255: val_loss did not improve from 1.35089
Epoch 256/400
Epoch 00256: val_loss did not improve from 1.35089
Epoch 257/400
Epoch 00257: val_loss did not improve from 1.35089
Epoch 258/400
Epoch 00258: val_loss did not improve from 1.35089
Epoch 259/400
Epoch 00259: val_loss did not improve from 1.35089
Epoch 260/400
Epoch 00260: val_loss did not improve from 1.35089
Epoch 261/400
Epoch 00261: val_loss did not improve from 1.35089
Epoch 262/400
Epoch 00262: val_loss did

Epoch 00284: val_loss did not improve from 1.35089
Epoch 285/400
Epoch 00285: val_loss did not improve from 1.35089
Epoch 286/400
Epoch 00286: val_loss did not improve from 1.35089
Epoch 287/400
Epoch 00287: val_loss did not improve from 1.35089
Epoch 288/400
Epoch 00288: val_loss did not improve from 1.35089
Epoch 289/400
Epoch 00289: val_loss did not improve from 1.35089
Epoch 290/400
Epoch 00290: val_loss did not improve from 1.35089
Epoch 291/400
Epoch 00291: val_loss did not improve from 1.35089
Epoch 292/400
Epoch 00292: val_loss did not improve from 1.35089
Epoch 293/400
Epoch 00293: val_loss did not improve from 1.35089
Epoch 294/400
Epoch 00294: val_loss did not improve from 1.35089
Epoch 295/400
Epoch 00295: val_loss did not improve from 1.35089
Epoch 296/400
Epoch 00296: val_loss did not improve from 1.35089
Epoch 297/400
Epoch 00297: val_loss did not improve from 1.35089
Epoch 298/400
Epoch 00298: val_loss did not improve from 1.35089
Epoch 299/400
Epoch 00299: val_loss did

Epoch 00321: val_loss did not improve from 1.35089
Epoch 322/400
Epoch 00322: val_loss did not improve from 1.35089
Epoch 323/400
Epoch 00323: val_loss did not improve from 1.35089
Epoch 324/400
Epoch 00324: val_loss did not improve from 1.35089
Epoch 325/400
Epoch 00325: val_loss did not improve from 1.35089
Epoch 326/400
Epoch 00326: val_loss did not improve from 1.35089
Epoch 327/400
Epoch 00327: val_loss did not improve from 1.35089
Epoch 328/400
Epoch 00328: val_loss did not improve from 1.35089
Epoch 329/400
Epoch 00329: val_loss did not improve from 1.35089
Epoch 330/400
Epoch 00330: val_loss did not improve from 1.35089
Epoch 331/400
Epoch 00331: val_loss did not improve from 1.35089
Epoch 332/400
Epoch 00332: val_loss did not improve from 1.35089
Epoch 333/400
Epoch 00333: val_loss did not improve from 1.35089
Epoch 334/400
Epoch 00334: val_loss did not improve from 1.35089
Epoch 335/400
Epoch 00335: val_loss did not improve from 1.35089
Epoch 336/400
Epoch 00336: val_loss did

Epoch 00358: val_loss did not improve from 1.35089
Epoch 359/400
Epoch 00359: val_loss did not improve from 1.35089
Epoch 360/400
Epoch 00360: val_loss did not improve from 1.35089
Epoch 361/400
Epoch 00361: val_loss did not improve from 1.35089
Epoch 362/400
Epoch 00362: val_loss did not improve from 1.35089
Epoch 363/400
Epoch 00363: val_loss did not improve from 1.35089
Epoch 364/400
Epoch 00364: val_loss did not improve from 1.35089
Epoch 365/400
Epoch 00365: val_loss did not improve from 1.35089
Epoch 366/400
Epoch 00366: val_loss did not improve from 1.35089
Epoch 367/400
Epoch 00367: val_loss did not improve from 1.35089
Epoch 368/400
Epoch 00368: val_loss did not improve from 1.35089
Epoch 369/400
Epoch 00369: val_loss did not improve from 1.35089
Epoch 370/400
Epoch 00370: val_loss did not improve from 1.35089
Epoch 371/400
Epoch 00371: val_loss did not improve from 1.35089
Epoch 372/400
Epoch 00372: val_loss did not improve from 1.35089
Epoch 373/400
Epoch 00373: val_loss did

Epoch 00395: val_loss did not improve from 1.35089
Epoch 396/400
Epoch 00396: val_loss did not improve from 1.35089
Epoch 397/400
Epoch 00397: val_loss did not improve from 1.35089
Epoch 398/400
Epoch 00398: val_loss did not improve from 1.35089
Epoch 399/400
Epoch 00399: val_loss did not improve from 1.35089
Epoch 400/400
Epoch 00400: val_loss did not improve from 1.35089


<tensorflow.python.keras.callbacks.History at 0x13dcb5cc0>

In [70]:
# Inspect one training example: a (sentence, label_vector) pair
training_data[1]

('company ha pretty prevalent good old boy culture',
 array([0., 0., 0., 0., 0., 8., 0., 0.]))