In [3]:
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Activation, Dense, merge
from keras.layers.recurrent import LSTM

In [4]:
import numpy as np
from DatasetBuilder import DatasetBuilder

In [5]:
# Read data

FLDS = ['track_id', 'artist', 'title', 'similars', 'lyrics_url', 'sample_url']
DATASET = DatasetBuilder(
    True,
    'D:\\PROJECT\\data\\train_data', 'D:\\PROJECT\\data\\train_lyrics',
    'D:\\PROJECT\\data\\test_data', 'D:\\PROJECT\\data\\test_lyrics',
    FLDS, 0.5
)

# Restore the previously dumped splits. The two boolean flags are forwarded
# unchanged; presumably they mean (is_train, is_positive) - confirm against
# DatasetBuilder.load_from_file.
for dump_path, flag_a, flag_b in (
        ('../data/db_test.neg', False, False),
        ('../data/db_test.pos', False, True),
        ('../data/db_train.neg', True, False),
        ('../data/db_train.pos', True, True)):
    DATASET.load_from_file(dump_path, flag_a, flag_b)
DATASET.info()

# Kept for reference: how the dump files loaded above were produced.
#DATASET.read_data(False, 1000)
#DATASET.read_data(True, 10000)
#DATASET.dump('../data/db')
#DATASET.info()


def _collect(split, field):
    """Return `field` of every 'pos' item followed by every 'neg' item of `split`."""
    return [data_item[field]
            for polarity in ('pos', 'neg')
            for data_item in split[polarity]]


# Flatten each split into parallel lists (positives first, then negatives):
# the two text columns and the score column.
TRAIN_DATA_1 = _collect(DATASET.train_data, 'data_1')
TRAIN_DATA_2 = _collect(DATASET.train_data, 'data_2')
TRAIN_SCORES = _collect(DATASET.train_data, 'score')

TEST_DATA_1 = _collect(DATASET.test_data, 'data_1')
TEST_DATA_2 = _collect(DATASET.test_data, 'data_2')
TEST_SCORES = _collect(DATASET.test_data, 'score')

DataBuilder initialized.
Dataset Info
> Train set:
> > Positive: 10000
> > Negative: 10000
> Test set:
> > Positive: 1000
> > Negative: 1000


In [6]:
from keras.preprocessing import text
from nltk.stem import SnowballStemmer
import string

# Module-level tally of element -> number of occurrences, filled by
# unique_recursive_len() and read by later cells.
word_dictionary = {}


def unique_recursive_len(item):
    """Count every element of every sequence in `item` into `word_dictionary`.

    The counts accumulate in the module-level `word_dictionary`, so calling
    this function again adds to the tally from previous calls.

    Args:
        item: iterable of iterables of hashable elements.

    Returns:
        The number of distinct keys in `word_dictionary` after the update.
    """
    for token_list in item:
        for token in token_list:
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    return len(word_dictionary)

def wordIsPrintable(word):
    """Tell whether every item of `word` is contained in `string.printable`.

    Args:
        word: iterable of characters (typically a str).

    Returns:
        1 if no item falls outside `string.printable` (including the empty
        input), otherwise 0.
    """
    has_unprintable = any(c not in string.printable for c in word)
    return 0 if has_unprintable else 1

stemmer = SnowballStemmer("english")


def _stem_texts(texts):
    """Split each text into lower-cased words and stem every word."""
    stemmed_texts = []
    for t in texts:
        words = text.text_to_word_sequence(t, lower=True, split=" ")
        stemmed_texts.append([stemmer.stem(word) for word in words])
    return stemmed_texts


TRAIN_DATA_1_ws = _stem_texts(TRAIN_DATA_1)
TRAIN_DATA_2_ws = _stem_texts(TRAIN_DATA_2)
TEST_DATA_1_ws = _stem_texts(TEST_DATA_1)
TEST_DATA_2_ws = _stem_texts(TEST_DATA_2)

# Count distinct stems over all four corpora (train and test, both columns).
number_of_words = unique_recursive_len(
    TRAIN_DATA_1_ws + TRAIN_DATA_2_ws + TEST_DATA_1_ws + TEST_DATA_2_ws
)
print(number_of_words)

58403


In [7]:
stemmed_dict = {}

# Stem every key of word_dictionary once more and merge the counts of keys
# that end up with the same stem.
for word, count in word_dictionary.items():
    stemmed_word = stemmer.stem(word)
    stemmed_dict[stemmed_word] = stemmed_dict.get(stemmed_word, 0) + count

# Vocabulary size before vs. after the extra stemming pass.
print(len(word_dictionary), len(stemmed_dict))

58403 57786


In [8]:
# Size passed as `n` to text.one_hot when encoding the texts and as
# `input_dim` to the Embedding layers.
# NOTE(review): this is smaller than the distinct-stem count printed earlier
# in the notebook, so different words presumably share an index - confirm
# that this is intended.
VOCAB_SIZE = 25000

In [15]:
def _encode_texts(texts):
    """Encode each text as a list of integer word indices via text.one_hot."""
    # NOTE(review): if text.one_hot relies on Python's built-in hash(), the
    # indices differ between interpreter sessions unless PYTHONHASHSEED is
    # fixed, which would invalidate saved weights - confirm for the installed
    # Keras version.
    return [text.one_hot(t, VOCAB_SIZE, lower=True, split=" ") for t in texts]


TRAIN_DATA_1_hot = _encode_texts(TRAIN_DATA_1)
TRAIN_DATA_2_hot = _encode_texts(TRAIN_DATA_2)
TEST_DATA_1_hot = _encode_texts(TEST_DATA_1)
TEST_DATA_2_hot = _encode_texts(TEST_DATA_2)

In [16]:
from keras.preprocessing import sequence

# Every encoded text is brought to exactly this many word indices.
SEQUENCE_LENGTH = 200

# Pad/truncate all four corpora with the same maxlen; the results replace the
# variable-length index lists under the same names.
TRAIN_DATA_1_hot, TRAIN_DATA_2_hot, TEST_DATA_1_hot, TEST_DATA_2_hot = [
    sequence.pad_sequences(encoded, maxlen=SEQUENCE_LENGTH)
    for encoded in (TRAIN_DATA_1_hot, TRAIN_DATA_2_hot,
                    TEST_DATA_1_hot, TEST_DATA_2_hot)
]

In [11]:
# A recurrent layer fed directly with word indices needs 3-D input of shape
# (nb_samples, timesteps, input_dim).
#
# The 3-D views are stored under new *_seq names instead of overwriting the
# *_hot arrays: the Embedding-based model later in the notebook declares
# Input(shape=(SEQUENCE_LENGTH,)) and therefore needs the 2-D *_hot arrays.
# Overwriting them here made the notebook fail when run top to bottom.
TRAIN_DATA_1_seq = TRAIN_DATA_1_hot.reshape(len(TRAIN_DATA_1_hot), SEQUENCE_LENGTH, 1)
TRAIN_DATA_2_seq = TRAIN_DATA_2_hot.reshape(len(TRAIN_DATA_2_hot), SEQUENCE_LENGTH, 1)
TEST_DATA_1_seq = TEST_DATA_1_hot.reshape(len(TEST_DATA_1_hot), SEQUENCE_LENGTH, 1)
TEST_DATA_2_seq = TEST_DATA_2_hot.reshape(len(TEST_DATA_2_hot), SEQUENCE_LENGTH, 1)

In [None]:
# Siamese LSTM fed directly with word indices (no Embedding layer).

EMBED_VEC_LEN = 128  # not used by this model; kept so the name stays defined
output_dim = 30

# Only the first `length` training samples are used for fitting.
length = 15000

# Each sample is a sequence of SEQUENCE_LENGTH timesteps with one feature
# (the word index) per timestep.
input_data_1 = Input(shape=(SEQUENCE_LENGTH, 1))
input_data_2 = Input(shape=(SEQUENCE_LENGTH, 1))

# A single LSTM instance applied to both inputs, so both branches share
# weights. The input shape is taken from the tensors it is called on; the
# previous input_shape=(EMBED_VEC_LEN, 1) did not match them.
shared_lstm = LSTM(output_dim)

encoded_data_1 = shared_lstm(input_data_1)
# Fixed: the second branch used to encode input_data_1 again.
encoded_data_2 = shared_lstm(input_data_2)

merged_vector = merge([encoded_data_1, encoded_data_2], mode='concat', concat_axis=-1)

predictions = Dense(1, activation='sigmoid')(merged_vector)

# Fixed: the model inputs must be the Input tensors defined above; the
# previous code referenced embed_1/embed_2, which are not defined until a
# later cell and are not Input tensors.
model = Model(input=[input_data_1, input_data_2], output=predictions)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() added a stray "None".
model.summary()

# Reshape locally to (nb_samples, timesteps, 1) so this cell works whether the
# *_hot arrays are currently 2-D or already 3-D.
train_x1 = np.asarray(TRAIN_DATA_1_hot[:length]).reshape(-1, SEQUENCE_LENGTH, 1)
train_x2 = np.asarray(TRAIN_DATA_2_hot[:length]).reshape(-1, SEQUENCE_LENGTH, 1)
# Targets go in as one array; a plain Python list is treated by Keras as a
# list of separate output arrays.
train_y = np.asarray(TRAIN_SCORES[:length])

model.fit([train_x1, train_x2], train_y, nb_epoch=50, batch_size=8)

#first_model = Sequential()
#first_model.add(Embedding(VOCAB_SIZE, EMBED_VEC_LEN, input_length=SEQUENCE_LENGTH))
#first_model.add(lstm)

#second_model = Sequential()
#second_model.add(Embedding(VOCAB_SIZE, EMBED_VEC_LEN, input_length=SEQUENCE_LENGTH))
#second_model.add(lstm)

#model = Sequential()
#model.add(Merge(layers=[first_model, second_model], mode='concat'))
#model.add(Dense(1))
#model.add(Activation('sigmoid'))
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#print(model.summary())
#model.fit([TRAIN_DATA_1_hot[:15000], TRAIN_DATA_2_hot[:15000]], TRAIN_SCORES[:15000], nb_epoch=50, batch_size=8)

In [40]:
# Siamese model: per-branch Embedding (with dropout) followed by one shared LSTM.

EMBED_VEC_LEN = 32  # size of each word vector produced by the Embedding layers
output_dim = 50     # size of the LSTM encoding of each text

# Each input is a padded sequence of SEQUENCE_LENGTH integer word indices.
input_data_1 = Input(shape=(SEQUENCE_LENGTH,))
input_data_2 = Input(shape=(SEQUENCE_LENGTH,))

# Two independent Embedding layers: each branch learns its own word vectors.
embed_1 = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_VEC_LEN, input_length=SEQUENCE_LENGTH, dropout=0.5)(input_data_1)
embed_2 = Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_VEC_LEN, input_length=SEQUENCE_LENGTH, dropout=0.5)(input_data_2)

# One LSTM instance applied to both embeddings, so the encoder weights are
# shared. Its input shape comes from the embedding outputs
# (SEQUENCE_LENGTH, EMBED_VEC_LEN); the previous input_shape=(EMBED_VEC_LEN, 1)
# described a different shape and has been removed.
lstm = LSTM(output_dim)

encoded_1 = lstm(embed_1)
encoded_2 = lstm(embed_2)

merged_vector = merge([encoded_1, encoded_2], mode='concat', concat_axis=-1)

# Single sigmoid unit: binary prediction from the concatenated encodings.
predictions = Dense(1, activation='sigmoid')(merged_vector)

model = Model(input=[input_data_1, input_data_2], output=predictions)

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() added a stray "None".
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_15 (InputLayer)            (None, 200)           0                                            
____________________________________________________________________________________________________
input_16 (InputLayer)            (None, 200)           0                                            
____________________________________________________________________________________________________
embedding_15 (Embedding)         (None, 200, 32)       800000      input_15[0][0]                   
____________________________________________________________________________________________________
embedding_16 (Embedding)         (None, 200, 32)       800000      input_16[0][0]                   
___________________________________________________________________________________________

In [48]:
# Sanity check: the Embedding model's inputs are declared as
# Input(shape=(SEQUENCE_LENGTH,)), so this array must be 2-D:
# (samples, SEQUENCE_LENGTH).
TRAIN_DATA_1_hot.shape

(20000, 200)

In [54]:
from random import shuffle

# Shuffle the training samples with a fixed seed so that the permutation, and
# everything derived from the shuffled arrays, is reproducible across kernel
# restarts.
SHUFFLE_SEED = 42
index_shuf = list(range(len(TRAIN_SCORES)))
np.random.RandomState(SHUFFLE_SEED).shuffle(index_shuf)

# Applying the same permutation to both inputs and the labels keeps the three
# arrays aligned. Indexing with the permutation returns new arrays with the
# original shapes, so no per-row rebuild or reshape is needed.
TRAIN_DATA_1_hot_shuf = TRAIN_DATA_1_hot[index_shuf]
TRAIN_DATA_2_hot_shuf = TRAIN_DATA_2_hot[index_shuf]
TRAIN_SCORES_shuf = np.asarray(TRAIN_SCORES)[index_shuf]

In [55]:
# The first THRESHOLD shuffled training samples are set aside for evaluation;
# the fit cell trains on the remainder.
THRESHOLD = 2000
TEST_D1, TEST_D2, TEST_S = [
    shuffled[:THRESHOLD]
    for shuffled in (TRAIN_DATA_1_hot_shuf, TRAIN_DATA_2_hot_shuf, TRAIN_SCORES_shuf)
]

In [57]:
# Train on every shuffled sample after the first THRESHOLD ones.
# NOTE(review): the recorded output of this cell lists 5 epochs although
# nb_epoch is 10 here - the output looks stale; re-run to refresh it.
length = len(TRAIN_SCORES)
fit_inputs = [
    TRAIN_DATA_1_hot_shuf[THRESHOLD:length],
    TRAIN_DATA_2_hot_shuf[THRESHOLD:length],
]
fit_targets = TRAIN_SCORES_shuf[THRESHOLD:length]
model.fit(fit_inputs, fit_targets, nb_epoch=10, batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x60a4d550>

In [59]:
# Evaluate on the held-out slice; the model was compiled with
# metrics=['accuracy'], and index 1 of the result is reported as accuracy.
scores = model.evaluate([TEST_D1, TEST_D2], TEST_S, verbose=1)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 95.35%


In [137]:
def save_model(model, name, folder=''):
    """Write a model's architecture (JSON) and weights (HDF5) to disk.

    Args:
        model: object exposing `to_json()` and `save_weights(path)`.
        name: base file name without extension.
        folder: target directory. The default '' writes to the current
            working directory. A trailing path separator is optional, and the
            directory is created if it does not exist.

    Returns:
        dict with the paths written: 'model_file' (name + '.json') and
        'weights_file' (name + '.h5').
    """
    import os  # local import keeps this cell self-contained

    if folder:
        os.makedirs(folder, exist_ok=True)

    # os.path.join inserts the separator only when it is missing; plain
    # `folder + name` fused the two when `folder` had no trailing separator.
    base_path = os.path.join(folder, name)
    model_file = base_path + '.json'
    weights_file = base_path + '.h5'

    with open(model_file, 'w') as json_file:
        json_file.write(model.to_json())
    model.save_weights(weights_file)

    return {
        'model_file': model_file,
        'weights_file': weights_file
    }

In [138]:
from keras.models import model_from_json
def load_model(model_file, weights_file):
    """Rebuild a model from its JSON architecture file and load its weights.

    Args:
        model_file: path of the JSON file holding the architecture.
        weights_file: path of the weights file passed to `load_weights`.

    Returns:
        The object returned by `model_from_json`, after `load_weights` was
        called on it. It is not compiled here.
    """
    with open(model_file, 'r') as architecture_file:
        architecture_json = architecture_file.read()

    restored_model = model_from_json(architecture_json)
    restored_model.load_weights(weights_file)
    return restored_model

In [139]:
# Persist the current `model`; with the default folder the two files land in
# the current working directory. The returned dict of paths is the cell output.
save_model(model, 'model_with_embed_and_dropout')

{'model_file': 'model_with_embed_and_dropout.json',
 'weights_file': 'model_with_embed_and_dropout.h5'}