In [3]:
from keras.models import Sequential
from keras.layers import Embedding, Merge, Activation, Dense
from keras.layers.recurrent import LSTM

In [4]:
import numpy as np
from DatasetBuilder import DatasetBuilder

In [5]:
# Read data

# Field names handed to DatasetBuilder.
FLDS = ['track_id', 'artist', 'title', 'similars', 'lyrics_url', 'sample_url']

# NOTE(review): the builder receives absolute Windows directories while the
# split files below are read from '../data' relative paths -- confirm both
# locations exist on the machine running this notebook.
DATASET = DatasetBuilder(
    True,
    'D:\\PROJECT\\data\\train_data', 'D:\\PROJECT\\data\\train_lyrics',
    'D:\\PROJECT\\data\\test_data', 'D:\\PROJECT\\data\\test_lyrics',
    FLDS, 0.5)

# Load the four split files in a fixed order (test before train, .neg before .pos).
# The two booleans presumably mean (is_train, is_positive), judging by the
# file names -- TODO confirm against DatasetBuilder.load_from_file.
for _load_args in (
        ('../data/db_test.neg', False, False),
        ('../data/db_test.pos', False, True),
        ('../data/db_train.neg', True, False),
        ('../data/db_train.pos', True, True)):
    DATASET.load_from_file(*_load_args)
DATASET.info()

# Alternative path kept for reference: read the data and dump it to '../data/db'.
#DATASET.read_data(False, 1000)
#DATASET.read_data(True, 10000)
#DATASET.dump('../data/db')
#DATASET.info()


def _gather_field(split, field):
    """Return `field` of every item in split['pos'], followed by split['neg']."""
    return [entry[field] for label in ('pos', 'neg') for entry in split[label]]


# Join all text data into 2 lists to build vocabulary out of them.
# Positives always come first, so the three lists of a split stay index-aligned.
TRAIN_DATA_1 = _gather_field(DATASET.train_data, 'data_1')
TRAIN_DATA_2 = _gather_field(DATASET.train_data, 'data_2')
TRAIN_SCORES = _gather_field(DATASET.train_data, 'score')

TEST_DATA_1 = _gather_field(DATASET.test_data, 'data_1')
TEST_DATA_2 = _gather_field(DATASET.test_data, 'data_2')
TEST_SCORES = _gather_field(DATASET.test_data, 'score')

DataBuilder initialized.
Dataset Info
> Train set:
> > Positive: 10000
> > Negative: 10000
> Test set:
> > Positive: 1000
> > Negative: 1000


In [6]:
from keras.preprocessing import text
from nltk.stem import SnowballStemmer
import string

# Module-level token -> occurrence count table, filled by unique_recursive_len.
word_dictionary = {}


def unique_recursive_len(item):
    """Count every element of every sequence in `item` into `word_dictionary`.

    `item` is an iterable of iterables (e.g. a list of token lists). Counts
    accumulate in the module-level `word_dictionary`, so repeated calls keep
    adding to the same table.

    Returns the number of distinct keys in `word_dictionary` after the update.
    """
    for tokens in item:
        for token in tokens:
            # .get() supplies 0 for a token seen for the first time.
            word_dictionary[token] = word_dictionary.get(token, 0) + 1
    return len(word_dictionary)

def wordIsPrintable(word):
    """Return 1 if every character of `word` is in string.printable, else 0.

    An empty `word` yields 1, since there is no offending character.
    """
    allowed = string.printable
    # any() stops at the first character outside the printable set.
    return 0 if any(ch not in allowed for ch in word) else 1

stemmer = SnowballStemmer("english")


def _stem_documents(documents):
    """Tokenise each document (lower-cased, split on spaces) and stem every token."""
    stemmed_documents = []
    for document in documents:
        tokens = text.text_to_word_sequence(document, lower=True, split=" ")
        stemmed_documents.append([stemmer.stem(token) for token in tokens])
    return stemmed_documents


TRAIN_DATA_1_ws = _stem_documents(TRAIN_DATA_1)
TRAIN_DATA_2_ws = _stem_documents(TRAIN_DATA_2)
TEST_DATA_1_ws = _stem_documents(TEST_DATA_1)
TEST_DATA_2_ws = _stem_documents(TEST_DATA_2)

# Count stems over train and test together; the call also fills word_dictionary.
number_of_words = unique_recursive_len(
    TRAIN_DATA_1_ws + TRAIN_DATA_2_ws + TEST_DATA_1_ws + TEST_DATA_2_ws)
print(number_of_words)

58403


In [7]:
# Stem every key of word_dictionary once more and merge the counts of keys
# that end up on the same stem. The keys were already produced by
# stemmer.stem in the previous cell, so any difference between the two
# printed sizes comes from a second stemming pass changing some keys.
stemmed_dict = {}

for word, count in word_dictionary.items():
    stemmed_word = stemmer.stem(word)
    stemmed_dict[stemmed_word] = stemmed_dict.get(stemmed_word, 0) + count
print(len(word_dictionary), len(stemmed_dict))

58403 57786


In [8]:
# Size argument passed to text.one_hot when the lyrics are encoded below.
# NOTE(review): the stem count printed earlier (58403) exceeds this value, so
# distinct words presumably share ids -- confirm that is acceptable.
VOCAB_SIZE = 40000

In [9]:
def _encode_documents(documents):
    """Turn each raw document into a list of integer ids via text.one_hot."""
    return [text.one_hot(document, VOCAB_SIZE, lower=True, split=" ")
            for document in documents]


# The raw (unstemmed) texts are encoded here, not the *_ws token lists.
TRAIN_DATA_1_hot = _encode_documents(TRAIN_DATA_1)
TRAIN_DATA_2_hot = _encode_documents(TRAIN_DATA_2)
TEST_DATA_1_hot = _encode_documents(TEST_DATA_1)
TEST_DATA_2_hot = _encode_documents(TEST_DATA_2)

In [12]:
from keras.preprocessing import sequence

# Length every encoded document is brought to by pad_sequences.
SEQUENCE_LENGTH = 200

# Rebind all four id collections to their padded versions (same names, same order).
(TRAIN_DATA_1_hot,
 TRAIN_DATA_2_hot,
 TEST_DATA_1_hot,
 TEST_DATA_2_hot) = [
    sequence.pad_sequences(encoded_docs, maxlen=SEQUENCE_LENGTH)
    for encoded_docs in (TRAIN_DATA_1_hot, TRAIN_DATA_2_hot,
                         TEST_DATA_1_hot, TEST_DATA_2_hot)
]

In [14]:
# Have to change input shape (nb_samples, timesteps, input_dim):
# each padded id becomes a timestep with a single feature.
(TRAIN_DATA_1_hot,
 TRAIN_DATA_2_hot,
 TEST_DATA_1_hot,
 TEST_DATA_2_hot) = [
    padded.reshape(len(padded), SEQUENCE_LENGTH, 1)
    for padded in (TRAIN_DATA_1_hot, TRAIN_DATA_2_hot,
                   TEST_DATA_1_hot, TEST_DATA_2_hot)
]

In [17]:
EMBED_VEC_LEN = 128  # only referenced by the disabled Embedding layer below
output_dim = 10      # number of LSTM units in each branch


def _build_lstm_branch():
    """Return a Sequential holding one LSTM over (SEQUENCE_LENGTH, 1) inputs."""
    branch = Sequential()
    # Embedding is currently disabled:
    #branch.add(Embedding(VOCAB_SIZE, EMBED_VEC_LEN, input_length=SEQUENCE_LENGTH))
    # NOTE(review): without it, each timestep's only feature is the integer id
    # produced by text.one_hot. Re-enabling Embedding would need 2-D inputs, so
    # the reshape cell and the evaluation inputs would have to change with it.
    branch.add(LSTM(output_dim, input_shape=(SEQUENCE_LENGTH, 1)))
    return branch


# One branch per text of a pair; both have the same configuration.
first_model = _build_lstm_branch()
second_model = _build_lstm_branch()

# Sum the two branch outputs, then map them to one sigmoid unit for the binary score.
model = Sequential()
model.add(Merge(layers=[first_model, second_model], mode='sum'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
model.fit([TRAIN_DATA_1_hot, TRAIN_DATA_2_hot], TRAIN_SCORES, nb_epoch=10, batch_size=16)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_3 (LSTM)                    (None, 10)            480                                          
____________________________________________________________________________________________________
lstm_4 (LSTM)                    (None, 10)            480                                          
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1)             11          merge_2[0][0]                    
____________________________________________________________________________________________________
activation_2 (Activation)        (None, 1)             0           dense_2[0][0]                    
Total params: 971
_________________________________________________________________________

<keras.callbacks.History at 0x3fdc0828>

In [18]:
# Evaluate on the held-out pairs. The model was compiled with
# metrics=['accuracy'], and scores[1] is the value reported as accuracy here.
test_inputs = [TEST_DATA_1_hot, TEST_DATA_2_hot]
scores = model.evaluate(test_inputs, TEST_SCORES, verbose=1)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % (accuracy_percent))

Accuracy: 54.10%
