## Import libraries

In [1]:
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


## Setup some configurational parameters

In [2]:
MAX_NB_WORDS=200000
MAX_SEQUENCE_LENGTH=50
VALIDATION_SPLIT = .2
GLOVE_DIR="../glove.6B"

In [3]:
with open("training.1600000.processed.noemoticon.csv", encoding = "ISO-8859-1" ) as f:
    li=f.readlines()

In [4]:
li[0]

'"0","1467810369","Mon Apr 06 22:19:45 PDT 2009","NO_QUERY","_TheSpecialOne_","@switchfoot http://twitpic.com/2y1zl - Awww, that\'s a bummer.  You shoulda got David Carr of Third Day to do it. ;D"\n'

In [5]:
texts = []  # list of text samples
labels = []  # list of label ids


for row in li: 
    row = row.replace('"',"").strip().split(",")
    texts.append(row[-1])
    if int(row[0])==4:
        label = 1
    elif int(row[0]) == 2:
        label = 1
    else:
        label =0
    labels.append(label)

print('Found %s texts.' % len(texts))

Found 1600000 texts.


In [6]:
labels.count(1)

800000

In [7]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS,filters="")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 1010342 unique tokens.


In [8]:
embeddings_index = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [9]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# data = np.array(sequences)
labels_1 = to_categorical(np.asarray(labels),num_classes=2)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels_1.shape)

Shape of data tensor: (1600000, 50)
Shape of label tensor: (1600000, 2)


index_word = dict(zip(word_index.values(),word_index.keys()))

new_data = []
for data_row in data[:1000]:
    new_row = []
    for word_idx in data_row:
        new_word = embeddings_index.get(index_word[word_idx])
#         print(type(new_word))
        if  type(new_word).__module__== 'numpy':
            new_row.append(new_word)
        else:
            new_row.append([0]*100)
    new_data.append(np.array(new_row))
# new_data = np.array(new_data)

new_data_2 = list(map( lambda y: np.array(list(map(lambda x: list(x), y))), new_data))

new_data_2 = np.array(new_data_2,dtype=object)

In [10]:
EMBEDDING_DIM=50
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


In [11]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels_1 = labels_1[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels_1[:-nb_validation_samples]
x_val = data[-nb_validation_samples:-int(nb_validation_samples/2)]
y_val = labels_1[-nb_validation_samples:-int(nb_validation_samples/2)]
x_test = data[-int(nb_validation_samples/2):]
y_test = labels_1[-int(nb_validation_samples/2):]

In [19]:
from keras.layers import Embedding,Input, Conv1D, MaxPooling1D, Dense, Flatten, Reshape, Dropout, LSTM, Activation
from keras.models import Model, Sequential


embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
model_1 = Model(sequence_input, embedded_sequences)
model_1.summary()
model = Sequential()
model.add(model_1)
model.add(Conv1D(64, 5, activation='relu',input_shape=(None, 500)))

model.add(Dropout(0.25))
model.add(LSTM(64))
model.add(Dense(50))
model.add(Dense(2,activation="softmax"))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 50, 50)            50517150  
Total params: 50,517,150
Trainable params: 50,517,150
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
model_3 (Model)              (None, 50, 50)            50517150  
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 46, 64)            16064     
_________________________________________________________________
dropout_3 (Dropout)          (None, 46, 64)            0         
________________________________________________________________

### Load weights 

In [27]:
from keras.models import model_from_json
json_file = open('model_tweets_new.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model`
model.load_weights("model_tweets_new.h5")

In [28]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [29]:
# fitting the data
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=6, batch_size=1280)

Train on 1280000 samples, validate on 160000 samples
Epoch 1/6
   2560/1280000 [..............................] - ETA: 7043s - loss: 0.2839 - acc: 0.8770

KeyboardInterrupt: 

In [22]:
score = model.evaluate(x_test,y_test, batch_size=1280)
print("Loss: "+str(score[0]))
print("Accuracy: "+str(score[1]))

Loss: 0.5097347134709358
Accuracy: 0.7999625


### Check Custom Sentences

### Save model

In [24]:
model_json = model.to_json()
with open("model_tweets_new.json", "w") as json_file:
    json_file.write(model_json)

In [25]:
model.save_weights("model_tweets_new.h5")