In [1]:
import pandas as pd
import numpy as np
import re
from string import punctuation
from sklearn.preprocessing import MultiLabelBinarizer
import matplotlib.pyplot as plt

In [2]:
# Fixed seed so the shuffle (and therefore the train/validation/test
# membership) is the same on every Restart & Run All.
SEED = 42

data = pd.read_csv("./data_twitter/tweets_train.csv", encoding='utf-8')

# Shuffle every row once, then cut at 60% and 80% to obtain a 60/20/20 split.
_shuffled = data.sample(frac=1, random_state=SEED)
_train_end = int(.6 * len(data))
_val_end = int(.8 * len(data))
train = _shuffled.iloc[:_train_end]
validate = _shuffled.iloc[_train_end:_val_end]
test = _shuffled.iloc[_val_end:]

In [3]:
# Sanity check: row/column counts of each split and the column names.
# Single-argument print(...) produces the same text under Python 2 and 3.
print(train.shape)
print(validate.shape)
print(test.shape)
print(train.columns)

(293159, 4)
(97720, 4)
(97720, 4)
Index([u'tweet_id', u'text', u'label', u'emoji'], dtype='object')


In [5]:
# Build the word -> vector lookup from the GloVe Twitter text file.
# Each line is "<token> v1 v2 ... vN" separated by whitespace.
embeddings = {}
with open('./glove.twitter.27B/glove.twitter.27B.200d.txt', 'r') as glove_file:
    for row in glove_file:
        fields = row.split()
        # Angle brackets are replaced by spaces, so a token such as
        # "<user>" is stored under the key " user " (with the spaces).
        token = fields[0].replace('<', ' ').replace('>', ' ')
        # float16 keeps the in-memory table small.
        embeddings[token] = np.array(fields[1:], dtype=np.float16)

In [6]:
def _tweets_preprocess(tweets):
    tweets_clean=np.zeros_like(tweets)
    for i,tweet in enumerate(tweets):
        p = re.compile(r'(\n)|(\r)|(\t)|(\')|(\u00A9)|([!"#$%&()*+,-./:;<=>?@\[\\\]^_`{|}~])', re.IGNORECASE)
        tweet = re.sub('\?', ' ', tweet)
        tweet = re.sub('\.', ' ', tweet)
        tweet = re.sub(',', ' ', tweet)
        tweet = re.sub('!', ' ', tweet)
        tweet = re.sub(' +',' ', tweet)
        tweet = re.sub(p," ",tweet)
        tweet= tweet.encode('ascii','ignore')

        tweet_clean = [ wd.strip(punctuation).lower() for wd in tweet.split() \
                    if not wd.startswith('@') and not wd.startswith('#') and not wd == 'rt']
        #wd[0].isupper()
        
        tweets_clean[i]= np.array(tweet_clean)
    return tweets_clean

In [7]:
# Tokenise the 'text' column of each split (same order: train, validate, test).
X_train, X_val, X_test = (
    _tweets_preprocess(part['text'].values) for part in (train, validate, test)
)
# Raw 'label' column of each split, aligned with the token arrays above.
y_train, y_val, y_test = (
    part['label'].values for part in (train, validate, test)
)

In [8]:
def _calc_embeddings(X):
    lstm_embeddings= np.zeros((X.shape[0],20,200),dtype=np.float16)
    for i,tweet in enumerate(X):
        k=0
        for j,word in enumerate(tweet):
            try:
                lstm_embeddings[i,j-k]= embeddings[word.lower().strip()]
            except:
                k+=1
                continue
    return lstm_embeddings

In [9]:
embeddings_train= _calc_embeddings(X_train)
embeddings_val= _calc_embeddings(X_val)
embeddings_test= _calc_embeddings(X_test)

In [10]:
# Binarise the labels. Each scalar label is wrapped into a one-element row
# (column vector) because MultiLabelBinarizer expects a collection of labels
# per sample. The encoder is fitted on the training labels only and then
# applied to all three splits.
encoder = MultiLabelBinarizer()
encoder.fit(y_train[:, None])
y_train, y_val, y_test = (
    encoder.transform(labels[:, None]) for labels in (y_train, y_val, y_test)
)

In [11]:
# Confirm the feature tensor and the encoded labels agree on the sample count.
# Single-argument print(...) produces the same text under Python 2 and 3.
print(embeddings_train.shape)
print(y_train.shape)

(293159, 20, 200)
(293159, 20)


In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation,LSTM
from keras import backend as K

Using TensorFlow backend.


In [13]:
def f1(y_true, y_pred):
    """Batch-wise F1 score usable as a Keras metric.

    Predictions and targets are clipped to [0, 1] and rounded, then
    precision and recall are computed over the whole batch and combined
    into their harmonic mean. Because it is evaluated per batch, the value
    Keras reports per epoch is an average of batch scores.

    ``K.epsilon()` is added to every denominator, including the final
    ``precision + recall`` one, so a batch with no true positives yields 0
    instead of NaN.

    Parameters
    ----------
    y_true : tensor
        Ground-truth indicator tensor.
    y_pred : tensor
        Predicted scores, same shape as ``y_true``.

    Returns
    -------
    tensor
        Scalar F1 value for the batch.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # Fraction of predicted positives that are correct.
    precision = true_positives / (predicted_positives + K.epsilon())
    # Fraction of actual positives that were found.
    recall = true_positives / (possible_positives + K.epsilon())

    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


In [21]:
def LSTMmodel():
    """Build, summarise and compile the stacked-LSTM classifier.

    Architecture: LSTM(8) returning the full sequence -> LSTM(4) returning
    only its last output -> Dense(20) with softmax. The input is a
    variable-length sequence of 200-dimensional vectors. ``model.summary()``
    is printed as a side effect.

    Returns
    -------
    keras.models.Sequential
        Model compiled with Adam, categorical cross-entropy, and the ``f1``
        and accuracy metrics.
    """
    layers = [
        LSTM(8, input_shape=[None, 200], return_sequences=True),
        LSTM(4, return_sequences=False),
        Dense(20, activation='softmax'),
    ]
    model = Sequential(layers)
    model.summary()
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=[f1, 'accuracy'],
    )
    return model
    

In [22]:
# Build a fresh model and train it, evaluating on the validation split after
# every epoch. `res` is the object returned by fit().
model = LSTMmodel()
res = model.fit(
    x=embeddings_train,
    y=y_train,
    validation_data=(embeddings_val, y_val),
    epochs=50,
    batch_size=1024,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, None, 8)           6688      
_________________________________________________________________
lstm_6 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dense_4 (Dense)              (None, 20)                100       
Total params: 6,996
Trainable params: 6,996
Non-trainable params: 0
_________________________________________________________________
Train on 293159 samples, validate on 97720 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50

KeyboardInterrupt: 

In [15]:
# Write the current `model` to lstm_model.h5 in the working directory.
model.save('lstm_model.h5')

In [16]:
from keras.models import load_model

In [18]:
# Reload the saved file into `modl`; the custom `f1` metric is handed over via
# custom_objects so it can be resolved by name when the model is rebuilt.
modl= load_model('lstm_model.h5', custom_objects={'f1': f1})

In [None]:
# Alias the object returned by model.fit() for the plotting cell below.
# NOTE(review): the recorded run of the fit cell ended in KeyboardInterrupt,
# in which case `res` is never assigned — confirm fit completed before running this.
history= res

In [None]:
print(history.history.keys())


def _plot_metric(history, metric, title):
    """Plot the per-epoch training and validation curves of one metric.

    Parameters
    ----------
    history : object with a ``history`` dict
        Result of ``model.fit``; must contain ``metric`` and ``'val_' + metric``.
    metric : str
        Key of the training series (also used as the y-axis label).
    title : str
        Figure title.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    # The 'val_*' series come from the validation_data passed to fit(), not
    # from the held-out test split, so label it accordingly.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


# summarize history for f1 score
_plot_metric(history, 'f1', 'model f1 score')
# summarize history for loss
_plot_metric(history, 'loss', 'model loss')