## Import packages

In [1]:
%matplotlib inline
from __future__ import print_function

import sys
import pandas as pd
import numpy as np

from sklearn.manifold import TSNE

from keras.layers import Input, Embedding, Dense, GRU, LSTM, Dropout, Reshape, Merge, Activation, Bidirectional
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

Using Theano backend.


## Download data files

In [None]:
! wget -q http://nlp.stanford.edu/data/glove.6B.zip
! unzip glove.6B.zip
! wget -q https://raw.githubusercontent.com/uclmr/emoji2vec/master/data/raw_training_data/emoji_joined.txt

## Initialize global variables

In [2]:
GLOVE_FILE = 'glove.6B/glove.6B.300d.txt'
EMOJI_DESCRIPTIONS_FILE = 'emoji_joined.txt'
EMOJI_EMB_VIZ_FILE = 'emoji_emb_viz.csv'
MODEL_WEIGHTS_FILE = 'weights.h5'

MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 5000
MAX_NB_EMOJIS = 2000
EMBEDDING_DIM = 300

RNG_SEED_1 = 1446557
RNG_SEED_2 = 1337603
VALIDATION_SPLIT = 0.1

## Load emjois

"To this end, we crawl emoji, their name and their keyword phrases from the Unicode emoji list, resulting in 6088 descriptions of 1661 emoji symbols." [1]

In [3]:
emoji_descriptions = pd.read_csv(EMOJI_DESCRIPTIONS_FILE, 
                                 sep='\t', 
                                 engine='python', 
                                 encoding='utf_8',
                                 names=['description', 'emoji'])

print('Emoji descriptions: %d' % len(emoji_descriptions))

Emoji descriptions: 6088


In [4]:
emoji_descriptions.head(5)

Unnamed: 0,description,emoji
0,ballot box with check,☑️
1,full moon with face,🌝
2,cheese,🌝
3,moon,🌝
4,smiling moon,🌝


In [5]:
neg_emoji_descriptions = pd.DataFrame({'emoji': emoji_descriptions['emoji'].values, 
                                       'description': emoji_descriptions.sample(frac=1, 
                                                                                random_state=RNG_SEED_1)['description'].values})

In [6]:
neg_emoji_descriptions.head(5)

Unnamed: 0,description,emoji
0,flag for jordan,☑️
1,middle school,🌝
2,menorah,🌝
3,old man,🌝
4,jazz,🌝


In [7]:
emoji_descriptions['label'] = 1
neg_emoji_descriptions['label'] = 0
emoji_data = pd.concat([emoji_descriptions, neg_emoji_descriptions]).sample(frac=1, random_state=RNG_SEED_2)

In [8]:
emoji_data.head(10)

Unnamed: 0,description,emoji,label
4078,arab,▫️,0
468,black man with turban,🈵,0
2407,sad cat,😿,1
1530,flag for brazil,🇧🇷,1
4797,accessible bathroom,💴,0
5890,white left pointing backhand index,🍳,0
5401,blond,👱,1
2520,face with open mouth and cold sweat,😰,1
747,slow,🐢,1
5770,mobile phone,📱,1


In [9]:
emoji_series = emoji_descriptions.groupby('emoji')['description'].apply(lambda x: ', '.join(x))
emojis_combined_desc = pd.DataFrame({'emoji': emoji_series.index, 'description': emoji_series.values})

print('Emojis: %d' % len(emojis_combined_desc))

Emojis: 1661


In [10]:
emojis_combined_desc[emojis_combined_desc['description'].str.contains('new york')]

Unnamed: 0,description,emoji
606,"slice of pizza, pie, italy, pepperoni pizza, s...",🍕
1362,"statue of liberty, new york",🗽
1506,"taxicab, city, new york taxi, car, service, au...",🚕


## Build emoji index

In [11]:
emojis = emojis_combined_desc['emoji'].values
emoji_index = {}
emoji_reverse_index = {}
i = 0
for e in emojis:
    i += 1
    emoji_index[e] = i
    emoji_reverse_index[i] = e

print("Unique emojis: %d" % len(emoji_index))

Unique emojis: 1661


## Build word index

In [12]:
descriptions = emoji_data['description'].values
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(descriptions.tolist())
desc_word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
word_index = tokenizer.word_index
print("Unique words: %d" % len(word_index))

Unique words: 3364


## Load GloVe word embeddings

In [None]:
embeddings_index = {}
with open(GLOVE_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

## Prepare word embedding matrix

In [None]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        word_embedding_matrix[i] = embedding_vector
        
print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

## Prepare training and validation sets

In [None]:
e_data = np.array([ emoji_index[e] for e in emoji_data['emoji'].values ])
d_data = pad_sequences(desc_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = emoji_data['label'].values
nb_emojis = min(MAX_NB_EMOJIS, len(emoji_index))
print('Shape of emoji data tensor:', e_data.shape)
print('Shape of description data tensor:', d_data.shape)
print('Shape of label tensor:', labels.shape)
print('Number of emojis:', nb_emojis)

## Define sigmoid model

In [None]:
sigmoid_P = Sequential()
sigmoid_P.add(Embedding(nb_emojis + 1, EMBEDDING_DIM, input_length=1))
sigmoid_P.add(Reshape((EMBEDDING_DIM,)))
sigmoid_Q = Sequential()
sigmoid_Q.add(Embedding(nb_words + 1, 
                        EMBEDDING_DIM, 
                        weights=[word_embedding_matrix], 
                        input_length=MAX_SEQUENCE_LENGTH, 
                        trainable=False))
sigmoid_Q.add(Bidirectional(GRU(EMBEDDING_DIM, dropout_W=0.5, dropout_U=0.5), merge_mode='sum'))
sigmoid_model = Sequential()
sigmoid_model.add(Merge([sigmoid_P, sigmoid_Q], mode='concat'))
sigmoid_model.add(Dropout(0.5))
sigmoid_model.add(Dense(EMBEDDING_DIM*2, activation='relu'))
sigmoid_model.add(Dropout(0.5))
sigmoid_model.add(Dense(1, activation='sigmoid'))
sigmoid_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
sigmoid_model.summary()

## Train sigmoid model

In [None]:
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, save_best_only=True)]
sigmoid_history = sigmoid_model.fit([e_data, d_data], 
                                    labels, 
                                    nb_epoch=80, 
                                    validation_split=VALIDATION_SPLIT, 
                                    verbose=2, 
                                    callbacks=callbacks)

## Plot training and validation accuracy

In [None]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in sigmoid_history.epoch ],
                    'training': sigmoid_history.history['binary_accuracy'],
                    'validation': sigmoid_history.history['val_binary_accuracy']})
ax = acc.ix[:,:].plot(x='epoch', figsize={7,10}, grid=True)
ax.set_ylabel("binary accuracy")
ax.set_ylim([0.0,1.0]);

## Plot training and validation loss

In [None]:
loss = pd.DataFrame({'epoch': [ i + 1 for i in sigmoid_history.epoch ],
                     'training': sigmoid_history.history['loss'],
                     'validation': sigmoid_history.history['val_loss']})
ax = loss.ix[:,:].plot(x='epoch', figsize={7,10}, grid=True)
ax.set_ylabel("loss")
ax.set_ylim([0.0,2.0]);

## Define softmax model

In [None]:
softmax_P = Sequential()
softmax_P.add(Embedding(nb_emojis + 1, EMBEDDING_DIM, input_length=1))
softmax_P.add(Reshape((EMBEDDING_DIM,)))
softmax_Q = Sequential()
softmax_Q.add(Embedding(nb_words + 1, 
                        EMBEDDING_DIM, 
                        weights=[word_embedding_matrix], 
                        input_length=MAX_SEQUENCE_LENGTH, 
                        trainable=False))
softmax_Q.add(Bidirectional(GRU(EMBEDDING_DIM, dropout_W=0.5, dropout_U=0.5), merge_mode='sum'))
softmax_model = Sequential()
softmax_model.add(Merge([softmax_P, softmax_Q], mode='concat'))
softmax_model.add(Dropout(0.5))
softmax_model.add(Dense(EMBEDDING_DIM*2, activation='relu'))
softmax_model.add(Dropout(0.5))
softmax_model.add(Dense(2, activation='softmax'))
softmax_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
softmax_model.summary()

## Train softmax model

In [None]:
softmax_labels = np.array([ [0, 1] if l == 0 else [1, 0] for l in labels ])

In [None]:
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, save_best_only=True)]
softmax_history = softmax_model.fit([e_data, d_data], 
                                    softmax_labels, 
                                    nb_epoch=80, 
                                    validation_split=VALIDATION_SPLIT, 
                                    verbose=2, 
                                    callbacks=callbacks)

In [None]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in softmax_history.epoch ],
                    'training': softmax_history.history['categorical_accuracy'],
                    'validation': softmax_history.history['val_categorical_accuracy']})
ax = acc.ix[:,:].plot(x='epoch', figsize={7,10}, grid=True)
ax.set_ylabel("categorical accuracy")
ax.set_ylim([0.0,1.0]);

In [None]:
loss = pd.DataFrame({'epoch': [ i + 1 for i in softmax_history.epoch ],
                     'training': softmax_history.history['loss'],
                     'validation': softmax_history.history['val_loss']})
ax = loss.ix[:,:].plot(x='epoch', figsize={7,10}, grid=True)
ax.set_ylabel("loss")
ax.set_ylim([0.0,2.0]);

## Display t-SNE visualization of emoji embeddings from sigmoid model

In [None]:
tsne2 = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
weights = sigmoid_P.layers[0].get_weights()[0]
fit = tsne2.fit_transform(weights)
visualization = pd.DataFrame(fit[1:], columns=['x', 'y'])
visualization['emoji'] = emojis_combined_desc['emoji'].values
visualization.to_csv(EMOJI_EMB_VIZ_FILE)
visualization.plot('x', 'y', kind='scatter');