In [0]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# `drive` is the handle the later cells use (drive.CreateFile) to download
# the CSV / text files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials
# (presumably made available by authenticate_user above — TODO confirm).
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
import pandas as pd

# Download the 5-emoji training CSV from Google Drive.
link = 'https://drive.google.com/open?id=15mQ4cQiwnB4dDdSSvEtTbeZ2P-ZA2UYs'
# Everything after the first '=' is the Drive file ID. It is kept in `file_id`
# so this cell does not shadow the builtin `id()`.
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train_5_now.csv')
df = pd.read_csv('train_5_now.csv')

# Show the column names of the loaded frame.
keys = list(df.columns)
print(keys)

['text', 'label', 'hashcount', 'hashcontent', 'atcount', 'atcontent']


In [4]:
import keras
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [0]:
"""
extract and create vocab from content
"""
def extract_vocab(contents):
  """
  Build a word vocabulary from whitespace-tokenised texts.

  Entries that are floats are skipped (an empty CSV cell is read as NaN).

  Args:
    contents: iterable of strings, e.g. one DataFrame column. It is consumed
      only once, so a one-shot iterator works as well.

  Returns:
    count:  dict mapping every word to its number of occurrences.
    vocab:  dict mapping each word that occurs at least twice to an integer
            ID, assigned in order of first appearance, plus the key "unk"
            for every other word.
    length: number of tokens in the longest entry (0 when there is none).
  """
  # Tokenise once and reuse the result for both passes.
  tokenized = [content.split() for content in contents
               if not isinstance(content, float)]

  # First pass: count frequency.
  count = {}
  for words in tokenized:
    for word in words:
      count[word] = count.get(word, 0) + 1

  # Second pass: assign an ID to every word seen more than once.
  vocab = {}
  length = 0
  for words in tokenized:
    # Walk the words in sentence order rather than through set(words):
    # set order depends on string hashing, which changes between interpreter
    # sessions and made the word -> ID mapping irreproducible.
    for word in words:
      if word not in vocab and count[word] > 1:
        vocab[word] = len(vocab)

    # update maximum length of the list
    length = max(length, len(words))

  # Words that appear less than twice share the single label 'unk'.
  # NOTE(review): ID 0 belongs to a real word and is also what build_features
  # writes for empty cells; pad_sequences presumably pads with 0 too —
  # confirm whether an ID should be reserved for padding.
  vocab["unk"] = len(vocab)

  return count, vocab, length

In [0]:
"""
extract labels
return Y
"""
def extract_labels(df):
  """
  One-hot encode the "label" column.

  A missing label (NaN) is encoded as class 0 and does not count as a class.

  Args:
    df: DataFrame (or any mapping) whose "label" entry holds the class IDs.

  Returns:
    y:           float32 matrix from to_categorical with num_classes + 1
                 columns (assumes the labels lie in 0..num_classes, e.g.
                 1..N — TODO confirm against the CSV).
    num_classes: number of distinct non-missing labels.
  """
  y = []
  seen = set()
  for label in df["label"]:
    # NaN is the only float that is not equal to itself.
    if isinstance(label, float) and label != label:
      # empty line: use class 0 instead of the missing value
      y.append(0)
    else:
      y.append(label)
      seen.add(label)
  num_classes = len(seen)
  # convert labels into binary style representation
  y = to_categorical(y, num_classes + 1, dtype='float32')
  return y, num_classes

In [0]:
"""
build X
0: only text content 
1: include number of @ and #
2: include contents of #
3: include contents of @
4: include all contents
"""
def build_features(df, mode, vocabs, max_len=30):
  """
  Encode a data frame as the integer matrix X that is fed to the model.

  mode selects which columns contribute:
    0: only text content
    1: text content plus the number of # and @
    2: text content plus contents of #
    3: text content plus contents of @
    4: all contents

  Args:
    df:      DataFrame with the columns "text", "hashcontent", "atcontent"
             and, for mode 1, "hashcount" and "atcount".
    mode:    integer in 0..4, see above.
    vocabs:  [text_vocab, hash_vocab, at_vocab]; each maps word -> ID and
             has an "unk" entry. They are used as given, so test data is
             encoded with the vocabularies built from the training data.
    max_len: number of columns each text segment is padded / cut to.

  Returns:
    (X, length, vocab_len): the feature matrix, its number of columns and
    the summed size of the vocabularies that were used.

  Raises:
    ValueError: if mode is outside 0..4. (Returning [] here made every
    caller fail on the three-way unpacking with a less helpful ValueError.)
  """
  if mode < 0 or mode > 4:
    raise ValueError("mode must be between 0 and 4, got %r" % (mode,))

  columns = ["text", "hashcontent", "atcontent"]

  if mode == 2:
    index = [0, 1]
  elif mode == 3:
    index = [0, 2]
  elif mode == 4:
    index = [0, 1, 2]
  else:
    index = [0]

  blocks = []
  length = 0
  vocab_len = 0
  for i in index:
    vocab = vocabs[i]
    # All segments end up in one matrix and vocab_len is the SUM of the
    # vocabulary sizes, so each vocabulary gets its own ID range: shift the
    # IDs by the size of the vocabularies already used. Without the shift,
    # ID k meant a different word in every segment.
    offset = vocab_len
    rows = []

    # Iterate over the values themselves; positional lookups such as
    # content[j] are label-based on a Series and fail on a non-default index.
    for content in df[columns[i]]:
      if isinstance(content, float):
        # empty cell (NaN)
        rows.append([0])
      else:
        # reconstruct the sentence by word's ID
        rows.append([offset + (vocab[word] if word in vocab else vocab["unk"])
                     for word in content.split()])

    # make all sentence representations the same length
    blocks.append(np.array(pad_sequences(rows, max_len, padding='post')))
    vocab_len += len(vocab)
    length += max_len

  # add number of hashtags and mentions
  if mode == 1:
    blocks.append(np.column_stack((df["hashcount"], df["atcount"])))
    length += 2

  # combine the segments column-wise
  X = np.concatenate(blocks, axis=1)
  return X, length, vocab_len

In [8]:
print("---------------------------")
print("Extracting Features:")

# One-hot training labels; `num_classes` is also read by build_lstm.
y, num_classes = extract_labels(df)
print(y.shape, num_classes)

# Vocabularies are kept global: the test / validation cells encode with them.
(count1, vocab1, len1), (count2, vocab2, len2), (count3, vocab3, len3) = [
  extract_vocab(df[column]) for column in ("text", "hashcontent", "atcontent")
]
vocabs = [vocab1, vocab2, vocab3]

# Encode the training frame once per feature mode (0-4) and keep, for each
# mode, the matrix together with its width and vocabulary size.
features = []
for i in range(5):
  X, length, vocab_len = build_features(df, i, vocabs)
  print(X.shape, length, vocab_len)
  features.append((X, length, vocab_len))

---------------------------
Extracting Features:
(295700, 6) 5
(295700, 30) 30 49215
(295700, 32) 32 49215
(295700, 60) 60 77055
(295700, 60) 60 65080
(295700, 90) 90 92920


In [0]:
"""
directly from https://www.kaggle.com/eray1yildiz/using-lstms-with-attention-for-emotion-recognition
modified:
  Apply Bidirectional CuDNNLSTM over embedded inputs
"""

def build_lstm(length, vocab_len, n_classes=None, embedding_dim=100):
  """
  Build and compile a bidirectional LSTM classifier with an attention layer.

  Adapted from
  https://www.kaggle.com/eray1yildiz/using-lstms-with-attention-for-emotion-recognition
  (modified to apply a Bidirectional CuDNNLSTM over the embedded inputs).

  Args:
    length:        number of columns of the input matrix.
    vocab_len:     vocabulary size; the embedding table has vocab_len + 1 rows.
    n_classes:     number of label classes; the output layer has
                   n_classes + 1 units, matching the width produced by
                   extract_labels. When omitted, the notebook-level
                   `num_classes` is used, as before.
    embedding_dim: size of the word embeddings, of each LSTM direction and
                   of the hidden dense layer.

  Returns:
    The compiled keras.Model; its summary is printed as a side effect.
  """
  # Existing two-argument calls keep reading the global class count.
  if n_classes is None:
    n_classes = num_classes

  # Define input tensor
  sequence_input = keras.Input(shape=(length,), dtype='int32')

  # Word embedding layer
  embedded_inputs = keras.layers.Embedding(vocab_len + 1,
                                           embedding_dim,
                                           input_length=length)(sequence_input)

  # Apply dropout to prevent overfitting
  embedded_inputs = keras.layers.Dropout(0.2)(embedded_inputs)

  # Apply Bidirectional CuDNNLSTM over embedded inputs
  # (keras.layers.Bidirectional is the public path of the wrapper).
  lstm_outs = keras.layers.Bidirectional(
    keras.layers.CuDNNLSTM(embedding_dim, return_sequences=True)
  )(embedded_inputs)

  # Apply dropout to LSTM outputs to prevent overfitting
  lstm_outs = keras.layers.Dropout(0.2)(lstm_outs)

  # Attention mechanism: one score per time step, normalised with softmax,
  # then used to take a weighted sum of the LSTM outputs over the time axis.
  attention_vector = keras.layers.TimeDistributed(keras.layers.Dense(1))(lstm_outs)
  attention_vector = keras.layers.Reshape((length,))(attention_vector)
  attention_vector = keras.layers.Activation('softmax', name='attention_vec')(attention_vector)
  attention_output = keras.layers.Dot(axes=1)([lstm_outs, attention_vector])

  # Last layers: fully connected, then softmax over the classes
  fc = keras.layers.Dense(embedding_dim, activation='relu')(attention_output)
  output = keras.layers.Dense(n_classes + 1, activation='softmax')(fc)

  # Finally building model
  model = keras.Model(inputs=[sequence_input], outputs=output)
  model.compile(loss="categorical_crossentropy", metrics=["accuracy"], optimizer='adam')

  # Print model summary
  model.summary()
  return model

In [10]:
print("---------------------------")
print("Building models:")

# Human-readable description of each feature mode, keyed by mode number.
names = dict(enumerate([
  "only text content",
  "include number of @ and #",
  "include contents of #",
  "include contens of @",
  "include all contents",
]))

# One model per feature mode, sized from that mode's (length, vocab_len).
models = []
for i in range(5):
  print(names[i])
  seq_len, vocab_size = features[i][1], features[i][2]
  models.append(build_lstm(seq_len, vocab_size))

---------------------------
Building models:
only text content




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 100)      4921600     input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 30, 100)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirec

In [11]:
print("---------------------------")
print("Doing model training:")

# Fit every model on the feature matrix of its own mode; 10% of the rows
# are held out by Keras for validation.
for i in range(5):
  print(names[i])
  train_X = features[i][0]
  models[i].fit(train_X, y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)

---------------------------
Doing model training:
only text content
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 266130 samples, validate on 29570 samples
Epoch 1/2





Epoch 2/2
include number of @ and #
Train on 266130 samples, validate on 29570 samples
Epoch 1/2
Epoch 2/2
include contents of #
Train on 266130 samples, validate on 29570 samples
Epoch 1/2
Epoch 2/2
include contens of @
Train on 266130 samples, validate on 29570 samples
Epoch 1/2
Epoch 2/2
include all contents
Train on 266130 samples, validate on 29570 samples
Epoch 1/2
Epoch 2/2


In [12]:
import numpy as np

print("---------------------------")
print("Doing test:")

# Download the 5-emoji test CSV from Google Drive.
link = 'https://drive.google.com/open?id=1HSzZBgvWtD-7sNLtdJY5877SQGJakwm0'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test_5_now.csv')
test = pd.read_csv('test_5_now.csv')

# Evaluate every model on the test set, encoded with the training vocabularies.
# The class count of the test file gets its own name: the global `num_classes`
# comes from the training data and is read by build_lstm, so an evaluation
# cell must not overwrite it.
test_y, test_num_classes = extract_labels(test)
for i in range(5):
  print(names[i])
  test_X, length, vocab_len = build_features(test, i, vocabs)
  res = models[i].evaluate(test_X, test_y, verbose=1)
  print("Loss:", res[0])
  print("Accuracy", res[1])

---------------------------
Doing test:
only text content
Loss: 1.0312710192999572
Accuracy 0.6044047371763562
include number of @ and #
Loss: 1.0342411415234638
Accuracy 0.5992104717012307
include contents of #
Loss: 1.032220564053506
Accuracy 0.5975483067437415
include contens of @
Loss: 1.0244034139500222
Accuracy 0.6081446084019153
include all contents
Loss: 1.0292769146267748
Accuracy 0.6083523789798052


In [13]:
print("---------------------------")
print("Doing validation:")

# Download the 5-emoji validation CSV from Google Drive.
link = 'https://drive.google.com/open?id=1N-LnZZa1rFD-yAI3lNzdJshHKK40OgPp'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('validation_5_now.csv')
valid = pd.read_csv('validation_5_now.csv')

# Evaluate every model on the validation set, encoded with the training
# vocabularies. The class count of this file gets its own name so the global
# `num_classes` (set from the training data, read by build_lstm) survives.
valid_y, valid_num_classes = extract_labels(valid)
for i in range(5):
  print(names[i])
  valid_X, length, vocab_len = build_features(valid, i, vocabs)
  res = models[i].evaluate(valid_X, valid_y, verbose=1)
  print("Loss:", res[0])
  print("Accuracy", res[1])

---------------------------
Doing validation:
only text content
Loss: 1.0085597610811885
Accuracy 0.6106822450897675
include number of @ and #
Loss: 1.0155119152773453
Accuracy 0.6056749425749178
include contents of #
Loss: 1.0264122686952184
Accuracy 0.6019194659499318
include contens of @
Loss: 1.0140224861471931
Accuracy 0.6094304194237478
include all contents
Loss: 1.0132237828377266
Accuracy 0.6106822448037448


In [0]:
"""
predict emoji labels
"""

def predict_lstm(sentences, vocab, model, max_len=30):
  """
  Predict one emoji label per sentence.

  Args:
    sentences: iterable of whitespace-tokenised strings; float entries
               (NaN, i.e. empty) are encoded as a single 0.
    vocab:     dict mapping word -> ID with an "unk" entry for unknown words.
    model:     object with a predict(X) method returning one row of class
               scores per input row.
    max_len:   number of columns the encoded sentences are padded / cut to;
               must match the input length the model was built with.

  Returns:
    Array with the index of the highest-scoring class for each sentence.
  """
  X = []
  # Iterate over the values themselves so any iterable works, including a
  # pandas Series whose index does not start at 0.
  for sentence in sentences:
    if isinstance(sentence, float):
      # empty
      X.append([0])
    else:
      # reconstruct the sentence by word's ID
      X.append([vocab[word] if word in vocab else vocab["unk"]
                for word in sentence.split()])

  # make all sentence representations the same length
  X = np.array(pad_sequences(X, max_len, padding='post'))

  label_probs = model.predict(X)
  return np.argmax(label_probs, axis=1)

In [15]:
"""
human label test
"""

print("---------------------------")
print("Doing human label test:")

# Download the human-labelled sentences from Google Drive.
link = 'https://drive.google.com/open?id=1xNcZPUOaqMYlZQt1T_5axCfxW87YC3rG'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('human_test.txt')

# One sentence per line; the context manager closes the file after reading
# (the handle used to stay open for the rest of the session).
with open('human_test.txt') as f:
  sentences = list(f)

# Only the text-only model (mode 0) and the text vocabulary are used here.
labels = predict_lstm(sentences, vocabs[0], models[0])
for label in labels:
  print(label)

---------------------------
Doing human label test:
1
1
1
2
1
3
1
1
1
2
2
4
3
4
3
3
4
1
1
1
2
1
3
2
4
2
3
3
3
1
3
1
3
2
5
3
3
2
3
2
1
2
4
1
1
1
1
1
3
3
2
3
1
2
3
3
3
1
2
1
1
5
3
1
1
3
3
2
4
3
1
1
3
1
3
3
3
1
1
1
3
2
3
3
1
2
4
2
1
3
3
2
3
5
3
3
3
2
4
2
3
3
3
3
1
3
1
1
3
1
1
3
3
4
4
5
1
2
1
3
1
3
4
3
2
2
3
1
1
3
1
5
3
1
3
3
1
2
1
3
3
3
3
3
3
2
1
1
3
1
1
2
3
3
5
2
3
3
1
1
4
4
3
3
1
3
3
1
3
3
1
1
2
3
4
3
2
3
2
1
1
1
3
3
3
2
3
3
1
1
1
1
3
1
4
1
1
2
3
1


In [16]:
"""
train and predict 20 emoji dataset
"""
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

print("---------------------------")
print("For 20 emoji dataset:")
# Download the 20-emoji training CSV from Google Drive.
link = 'https://drive.google.com/open?id=1zjypXqXd9at3x-4ZxLTk4oE1U5D0OqHt'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('train_20_now.csv')
df = pd.read_csv('train_20_now.csv')

# Show the column names, then one-hot encode the labels.
# From here on df, y, num_classes, vocabs, features and models refer to the
# 20-emoji data and replace the 5-emoji objects of the earlier cells.
keys = list(df.columns)
print(keys)
y, num_classes = extract_labels(df)
print(y.shape, num_classes)

# vocabulary global
count1, vocab1, len1 = extract_vocab(df["text"])
count2, vocab2, len2 = extract_vocab(df["hashcontent"])
count3, vocab3, len3 = extract_vocab(df["atcontent"])
vocabs = [vocab1, vocab2, vocab3]

# extract features: one encoded matrix per feature mode (0-4)
print("---------------------------")
print("Extracting features:")
features = []
for i in range(5):
  X, length, vocab_len = build_features(df, i, vocabs)
  print(X.shape, length, vocab_len)
  features.append((X, length, vocab_len))

# description of each feature mode, keyed by mode number
names = {0: "only text content" ,
1: "include number of @ and #",
2: "include contents of #",
3: "include contens of @",
4: "include all contents"}

# build models: one per feature mode, sized from (length, vocab_len)
print("---------------------------")
print("Building models:")
models = []
for i in range(5):
  print(names[i])
  models.append(build_lstm(features[i][1], features[i][2]))

# train models
print("---------------------------")
print("Training models:")
for i in range(5):
  print(names[i])
  models[i].fit(features[i][0], y, epochs=2, batch_size=64, validation_split=0.1, shuffle=True)

---------------------------
For 20 emoji dataset:
['text', 'label', 'hashcount', 'hashcontent', 'atcount', 'atcontent']
(580271, 21) 20
---------------------------
Extracting features:
(580271, 30) 30 78851
(580271, 32) 32 78851
(580271, 60) 60 126174
(580271, 60) 60 103136
(580271, 90) 90 150459
---------------------------
Building models:
only text content
Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 30, 100)      7885200     input_6[0][0]                    
__________________________________________________________________________________________________
dropout_11 (Dropout)        

In [17]:
# read test
print("---------------------------")
print("Doing test:")
# Download the 20-emoji test CSV from Google Drive.
link = 'https://drive.google.com/open?id=1XQ5IyiEJFdJWmMKvlpJsNlmE_92-i4Ce'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('test_20_now.csv')
test = pd.read_csv('test_20_now.csv')

# Evaluate every model on the test set, encoded with the training vocabularies.
# The class count of the test file gets its own name: the global `num_classes`
# comes from the training data and is read by build_lstm, so an evaluation
# cell must not overwrite it.
test_y, test_num_classes = extract_labels(test)
for i in range(5):
  print(names[i])
  test_X, length, vocab_len = build_features(test, i, vocabs)
  res = models[i].evaluate(test_X, test_y, verbose=1)
  print("Loss:", res[0])
  print("Accuracy", res[1])

---------------------------
Doing test:
only text content
Loss: 2.0484702620953663
Accuracy 0.38009153323533607
include number of @ and #
Loss: 2.035998703413206
Accuracy 0.3842105263703475
include contents of #
Loss: 2.04173323742585
Accuracy 0.37826086957203714
include contens of @
Loss: 2.053848368297725
Accuracy 0.3828375286109387
include all contents
Loss: 2.056784960989003
Accuracy 0.37871853551002604


In [18]:
# read validation data
print("---------------------------")
print("Doing validation:")
# Download the 20-emoji validation CSV from Google Drive.
link = 'https://drive.google.com/open?id=1NPkUvXqKPuSROYJwG1daSBFnIaUt_QZB'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('validation_20_now.csv')
valid = pd.read_csv('validation_20_now.csv')

# Evaluate every model on the validation set, encoded with the training
# vocabularies. The class count of this file gets its own name so the global
# `num_classes` (set from the training data, read by build_lstm) survives.
valid_y, valid_num_classes = extract_labels(valid)
for i in range(5):
  print(names[i])
  valid_X, length, vocab_len = build_features(valid, i, vocabs)
  res = models[i].evaluate(valid_X, valid_y, verbose=1)
  print("Loss:", res[0])
  print("Accuracy", res[1])

---------------------------
Doing validation:
only text content
Loss: 2.0984813579590833
Accuracy 0.37003899978084503
include number of @ and #
Loss: 2.0956390924644075
Accuracy 0.3695801789503793
include contents of #
Loss: 2.107369819401765
Accuracy 0.36568020189142086
include contens of @
Loss: 2.1049057684187553
Accuracy 0.367056664382818
include all contents
Loss: 2.1027862863765994
Accuracy 0.37279192476363926


In [19]:
# human label test
print("---------------------------")
print("Doing human label test:")
# Download the human-labelled sentences from Google Drive.
link = 'https://drive.google.com/open?id=1xNcZPUOaqMYlZQt1T_5axCfxW87YC3rG'
# Everything after the first '=' is the Drive file ID (kept out of the
# builtin name `id`).
file_id = link.split('=', 1)[1]
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('human_test.txt')

# One sentence per line; the context manager closes the file after reading
# (the handle used to stay open for the rest of the session).
with open('human_test.txt') as f:
  sentences = list(f)

# Only the text-only model (mode 0) and the text vocabulary are used here.
labels = predict_lstm(sentences, vocabs[0], models[0])
for label in labels:
  print(label)

---------------------------
Doing human label test:
1
1
1
2
1
3
1
1
1
1
1
1
3
4
3
1
4
1
1
1
2
3
3
2
1
3
3
3
3
1
3
1
3
1
5
3
1
3
3
6
1
2
16
1
1
1
1
1
3
3
8
5
5
2
3
3
3
1
3
1
1
5
14
1
1
1
3
3
4
3
1
1
3
5
3
1
1
1
1
5
3
3
3
3
1
3
4
3
1
1
3
2
3
5
3
3
3
1
19
3
3
3
3
3
1
3
3
1
3
1
1
3
3
5
3
5
1
3
1
3
1
3
4
3
3
2
1
1
1
3
1
5
3
1
14
3
1
1
1
3
3
3
3
3
3
1
1
1
3
1
5
3
1
2
1
1
3
3
1
1
4
4
3
14
1
3
14
1
3
3
1
1
2
3
4
3
3
3
3
1
1
1
6
3
3
1
3
3
1
1
1
8
5
1
4
1
1
3
2
1
