title: lstmmodel.ipynb, id: 1TnrStjnks7OTUMnMNDnr_pYRfj-58UwP

title: glove.6B.300d.txt, id: 1rWMInnJOHWksW_iTTNaFMFH1E2rIv3cR

title: repubTweets.npy, id: 13qF8Ba5s4RSa4XU-Ekm3Kk6XDIezZTX_

title: demTweets.npy, id: 1gg7t6X0dDhl916B6zYVh01lDKNKNYulR

In [2]:
!ls

datalab        glove.6B.300d.txt  model
demtweets.npy  mislabeled.txt	  repubtweets.npy


In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
file_id = '13qF8Ba5s4RSa4XU-Ekm3Kk6XDIezZTX_'
downloaded = drive.CreateFile({'id': file_id})
file = downloaded.GetContentFile('repubtweets.npy', '.npy')
file_id = '1gg7t6X0dDhl916B6zYVh01lDKNKNYulR'
downloaded = drive.CreateFile({'id': file_id})
file = downloaded.GetContentFile('demtweets.npy', '.npy')
file_id = '1rWMInnJOHWksW_iTTNaFMFH1E2rIv3cR'
downloaded = drive.CreateFile({'id': file_id})
file = downloaded.GetContentFile('glove.6B.300d.txt', '.txt')

In [0]:
import numpy as np
from tensorflow import keras as k
from tensorflow.python.keras.layers import Bidirectional, LSTM, Dense, Input, TimeDistributed, \
  Flatten, Activation, RepeatVector, multiply, Permute, Lambda, Dropout

MAX_NB_WORDS = 20000 # Max number of words for the tokenizer
EMBEDDING_DIM = 300 # Dimensions of the word vectors
MAX_SEQ_LENGTH  = 280 # Max length of the text sequences (for padding purposes)



def load_embeddings():
  """
  Loads the word embedding from file
  :return: returns a dictionary of the word vectors
  """
  f = open('glove.6B.300d.txt')
  embed_index = {}

  for line in f:
    vals = line.split()
    word = vals[0]
    vector = np.asarray(vals[1:], dtype='float32')
    embed_index[word] = vector

  f.close()
  return embed_index

def load_data():
  """
  loads the tweets stored in the files demTweets and repubTweets and creates labels
  Dems are 0 and Repubs are 1
  :return: returns data and labels for the data
  """
  demTweets = np.load('demtweets.npy')
  repubTweets = np.load('repubtweets.npy')

  tweets = [] # list the text of the tweets
  labels = [] # list of the labels (0 or 1)
  maxlen = 0
  for text in demTweets:
    tweets.append(text)
    labels.append(0)
    if len(text) > maxlen:
      maxlen = len(text)
  for text in repubTweets:
    tweets.append(text)
    labels.append(1)
    if len(text) > maxlen:
      maxlen = len(text)

  print('Tweets found: %s' % len(tweets))
  print('Longest tweet: %s' % maxlen)
  return tweets, labels

def embed_and_token(tweets, labels):
  """
  creates the embedding layer, tokenizes the corpus.
  :param tweets:corpus of the tweets
  :param labels:labels for the tweets
  :return: formatted data, labels, and the embedding layer
  """
  tokenizer = k.preprocessing.text.Tokenizer(num_words=MAX_NB_WORDS)
  tokenizer.fit_on_texts(tweets)
  data = tokenizer.texts_to_sequences(tweets)
  word_index = tokenizer.word_index

  data = k.preprocessing.sequence.pad_sequences(data)
  labels = np.asarray(labels)
  # labels = k.utils.to_categorical(np.asarray(labels))

  embedding_index = load_embeddings()

  embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
  for word, i in word_index.items():
    embedding_vec = embedding_index.get(word)
    if embedding_vec is not None:
      embedding_matrix[i] = embedding_vec
  embedding_layer = k.layers.Embedding(len(word_index) + 1,
                                       EMBEDDING_DIM,
                                       weights=[embedding_matrix],
                                       input_length=data.shape[1],
                                       trainable=False)
  return data, labels, embedding_layer

In [3]:
tweets, labels = load_data()
data, labels, embedding_layer = embed_and_token(tweets, labels)
print(data.shape)
print(labels.shape)

sequence_input = Input(shape=(data.shape[1],), dtype='int32') # (Batch size,
embedded_sequences = embedding_layer(sequence_input)

Tweets found: 299254
Longest tweet: 151
(299254, 34)
(299254,)


In [0]:
shuffled_data = []
shuffled_labels = []
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(0.2 * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]

In [6]:
UNITS = 256 # number of hidden units in the lstm
REG = 0.25
ACT_REG = 0.09
DROP = 0.1


x = Bidirectional(LSTM(UNITS, return_sequences=True, dropout=DROP, activity_regularizer=k.regularizers.l2(ACT_REG)), merge_mode='concat')(embedded_sequences) # (batch_size, timesteps, units)
attention = TimeDistributed(Dense(UNITS, activation='relu', kernel_regularizer=k.regularizers.l2(REG)))(x)
attention = TimeDistributed(Dense(1, name='timeDense'))(attention) # (batch_size, timesteps, 1)
attention = Flatten()(attention) # (batch size, timesteps)
attention = Activation('softmax')(attention) # (batch, timesteps)
attention = RepeatVector(UNITS*2)(attention) # (batch, units, timesteps)
attention = Permute([2,1])(attention) #(batch, timesteps, units)
rejoined = multiply([x, attention])

# x = LSTM(UNITS, return_sequences=True, dropout=DROP, kernel_regularizer=k.regularizers.l2(REG))(rejoined) # (batch_size, timesteps, units)
# attention = TimeDistributed(Dense(UNITS, activation='relu', activity_regularizer=k.regularizers.l2(REG)))(x)
# attention = TimeDistributed(Dense(1, activation='tanh', name='timeDense'))(attention) # (batch_size, timesteps, 1)
# attention = Flatten()(attention) # (batch size, timesteps)
# attention = Activation('softmax')(attention) # (batch, timesteps)
# attention = RepeatVector(UNITS)(attention) # (batch, units, timesteps)
# attention = Permute([2,1])(attention) #(batch, timesteps, units)
# rejoined = multiply([x, attention])

timeflip = TimeDistributed(Dense(UNITS, activation='relu', kernel_regularizer=k.regularizers.l2(REG)))(rejoined)
# timeflip = Permute([2,1])(timeflip)
timeflip = TimeDistributed(Dense(UNITS, activation='relu', kernel_regularizer=k.regularizers.l2(REG)))(timeflip)
# timeflip = Permute([2,1])(timeflip)

interm = Lambda(lambda xin: k.backend.sum(xin, axis=-2))(timeflip)
# interm = Dense(UNITS, activation='relu', kernel_regularizer=k.regularizers.l2(REG), bias_regularizer=k.regularizers.l2(REG))(interm)



# interm = LSTM(UNITS, dropout=DROP, kernel_regularizer=k.regularizers.l2(REG))(interm)
# interm = Flatten()(interm)

out = Dense(1, activation='sigmoid', name='finaldense')(interm)

opt = k.optimizers.Adam(lr=0.001)
model = k.models.Model(sequence_input, out)
model.compile(loss='binary_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
print(model.summary())

model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=20, batch_size=512)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 300)      46398300    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 34, 512)      1140736     embedding_1[0][0]                
__________________________________________________________________________________________________
time_distributed_5 (TimeDistrib (None, 34, 256)      131328      bidirectional_2[0][0]            
__________________________________________________________________________________________________
time_distr

Epoch 2/20

Epoch 3/20

Epoch 4/20

Epoch 5/20

Epoch 6/20

Epoch 7/20

Epoch 8/20

Epoch 9/20

Epoch 10/20

Epoch 11/20

Epoch 12/20

Epoch 13/20

Epoch 14/20

Epoch 15/20

Epoch 16/20

Epoch 17/20

Epoch 18/20

Epoch 19/20

Epoch 20/20



<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f4d01887278>

In [0]:
predict = model.predict(x_val)

In [0]:
pred_labels = np.round(predict)

In [0]:
val_indices = indices[-num_validation_samples:]

In [0]:
bool_labels = y_val == 1

In [0]:
bool_pred = pred_labels > 0.5

In [0]:
bool_pred = bool_pred.flatten()

In [0]:
mislabeled = np.logical_xor(bool_labels, bool_pred, dtype=bool)


In [0]:
mis = []
for i, j in zip(mislabeled, range(len(mislabeled))):
  if i:
    mis.append(val_indices[j])

In [58]:
print(mis)
len(mis)

[217964, 271545, 9019, 16170, 213326, 97220, 237833, 279378, 66260, 272873, 264010, 12458, 50157, 40586, 148627, 189046, 56834, 46281, 183142, 127951, 217449, 62949, 60989, 279177, 145248, 235511, 195073, 21897, 297668, 226426, 76292, 242530, 66149, 117507, 78563, 194766, 295150, 140020, 91312, 186882, 31527, 62785, 219605, 260117, 241623, 283192, 109072, 295586, 95323, 186655, 20682, 254470, 210680, 19925, 84712, 200728, 117380, 33523, 67962, 144854, 280371, 132137, 244280, 73208, 283761, 121346, 84158, 243190, 178790, 218096, 88645, 178079, 267011, 10580, 84365, 53273, 108348, 204122, 64694, 69703, 53803, 175811, 209657, 293407, 160686, 212600, 137035, 43108, 161917, 32293, 48932, 247153, 54068, 263708, 218612, 175176, 245360, 164763, 24348, 281798, 216337, 36117, 147169, 86804, 281023, 211643, 84053, 2007, 119202, 104702, 230238, 87543, 52680, 275994, 161069, 186843, 110282, 162025, 219769, 86973, 3447, 33485, 84601, 256769, 78089, 97355, 26750, 52970, 191623, 128911, 188292, 111666

12503

In [0]:
missed_indices = np.asarray(mis)

In [0]:
missed_tweets = []
for i in missed_indices:
  missed_tweets.append(tweets[i])

In [17]:
print(missed_tweets[4565])

Prime Minister Abe's address was a testament to the strong relationship between our two countries #AbeInTheUSA


In [0]:
from google.colab import files
with open('mislabeled2.txt', 'w') as f:
  for i in missed_tweets:
    f.write(i + '\n')

In [0]:
files.download('mislabeled2.txt')