# Working with Twitter data

In this lecture, you will learn how to preprocess real Twitter data and map its words to word embeddings taken from a pre-trained source (GloVe).

## Import the necessary modules

In [1]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, SimpleRNN, GRU
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

Using TensorFlow backend.


## Do some initial cleaning using the twitter preprocessing Python package


In [2]:
#step 1: bring in the cleaned up twitter data: used preprocessing python package to clean up the initial twitter data
# NOTE(review): hard-coded absolute local path -- this cell only runs on a machine with this exact directory layout.
%cd C:\Users\Max\Documents\TwitterDataModeling\TwitterData
import pandas as pd
# Load the labeled tweets. The 'utf-8-sig' codec also accepts a leading byte-order mark at the start of the file.
twitter_labeled=pd.read_csv("US_AMEX_Tweets_recent2_MV_labels2_05012020_05042020_v2.csv", encoding="utf-8-sig")
#need to save as utf-8 .csv file from excel or other source
# Remove rows that are exact duplicates across all columns.
twitter_labeled2=twitter_labeled.drop_duplicates()
# Count how often each tweet text still occurs; the result is indexed by the text and its column is named 'count'.
dups=pd.DataFrame(twitter_labeled2['text'].value_counts()).rename(columns={'text':'count'})
# Attach that per-text count to every row by joining the 'text' column against the index of `dups`.
twitter_labeled3=pd.merge(twitter_labeled2, dups, left_on='text', right_index=True, how="inner")
# Drop ten hand-picked rows by index label.
# NOTE(review): presumably these are repeated texts that survived drop_duplicates() -- verify against 'count'.
# NOTE(review): per the pandas drop() documentation, `index=` selects rows and `axis` is then not used -- confirm.
twitter_labeled4=twitter_labeled3.drop(axis=1, index=[226, 760, 235, 757, 247, 759, 535, 330, 680, 780])
# The helper 'count' column is removed again, in place, leaving the columns of the de-duplicated frame.
twitter_labeled4.drop('count', axis=1, inplace=True)
len(twitter_labeled4) #we have 442 unique tweets

C:\Users\Max\Documents\TwitterDataModeling\TwitterData


442

In [3]:
twitter_labeled4.columns

Index(['text', 'Label'], dtype='object')

## Since the goal here is to capture negative sentiment, let us convert into 2 classes only: 'negative' and 'non-negative'

In [4]:
# step 2: collapse the labels into two classes, because the goal is to capture the negative sentiment:
# 0 = negative (original label -1), 1 = non-negative (every other label). Then build the training and test sets.
twitter_labeled4.loc[:, 'Label_comb'] = twitter_labeled4['Label'].map(lambda label: 0 if label == -1 else 1)
twitter_labeled4['Label_comb'].value_counts()
neg_tweets = twitter_labeled4[twitter_labeled4['Label_comb'] == 0]
non_neg_tweets = twitter_labeled4[twitter_labeled4['Label_comb'] == 1]
# The first 150 tweets of each class form the training set and the remainder of each class the test set.
# Inside every split the non-negative tweets come first, followed by the negative ones.
X_train = np.array(list(non_neg_tweets['text'][:150]) + list(neg_tweets['text'][:150]))
Y_train = np.array(list(non_neg_tweets['Label_comb'][:150]) + list(neg_tweets['Label_comb'][:150]))
X_test = np.array(list(non_neg_tweets['text'][150:]) + list(neg_tweets['text'][150:]))
Y_test = np.array(list(non_neg_tweets['Label_comb'][150:]) + list(neg_tweets['Label_comb'][150:]))
# Training rows followed by test rows, kept as single arrays as well.
X_train_test = np.array(list(X_train) + list(X_test))
y_train_test = np.array(list(Y_train) + list(Y_test))
len(X_train), len(X_test), len(Y_train), len(Y_test), len(X_train_test), len(y_train_test)

(300, 142, 300, 142, 442, 442)

## Import pre-trained glove word embeddings

In [5]:
#step 3: bring in word embedding pre-trained vectors: 
import numpy as np
glove_twitter_file='glove.twitter.27B.50d.txt'
def read_glove_vecs(glove_file):
    """
    Load GloVe-style word vectors from a text file.

    Every non-blank line is expected to hold a token followed by its vector
    components, separated by spaces (tabs are accepted too).

    Arguments:
    glove_file -- path of the text file to read (decoded as UTF-8, a leading BOM is tolerated)

    Returns:
    words_to_index -- dict mapping each token to its 1-based rank in sorted token order
    index_to_words -- dict mapping that rank back to the token
    word_to_vec_map -- dict mapping each token to a float64 numpy array of its components
    """
    import re  # function-scope import so this cell does not depend on another cell having imported it

    # Only ASCII blanks are treated as field separators. A plain str.split() also splits on
    # Unicode whitespace, so a token that is itself such a character would be discarded and
    # its first number would be mistaken for the token, leaving a vector that is one entry short.
    field_separator = re.compile(r'[ \t]+')
    ascii_blanks = ' \t\r\n'

    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            fields = field_separator.split(line.strip(ascii_blanks))
            curr_word = fields[0]
            if not curr_word:
                # Blank line: nothing to record (previously this raised IndexError).
                continue
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    # Index 0 is left unused so that it can serve as the padding value downstream.
    words_to_index = {}
    index_to_words = {}
    for i, w in enumerate(sorted(word_to_vec_map), start=1):
        words_to_index[w] = i
        index_to_words[i] = w
    return words_to_index, index_to_words, word_to_vec_map
# Read the lookup tables and the vectors, then keep the vocabulary as a plain list of words in index order
# (about 1.2MM words for this file).
words_to_index, index_to_words, word_to_vec_map = read_glove_vecs(glove_twitter_file)
wordsintwitterglove = list(index_to_words.values())


In [6]:
len(wordsintwitterglove)

1193514

In [7]:
print("The word 'cards' in the embedding space is: ")
print(word_to_vec_map['cards'])

The word 'cards' in the embedding space is: 
[ 0.49513   0.0292   -0.41095   0.2833    0.94241  -0.37731   0.84692
 -0.51128   0.45714  -0.58924   0.94307   0.90303  -3.4979    0.29424
 -0.26355   0.42858  -0.38724  -0.47719   0.044124  0.49529  -1.1181
 -0.48781   0.60082  -0.64361  -0.71212   0.36798  -0.059819 -0.58809
  0.67646  -0.75717   0.4728   -0.23525  -0.29401  -0.13993   0.69861
 -0.29542  -0.013882  0.065944 -0.38697   0.23558   0.50186   0.09126
  0.4026   -0.39129   0.73219  -0.52371  -0.048465 -1.2898   -0.022145
  0.42831 ]


## Typical steps to clean the data for sentiment analysis: 
* Remove hash tags and other distracting symbols
* Remove numbers
* Convert to lower case
* Remove unnecessary punctuation
* Remove stop words: unnecessary/generic words
* Stem sentences: only retain the main roots
* Tokenize sentences: create a list of words from the final cleaned string above

In [8]:
#step 4: function to clean this twitter data some more to create words that can be mapped to the embedding matrix: 
#function to take in a sentence and return a list of words, which are more ready to be vectorized by the embedding matrix: 
def clean_sentence(sentence): 
    """
    Turn a raw tweet into a list of lower-case word tokens with numbers spelled out.

    Steps:
      1. Lower-case, split on whitespace and strip punctuation, keeping '$', '.' and '-'
         for now so that dollar amounts and decimals can still be recognised.
      2. Replace number-like tokens with words:
           - '<digits>th'            -> ordinal words via num2words(..., to='ordinal')
           - '$<amount>'             -> cardinal words via num2words plus 'dollar'/'dollars'
           - anything float() parses -> words via num2word.word
         Amounts are truncated to integers first. A token that cannot be converted is
         kept unchanged.
      3. Strip the remaining '$' and '.' characters and drop empty and lone '-' tokens.

    Arguments:
    sentence -- the text to clean (string)

    Returns:
    words -- list of cleaned tokens (strings)
    """
    import re
    import num2word
    from num2words import num2words

    punc_to_remove='’!"#%&\'()*+,/:;<=>?@[\\]^_`{|}~' #exclude dollar sign and dot as we want to first convert dollar amounts
    #and decimals to word numerics, and then we can remove these punctuation signs

    # Pass 1: remove punctuation anywhere in a token (it is often attached to the end of a word).
    first_pass = str.maketrans('', '', punc_to_remove)
    tokens = [w.translate(first_pass) for w in re.split(r'\s+', sentence.lower())]
    sentence = " ".join(t for t in tokens if t != '')

    # Collect the exact substrings that look like '5th' or '$4.50'. Only tokens that are
    # identical to one of these matches take the ordinal / dollar branches below.
    ordinal_tokens = set(re.findall(r'\d+th', sentence))
    dollar_tokens = set(re.findall(r'\$\.?0?\d+\.?\d*', sentence))

    # Pass 2: spell out the numbers.
    remove_numerics = []
    for word in re.split(r'\s+', sentence.lower()):
        try:
            if word in ordinal_tokens:
                converted = [num2words(int(float(word.replace('th', ''))), to='ordinal')]
            elif word in dollar_tokens:
                number = int(float(word.replace('$', '')))
                converted = [num2words(number), 'dollar' if number == 1 else 'dollars']
            else:
                # Convert a decimal or integer string into an integer; ordinary words fail here.
                converted = [num2word.word(int(float(word)))]
        except Exception:
            # Best effort: whatever cannot be converted stays as it is. Catching Exception
            # rather than everything lets KeyboardInterrupt / SystemExit propagate.
            converted = [word]
        remove_numerics.extend(converted)

    # Pass 3: the dollar sign and dot are no longer needed.
    second_pass = str.maketrans('', '', '$.')
    sentence = " ".join(remove_numerics)
    words = [w.translate(second_pass) for w in re.split(r'\s+', sentence.lower())]
    return [w for w in words if w not in ('', '-')]
# Run the cleaner on one sample tweet and show the text before and after tokenization.
sentence="I used to work for American Express and spent some time in NY. It was on her bucket list to visit, so I took her there a year and a half ago. We love living around no one, but NYC is a fun place to visit! Well probably not so much at the moment, but we’ll get through this!"
words=clean_sentence(sentence)
# One print call; every item lands on its own line, with a single-space line as the spacer.
print(
    "Original sentence is: ",
    " ",
    sentence,
    " ",
    "Tokenized sentence is now: ",
    " ",
    words,
    sep="\n",
)

Original sentence is: 
 
I used to work for American Express and spent some time in NY. It was on her bucket list to visit, so I took her there a year and a half ago. We love living around no one, but NYC is a fun place to visit! Well probably not so much at the moment, but we’ll get through this!
 
Tokenized sentence is now: 
 
['i', 'used', 'to', 'work', 'for', 'american', 'express', 'and', 'spent', 'some', 'time', 'in', 'ny', 'it', 'was', 'on', 'her', 'bucket', 'list', 'to', 'visit', 'so', 'i', 'took', 'her', 'there', 'a', 'year', 'and', 'a', 'half', 'ago', 'we', 'love', 'living', 'around', 'no', 'one', 'but', 'nyc', 'is', 'a', 'fun', 'place', 'to', 'visit', 'well', 'probably', 'not', 'so', 'much', 'at', 'the', 'moment', 'but', 'well', 'get', 'through', 'this']


In [13]:
%pwd

'C:\\Users\\Max\\Documents\\TwitterDataModeling\\TwitterData'

## Now, let us look at our training and test sets: 


In [8]:
X_train[0:2]

array(['Question of the Day - Is it worth the effort to get the Hilton Ascend credit card that can be linked to Wyndham Diamond and Caesars Diamond status rewards cards?',
       'Today’s Digital Transactions News: T&E Plunge Hammers AmEx; PayFac Volume To Hit $4 Trillion by 2025; Plus Weekly Recap'],
      dtype='<U280')

In [9]:
X_test[0:2]

array(['rumor has it amex might be planning additional benefits to their platinum card... meanwhile the sapphire took a downfall with their recent updates (imo anyway....the benefits are useless to me making the price increase not worth it)',
       '[Targeted] AmEx Offer: , Spend $100+ & Receive $30 Statement Credit + $50 Off $150+'],
      dtype='<U280')

### Step 1: convert each sentence into a fixed-length array of word indices, zero-padded up to the maximum number of words across all sentences. 

In [9]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` 
    
    Each sentence is tokenized with `clean_sentence`. Tokens that are not keys of
    `word_to_index` are skipped, the remaining indices are written left to right, and
    unused positions stay 0. A sentence with more than `max_len` known tokens is
    truncated to its first `max_len` tokens.
    
    Arguments:
    X -- array of sentences (strings); only its first dimension is used
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- number of columns of the output, i.e. the maximum number of words kept per sentence
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of examples
    
    # Zero-initialised, so every position that is not filled below acts as padding.
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over examples
        # Membership is checked against the dictionary that was passed in: a constant-time
        # lookup that is consistent with the indexing below, instead of scanning a
        # module-level list of the whole vocabulary for every word.
        known_indices = [word_to_index[w] for w in clean_sentence(X[i]) if w in word_to_index]
        
        # Cleaning can yield more tokens than a plain whitespace split of the raw text
        # (a dollar amount becomes two words, for example), so cap the row at max_len
        # rather than writing past the end of the array.
        known_indices = known_indices[:max_len]
        X_indices[i, :len(known_indices)] = known_indices
    
    return X_indices

In [10]:
# Quick check of sentences_to_indices on the first two training tweets.
X1 = X_train[:2]
# maxlen: the largest whitespace-separated word count over all raw tweets, training and test.
longest_train = max(len(tweet.split()) for tweet in X_train)
longest_test = max(len(tweet.split()) for tweet in X_test)
maxlen = max(longest_train, longest_test)
X1_indices = sentences_to_indices(X1, words_to_index, max_len=maxlen)
print("X1 =", X1)
print("X1_indices =", X1_indices)

X1 = ['Question of the Day - Is it worth the effort to get the Hilton Ascend credit card that can be linked to Wyndham Diamond and Caesars Diamond status rewards cards?'
 'Today’s Digital Transactions News: T&E Plunge Hammers AmEx; PayFac Volume To Hit $4 Trillion by 2025; Plus Weekly Recap']
X1_indices = [[503357. 446383. 601627. 138215. 283380. 284816. 657072. 601627. 176556.
  607687. 227866. 601627. 258111.  37963. 127558.  96730. 601405.  94350.
   59105. 342820. 607687. 658588. 151100.  26338.  91709. 151100. 573728.
  519712.  96819.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.      0.      0.      0.      0.
       0.      0.      0.      0.      0.]
 [608104. 153374. 612987. 406728. 593762. 483492. 248627.  24377. 644128.
  607687. 259017. 213327. 163389. 615083.  89093. 621963. 603283. 620819.
  208201. 483525. 650553. 510109.      0.      0.      0. 

In [12]:
#notice above that each row is padded with zeros up to the same length (maxlen), so that every input sentence
#has the same fixed shape when being fed into a Recurrent Neural Network

### Step 2: Define an embedding layer

In [11]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe vectors.
    
    The embedding dimension is read from the vectors themselves, so the function works
    for any vector size, not only 50. Words whose vector does not have that dimension
    keep an all-zero row.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation (numpy arrays).
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    embedding_layer -- non-trainable Keras Embedding instance whose weights are the embedding matrix

    Raises:
    ValueError -- if word_to_vec_map is empty
    KeyError -- if a word of word_to_index has no entry in word_to_vec_map
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot determine the embedding dimension")
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    
    # Dimensionality of the word vectors. "cucumber" stays the preferred probe word so that
    # results are unchanged for vocabularies that contain it; any other vocabulary falls
    # back to its first vector instead of failing with a KeyError.
    reference_vector = word_to_vec_map.get("cucumber")
    if reference_vector is None:
        reference_vector = next(iter(word_to_vec_map.values()))
    emb_dim = reference_vector.shape[0]

    # Embedding matrix of shape (vocab_len, emb_dim); row 0 is never assigned and stays zero.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Row "index" receives the vector of the word with that index. Vectors of the wrong
    # length are skipped (compared against emb_dim rather than a literal 50, which would
    # leave the whole matrix at zero for any other embedding size).
    j = 0
    for word, index in word_to_index.items():
        vector = word_to_vec_map[word]
        if len(vector) == emb_dim:
            j += 1
            emb_matrix[index, :] = vector
    print("number of words with", emb_dim, "as vector size is: ", j)

    # Non-trainable embedding layer with matching input/output sizes.
    embedding_layer = Embedding(input_dim=vocab_len,output_dim=emb_dim,trainable=False)

    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. The layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer

In [12]:
#test above code
embedding_layer = pretrained_embedding_layer(word_to_vec_map, words_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3]) #weight of 2nd word 3rd position

number of words with 50 as vector size is:  1193513
weights[0][1][3] = -0.80743


In [15]:
embedding_layer.get_weights()[0].shape #num words by vector size

(1193515, 50)

### Step 3: create LSTM model definition - just 1 LSTM layer

In [15]:
def Emojify_V2_LSTM_OneLayer(input_shape, word_to_vec_map, word_to_index):
    """
    Build the graph of a one-layer LSTM classifier on top of the pre-trained embedding layer.

    The graph is: int32 index input -> embedding layer from `pretrained_embedding_layer`
    -> one LSTM with 128 units that returns only its last output -> Dense layer with
    2 units and softmax activation.

    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary

    Returns:
    model -- a model instance in Keras (not compiled)
    """
    # Every LSTM option is passed explicitly so the configuration does not depend on
    # library defaults.
    lstm_options = dict(
        units=128,
        activation="tanh",               # activation for the candidate cell state and the output
        recurrent_activation="sigmoid",  # activation for the gates
        use_bias=True,
        kernel_initializer="glorot_uniform",
        recurrent_initializer="orthogonal",
        bias_initializer="zeros",
        kernel_regularizer=None,
        recurrent_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        kernel_constraint=None,
        recurrent_constraint=None,
        bias_constraint=None,
        dropout=0.0,
        recurrent_dropout=0.0,
        return_sequences=False,          # only the output of the last time step is wanted
        return_state=False,
        go_backwards=False,
        stateful=False,
        unroll=False,
    )

    # Input of the graph: word indices, hence the integer dtype.
    sentence_indices = Input(shape=input_shape,dtype='int32')

    # Look the indices up in the embedding layer that carries the pre-trained vectors.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # One LSTM layer; no dropout layer is applied after it.
    last_hidden = LSTM(**lstm_options)(embeddings)

    # Two output units with softmax, one per class. A single sigmoid unit would also do
    # for two classes; with more classes there would be one unit per class.
    class_probabilities = Dense(units=2, activation="softmax")(last_hidden)

    # Model instance which converts sentence_indices into the class probabilities.
    return Model(inputs=sentence_indices, outputs=class_probabilities)

In [16]:
#below, for the LSTM layer, we need to learn 3 gates + the main hidden unit tilda weight matrix, so we need to learn in 
#total: 
#128 by (128 + 50) + (128 by 1) and then this 4 times, which equals to: 
#4*(128*(128+50)+128)=91,648
#the last layer needs to take in the 128 final t output activation units and input them into a 2 unit dense layer, since 
#we have 2 classes. This means, we need to learn 2 by 128 weights plus bias, so number of parameters to be learned here is: 
#2*128+2=258 parameters still

In [17]:
model = Emojify_V2_LSTM_OneLayer((maxlen,), word_to_vec_map, words_to_index)
model.summary()

number of words with 50 as vector size is:  1193513
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 59)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 59, 50)            59675750  
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               91648     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 258       
Total params: 59,767,656
Trainable params: 91,906
Non-trainable params: 59,675,750
_________________________________________________________________


In [18]:
def convert_to_one_hot(Y, C):
    """
    One-hot encode integer class labels.

    Arguments:
    Y -- numpy array of integer class labels; it is flattened before encoding
    C -- number of classes

    Returns:
    array of shape (number of labels, C) in which row k is row Y[k] of the C x C identity matrix
    """
    flat_labels = Y.reshape(-1)
    identity = np.eye(C)
    # Indexing the identity matrix with the labels picks one unit row per label.
    return identity[flat_labels]

### Step 4: fit model above on training data

In [19]:
# Fit the simple model: Adam optimizer, categorical cross-entropy loss, accuracy tracked as the metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The tweets become padded index arrays; the labels become one-hot rows (we have 2 classes).
X_train_indices = sentences_to_indices(X_train, words_to_index, maxlen)
Y_train_oh = convert_to_one_hot(Y_train, C=2)
model.fit(
    X_train_indices,
    Y_train_oh,
    batch_size=32,
    epochs=50,
    shuffle=True,
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x17b151c7d68>

### Step 5: Evaluate model performance

In [20]:
# Encode the held-out tweets the same way as the training data, then score the fitted model on them.
X_test_indices = sentences_to_indices(X_test, words_to_index, max_len=maxlen)
Y_test_oh = convert_to_one_hot(Y_test, C=2)
test_metrics = model.evaluate(X_test_indices, Y_test_oh)
loss, acc = test_metrics
# The leading newline keeps the result on its own line, below the progress output.
print("\nTest accuracy = ", acc)


Test accuracy =  0.6971830725669861


In [21]:
# Score the model on the data it was trained on, for comparison with the test accuracy.
train_metrics = model.evaluate(X_train_indices, Y_train_oh)
loss, acc = train_metrics
print("\nTrain accuracy = ", acc)


Train accuracy =  0.9666666388511658


In [None]:
#note above that the model is overfitting the training set (train accuracy of about 0.97 vs. test accuracy of about 0.70): slightly more than the GRU but much less than the RNN!