In [1]:
import os 
import shutil
import random as rnd

import tensorflow as tf
import numpy as np
import string
import re
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, twitter_samples 

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Trung\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Trung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<a name="2"></a>
## 2 - Importing the Data

<a name="2-1"></a>
### 2.1 - Loading in the Data

Import the data set.  

In [44]:
def load_tweets():
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')  
    return all_positive_tweets, all_negative_tweets

In [45]:
## DO NOT EDIT THIS CELL

# Import functions from the utils.py file

def train_val_split():
    # Load positive and negative tweets
    all_positive_tweets, all_negative_tweets = load_tweets()

    # View the total number of positive and negative tweets.
    print(f"The number of positive tweets: {len(all_positive_tweets)}")
    print(f"The number of negative tweets: {len(all_negative_tweets)}")

    # Split positive set into validation and training
    val_pos   = all_positive_tweets[4000:] # generating validation set for positive tweets
    train_pos  = all_positive_tweets[:4000]# generating training set for positive tweets

    # Split negative set into validation and training
    val_neg   = all_negative_tweets[4000:] # generating validation set for negative tweets
    train_neg  = all_negative_tweets[:4000] # generating training set for nagative tweets
    
    # Combine training data into one set
    train_x = train_pos + train_neg 

    # Combine validation data into one set
    val_x  = val_pos + val_neg

    # Set the labels for the training set (1 for positive, 0 for negative)
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))

    # Set the labels for the validation set (1 for positive, 0 for negative)
    val_y  = np.append(np.ones(len(val_pos)), np.zeros(len(val_neg)))


    return train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y

In [46]:
train_pos, train_neg, train_x, train_y, val_pos, val_neg, val_x, val_y = train_val_split()

print(f"length of train_x {len(train_x)}")
print(f"length of val_x {len(val_x)}")

The number of positive tweets: 5000
The number of negative tweets: 5000
length of train_x 8000
length of val_x 2000


### Process Tweet

In [47]:
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Stop words are messy and not that compelling; 
# "very" and "not" are considered stop words, but they are obviously expressing sentiment

# The porter stemmer lemmatizes "was" to "wa".  Seriously???

# I'm not sure we want to get into stop words
stopwords_english = stopwords.words('english')

# Also have my doubts about stemming...
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [48]:
def process_tweet(tweet):
    '''
    Input: 
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    ### START CODE HERE ###
    tweets_clean = []
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
            word not in string.punctuation): # remove punctuation
            #tweets_clean.append(word)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
    ### END CODE HERE ###
    return tweets_clean

In [49]:
# Try out function that processes tweets
print("original tweet at training position 0")
print(train_pos[0])

print("Tweet at training position 0 after processing:")
process_tweet(train_pos[0])

original tweet at training position 0
#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
Tweet at training position 0 after processing:


['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

Notice that the function `process_tweet` keeps key words, removes the hash # symbol, and ignores usernames (words that begin with '@').  It also returns a list of the words.

<a name="2-2"></a>
### 2.2 - Building the Vocabulary

Now build the vocabulary.
- Map each word in each tweet to an integer (an "index"). 
- Note that we will build the vocabulary based on the training data. 
- To do so, we will assign an index to everyword by iterating over your training set.

The vocabulary will also include some special tokens
- `__PAD__`: padding
- `</e>`: end of line
- `__UNK__`: a token representing any word that is not in the vocabulary.

In [50]:
# Build the vocabulary
# Unit Test Note - There is no test set here only train/test
def get_vocab(train_x):

    # Include special tokens 
    # started with pad, end of line and unk tokens
    Vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 

    # Note that we build vocab using training data
    for tweet in train_x: 
        processed_tweet = process_tweet(tweet)
        for word in processed_tweet:
            if word not in Vocab: 
                Vocab[word] = len(Vocab)
    
    return Vocab

Vocab = get_vocab(train_x)

print("Total words in vocab are",len(Vocab))
display(Vocab)

Total words in vocab are 9089


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

<a name="2-3"></a>
## 2.3 - Converting a Tweet to a Tensor

Write a function that will convert each tweet to a tensor (a list of unique integer IDs representing the processed tweet).
- Note, the returned data type will be a **regular Python `list()`**
    - We won't use TensorFlow in this function
    - We also won't use a numpy array
    - We also won't use trax.fastmath.numpy array
- For words in the tweet that are not in the vocabulary, set them to the unique ID for the token `__UNK__`.

##### Example
Input a tweet:
```CPP
'@happypuppy, is Maria happy?'
```

The tweet_to_tensor will first conver the tweet into a list of tokens (including only relevant words)
```CPP
['maria', 'happi']
```

Then it will convert each word into its unique integer

```CPP
[2, 56]
```
- Notice that the word "maria" is not in the vocabulary, so it is assigned the unique integer associated with the `__UNK__` token, because it is considered "unknown."



<a name="ex-1"></a>
### Tweet_to_tensor
Write a program `tweet_to_tensor` that takes in a tweet and converts it to an array of numbers. We can use the `Vocab` dictionary you just found to help create the tensor. 

- Use the vocab_dict parameter and not a global variable.
- Do not hard code the integer value for the `__UNK__` token.

In [51]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    '''
    Input: 
        tweet - A string containing a tweet
        vocab_dict - The words dictionary
        unk_token - The special string for unknown tokens
        verbose - Print info durign runtime
    Output:
        tensor_l - A python list with
        
    '''     
    # Process the tweet into a list of words
    # where only important words are kept (stop words removed)
    word_l = process_tweet(tweet)
    
    if verbose:
        print("List of words from the processed tweet:")
        print(word_l)
        
    # Initialize the list that will contain the unique integer IDs of each word
    tensor_l = [] 
    
    # Get the unique integer ID of the __UNK__ token
    unk_ID = vocab_dict[unk_token]
    
    if verbose:
        print(f"The unique integer ID for the unk_token is {unk_ID}")
        
    # for each word in the list:
    for word in word_l:
        
        # Get the unique integer ID.
        # If the word doesn't exist in the vocab dictionary,
        # use the unique ID for __UNK__ instead.        
        word_ID = vocab_dict[word] if word in vocab_dict else unk_ID
            
        # Append the unique integer ID to the tensor list.
        tensor_l.append(word_ID)
    
    return tensor_l

In [52]:
print("Actual tweet is:")
print(test_pos[0])
print("\nTensor of tweet:\n", tweet_to_tensor(test_pos[0], vocab_dict=Vocab))

Actual tweet is:
Bro:U wan cut hair anot,ur hair long Liao bo
Me:since ord liao,take it easy lor treat as save $ leave it longer :)
Bro:LOL Sibei xialan

Tensor of tweet:
 [1064, 136, 478, 2351, 744, 8149, 1122, 744, 53, 2, 2671, 790, 2, 2, 348, 600, 2, 3488, 1016, 596, 4558, 9, 1064, 157, 2, 2]


<a name="2-4"></a>
### 2.4 - Creating a Batch Generator

Most of the time in Natural Language Processing, and AI in general we use batches when training our data sets. 
- If instead of training with batches of examples, you were to train a model with one example at a time, it would take a very long time to train the model. 
- You will now build a data generator that takes in the positive/negative tweets and returns a batch of training examples. It returns the model inputs, the targets (positive or negative labels) and the weight for each target (ex: this allows us to can treat some examples as more important to get right than others, but commonly this will all be 1.0). 

Once you create the generator, you could include it in a for loop

```CPP
for batch_inputs, batch_targets, batch_example_weights in data_generator:
    ...
```

You can also get a single batch like this:

```CPP
batch_inputs, batch_targets, batch_example_weights = next(data_generator)
```
The generator returns the next batch each time it's called. 
- This generator returns the data in a format (tensors) that you could directly use in your model.
- It returns a triplet: the inputs, targets, and loss weights:
    - Inputs is a tensor that contains the batch of tweets we put into the model.
    - Targets is the corresponding batch of labels that we train to generate.
    - Loss weights here are just 1s with same shape as targets. Next week, you will use it to mask input padding.

In [53]:
def data_generator(data_pos, data_neg, vocab_dict, max_len = 20, shuffle=True):
    tensor_pad_l = []
    target_l = []
    # Positive tweet
    for tweet in data_pos:
        tensor = tweet_to_tensor(tweet, vocab_dict)
        if (len(tensor) <= 20):
            n_pad = max_len - len(tensor)
            
            # Generate a list of zeros, with length n_pad
            pad_l = [0]*n_pad
            
            # concatenate the tensor and the list of padded zeros
            tensor_pad = tensor + pad_l
            tensor_pad_l.append(tensor_pad)
            target_l.append(1)
    
    # Positive tweet
    for tweet in data_neg:
        tensor = tweet_to_tensor(tweet, vocab_dict)
        if (len(tensor) <= 20):
            n_pad = max_len - len(tensor)
            
            # Generate a list of zeros, with length n_pad
            pad_l = [0]*n_pad
            
            # concatenate the tensor and the list of padded zeros
            tensor_pad = tensor + pad_l
            tensor_pad_l.append(tensor_pad)
            target_l.append(0)
    
    tensor_pad_l_ind = list(range(len(tensor_pad_l)))
    if shuffle:
        rnd.shuffle(tensor_pad_l_ind)

    final_tensor = []
    final_target = []
    
    for ind in tensor_pad_l_ind:
        final_tensor.append(tensor_pad_l[ind])
        final_target.append(target_l[ind])
    
    return np.array(final_tensor), np.array(final_target)

Now we can use our data generator to create a data generator for the training data, and another data generator for the test 
data.
We will create a third data generator that does not loop, for testing the final accuracy of the model.

<a name="3-3"></a>
## 3 - Model

Now we will implement a classifier using neural networks. Here is the model architecture you will be implementing. 

<img src = "images/nn.jpg" style="width:400px;height:250px;"/>

We will use layers:

    - Embedding
    - Mean
    - Dense

Please use the `help` function to view documentation for each layer.

In [54]:
vocab_size = len(Vocab)
embedding_dim = 256
output_dim = 2

model = tf.keras.Sequential(
    [tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=2)]
)

model.compile(loss = tf.keras.losses.BinaryCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy())

### Summary model

In [55]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 256)         2326784   
                                                                 
 global_average_pooling1d_2   (None, 256)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 2)                 514       
                                                                 
Total params: 2,327,298
Trainable params: 2,327,298
Non-trainable params: 0
_________________________________________________________________


<a name="4"></a>
## 4 - Training

### 4.1 - Generate train data

In [56]:
train_x, train_y = data_generator(train_pos, train_neg, Vocab, 20)

### 4.2 - Training

In [57]:
epochs = 50
history = model.fit(
    x=train_x,
    y=train_y,
    validation_data=data_generator(val_pos, val_neg, Vocab, 20),
    epochs=epochs,
    batch_size=20,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


### 4.2 - Practice Making a Prediction

In [58]:
test_inputs, test_targets = data_generator(test_pos, test_neg, Vocab, 20)
test_inputs = test_inputs[:20, :]
test_targets = test_targets[:20]

In [59]:
pred = model.predict(test_inputs)
pred



array([[-0.23938158, -0.5754558 ],
       [ 1.2256563 ,  1.1984658 ],
       [-0.56154853, -0.9327754 ],
       [-1.1312557 , -1.5370682 ],
       [-0.755661  , -1.1324275 ],
       [-0.36212566, -0.7021228 ],
       [ 1.3524549 ,  1.4039096 ],
       [ 1.2843306 ,  1.3806235 ],
       [-0.412334  , -0.7537724 ],
       [-0.836136  , -1.2245624 ],
       [ 1.4412472 ,  1.4956758 ],
       [ 1.3351877 ,  1.3436947 ],
       [ 1.6693764 ,  1.7559879 ],
       [-0.7242282 , -1.1326535 ],
       [ 1.2261693 ,  1.2469711 ],
       [ 1.4489805 ,  1.5413034 ],
       [-0.46705446, -0.80878055],
       [-0.43779004, -0.79300386],
       [ 1.2926633 ,  1.2723994 ],
       [ 1.4035907 ,  1.5089536 ]], dtype=float32)

In [60]:
is_positive = pred[:,0] < pred[:,1]
for i, p in enumerate(is_positive):
    print(f"Neg log prob {pred[i,0]:.4f}\tPos log prob {pred[i,1]:.4f}\t is positive? {p}\t actual {test_targets[i]}")

Neg log prob -0.2394	Pos log prob -0.5755	 is positive? False	 actual 0
Neg log prob 1.2257	Pos log prob 1.1985	 is positive? False	 actual 1
Neg log prob -0.5615	Pos log prob -0.9328	 is positive? False	 actual 0
Neg log prob -1.1313	Pos log prob -1.5371	 is positive? False	 actual 0
Neg log prob -0.7557	Pos log prob -1.1324	 is positive? False	 actual 0
Neg log prob -0.3621	Pos log prob -0.7021	 is positive? False	 actual 0
Neg log prob 1.3525	Pos log prob 1.4039	 is positive? True	 actual 1
Neg log prob 1.2843	Pos log prob 1.3806	 is positive? True	 actual 1
Neg log prob -0.4123	Pos log prob -0.7538	 is positive? False	 actual 0
Neg log prob -0.8361	Pos log prob -1.2246	 is positive? False	 actual 0
Neg log prob 1.4412	Pos log prob 1.4957	 is positive? True	 actual 1
Neg log prob 1.3352	Pos log prob 1.3437	 is positive? True	 actual 1
Neg log prob 1.6694	Pos log prob 1.7560	 is positive? True	 actual 1
Neg log prob -0.7242	Pos log prob -1.1327	 is positive? False	 actual 0
Neg log p

## 5 - Evaluation  

In [64]:
test_x, test_y = data_generator(val_pos, val_neg, Vocab, 20)
loss, accuracy = model.evaluate(test_x, test_y)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.02875853143632412
Accuracy:  0.9957350492477417


<a name="6"></a>
## 6 - Testing with our Own Input

Finally we will test with our own input. We will see that deepnets are more powerful than the older methods we have used before. Although we go close to 100% accuracy on the first two assignments, the task was way easier. 

In [65]:
# this is used to predict on your own sentnece
def predict(sentence, model):
    inputs = np.array(tweet_to_tensor(sentence, vocab_dict=Vocab))
    
    # Batch size 1, add dimension for batch, to work with the model
    inputs = inputs[None, :]  
    
    # predict with the model
    preds_probs = model.predict(inputs)
    
    # Turn probabilities into categories
    preds = int(preds_probs[0, 1] > preds_probs[0, 0])
    
    sentiment = "negative"
    if preds == 1:
        sentiment = 'positive'

    return preds, sentiment


In [66]:
# try a positive sentence
sentence = "It's such a nice day, I think I'll be taking Sid to Ramsgate for lunch and then to the beach maybe."
tmp_pred, tmp_sentiment = predict(sentence, model)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

print()
# try a negative sentence
sentence = "I hated my day, it was the worst, I'm so sad."
tmp_pred, tmp_sentiment = predict(sentence, model)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"It's such a nice day, I think I'll be taking Sid to Ramsgate for lunch and then to the beach maybe."
***
is positive.

The sentiment of the sentence 
***
"I hated my day, it was the worst, I'm so sad."
***
is negative.
