In [234]:
# Importing the libraries
import trax
import trax.fastmath.numpy as np
import trax.layers as tl
from nltk.corpus import twitter_samples,stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import random
import string
import os
import re
import nltk

In [235]:
# Function to preprocess the tweets
def preprocess(tweet):
    nltk.download("stopwords",quiet=True)
    stopword = stopwords.words("english")
    punctuations = string.punctuation
    tweet = re.sub(r'\$\w*', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokens = TweetTokenizer(preserve_case=False,reduce_len=True,strip_handles=True).tokenize(tweet)
    stemmed_tokens = []
    for token in tokens:
        if token not in stopword and token not in punctuations:
            stemmed_tokens.append(PorterStemmer().stem(token))
    return stemmed_tokens
    

In [236]:
# Downloading the twitter samples
nltk.download("twitter_samples")

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/shubham/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [237]:
# Loading the positive and negative tweets
positive_tweets = twitter_samples.strings("positive_tweets.json")
negative_tweets = twitter_samples.strings("negative_tweets.json")
num_positive_tweets = len(positive_tweets)
num_negative_tweets = len(negative_tweets)

In [238]:
# Splitting the data into training and testing
validation_split = 20
training_split = 100 - validation_split
tweets_train_pos = positive_tweets[:(num_positive_tweets//100)*training_split]
tweets_train_neg = negative_tweets[:(num_negative_tweets//100)*training_split]
tweets_train =tweets_train_pos+tweets_train_neg
tweets_test_pos =  positive_tweets[(num_positive_tweets//100)*training_split:] 
tweets_test_neg = negative_tweets[(num_negative_tweets//100)*training_split*100:]
tweets_test = tweets_test_pos+tweets_test_neg
labels_train = np.append(np.ones((num_positive_tweets//100)*training_split),np.zeros((num_negative_tweets//100)*training_split))
labels_test = np.append(np.ones(num_positive_tweets - (num_positive_tweets//100)*training_split),np.zeros(num_negative_tweets - (num_negative_tweets//100)*training_split))

In [239]:
# Show a random tweet before and after processing
num = random.randint(0,num_positive_tweets-1)
print(f"Tweet before processing: {positive_tweets[num]}")
print(f"Tweet after processing: ",preprocess(positive_tweets[num]))

Tweet before processing: @metalgear_jp @Kojima_Hideo I want you're T-shirts ! They are so cool ! :D
Tweet after processing:  ['want', 't-shirt', 'cool', ':d']


In [240]:
# Creating the vocabulary
vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2}
for tweet in tweets_train: 
    processed_tweet = preprocess(tweet)
    for word in processed_tweet:
        if word not in vocab: 
            vocab[word] = len(vocab)
    
print("Total words in vocab are",len(vocab))
display(vocab)

Total words in vocab are 9088


{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'followfriday': 3,
 'top': 4,
 'engag': 5,
 'member': 6,
 'commun': 7,
 'week': 8,
 ':)': 9,
 'hey': 10,
 'jame': 11,
 'odd': 12,
 ':/': 13,
 'pleas': 14,
 'call': 15,
 'contact': 16,
 'centr': 17,
 '02392441234': 18,
 'abl': 19,
 'assist': 20,
 'mani': 21,
 'thank': 22,
 'listen': 23,
 'last': 24,
 'night': 25,
 'bleed': 26,
 'amaz': 27,
 'track': 28,
 'scotland': 29,
 'congrat': 30,
 'yeaaah': 31,
 'yipppi': 32,
 'accnt': 33,
 'verifi': 34,
 'rqst': 35,
 'succeed': 36,
 'got': 37,
 'blue': 38,
 'tick': 39,
 'mark': 40,
 'fb': 41,
 'profil': 42,
 '15': 43,
 'day': 44,
 'one': 45,
 'irresist': 46,
 'flipkartfashionfriday': 47,
 'like': 48,
 'keep': 49,
 'love': 50,
 'custom': 51,
 'wait': 52,
 'long': 53,
 'hope': 54,
 'enjoy': 55,
 'happi': 56,
 'friday': 57,
 'lwwf': 58,
 'second': 59,
 'thought': 60,
 '’': 61,
 'enough': 62,
 'time': 63,
 'dd': 64,
 'new': 65,
 'short': 66,
 'enter': 67,
 'system': 68,
 'sheep': 69,
 'must': 70,
 'buy':

In [241]:
# Function to convert tweet to tensor
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__'):
    preprocessed_tweet = preprocess(tweet)
    tensor = []
    for word in preprocessed_tweet:
        if word not in vocab_dict:
            tensor.append(vocab_dict[unk_token])
        else:
            tensor.append(vocab_dict[word])
    return tensor

In [242]:
# Show a random tweet before and after tensor conversion
num = random.randint(0,len(tweets_test)-1)
print("Actual tweet is\n", tweets_test[1])
print("\nTensor of tweet:\n", tweet_to_tensor(tweets_test[1], vocab_dict=vocab))

Actual tweet is
 @heyclaireee is back! thnx God!!! i'm so happy :)

Tensor of tweet:
 [443, 2, 303, 566, 56, 9]


In [243]:
# Function to create the data generator
def data_generator(data_pos, data_neg, batch_size, loop, vocab_dict, shuffle=False):
    n_to_take = batch_size // 2
    pos_index = 0
    neg_index = 0
    len_data_pos = len(data_pos)
    len_data_neg = len(data_neg)
    pos_index_lines = list(range(len_data_pos))
    neg_index_lines = list(range(len_data_neg))
    if shuffle:
        random.shuffle(pos_index_lines)
        random.shuffle(neg_index_lines) 
    stop = False
    while not stop:  
        batch = []
        for i in range(n_to_take):

            if pos_index >= len_data_pos: 
                if not loop:
                    stop = True;
                    break;
                pos_index = 0
                if shuffle:
                    random.shuffle(pos_index_lines)
            tweet = data_pos[pos_index_lines[pos_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)
            batch.append(tensor)
            pos_index = pos_index + 1
        for i in range(n_to_take):
            if neg_index >= len_data_neg:
                if not loop:
                    stop = True;
                    break;
                neg_index = 0
                
                if shuffle:
                    random.shuffle(neg_index_lines)
            tweet = data_neg[neg_index_lines[neg_index]]
            tensor = tweet_to_tensor(tweet, vocab_dict)
            batch.append(tensor)
            neg_index += 1
        if stop:
            break;
        pos_index += n_to_take
        neg_index += n_to_take
        max_len = max([len(t) for t in batch]) 
        tensor_pad_l = []
        for tensor in batch:
            n_pad = max_len - len(tensor)
            pad_l = [0]*n_pad
            tensor_pad = tensor + pad_l
            tensor_pad_l.append(tensor_pad)
        inputs = np.array(tensor_pad_l)
        target_pos = [1]*n_to_take
        target_neg = [0]*n_to_take
        target_l = target_pos + target_neg
        targets = np.array(target_l)
        example_weights = np.ones_like(targets)
        yield inputs, targets, example_weights

In [244]:
# Function to create the model
def classifier(vocab_size=len(vocab), embedding_dim=256, output_dim=2, mode='train'):
    embed_layer = tl.Embedding(
    vocab_size=vocab_size,
    d_feature=embedding_dim)  
    mean_layer = tl.Mean(axis=1)
    dense_output_layer = tl.Dense(n_units = output_dim)
    log_softmax_layer = tl.LogSoftmax()
    model = tl.Serial(
      embed_layer,
      mean_layer,
      dense_output_layer, 
      log_softmax_layer
    )
    return model

In [245]:
# Create the different generators
def train_generator(batch_size, shuffle = False):
    return data_generator(tweets_train_pos, tweets_train_neg, batch_size, False, vocab, shuffle)

def val_generator(batch_size, shuffle = False):
    return data_generator(tweets_test_pos, tweets_test_neg, batch_size, False, vocab, shuffle)

def test_generator(batch_size, shuffle = False):
    return data_generator(tweets_test_pos, tweets_test_neg, batch_size, False, vocab, shuffle)

In [246]:
# Define the training and evaluation tasks
from trax.supervised import training 
batch_size = 32
train_task = training.TrainTask(
    labeled_data=train_generator(batch_size=batch_size, shuffle=True),
    loss_layer=tl.CrossEntropyLoss(),
    optimizer=trax.optimizers.Adam(0.01),
    n_steps_per_checkpoint=10,
)

eval_task = training.EvalTask(labeled_data=train_generator(batch_size=batch_size,shuffle=True),metrics=[tl.CrossEntropyLoss(),tl.Accuracy()])

model = classifier()

In [247]:
# Define the training loop
def train_model(classifier, train_task, eval_task, n_steps):
    training_loop = training.Loop(
                                classifier,
                                train_task, 
                                eval_tasks = eval_task) 
    training_loop.run(n_steps = n_steps)
    return training_loop

In [248]:
# Train the model
training_loop = train_model(model, train_task, eval_task, 100)

Will not write evaluation metrics, because output_dir is None.
Did not save checkpoint as output_dir is None

Step      1: Total number of trainable weights: 2327042
Step      1: Ran 1 train steps in 0.37 secs
Step      1: train CrossEntropyLoss |  0.69444042
Step      1: eval  CrossEntropyLoss |  0.70644379
Step      1: eval          Accuracy |  0.50000000
Did not save checkpoint as output_dir is None

Step     10: Ran 9 train steps in 1.50 secs
Step     10: train CrossEntropyLoss |  0.63574749
Step     10: eval  CrossEntropyLoss |  0.55278647
Step     10: eval          Accuracy |  0.96875000
Did not save checkpoint as output_dir is None

Step     20: Ran 10 train steps in 1.47 secs
Step     20: train CrossEntropyLoss |  0.45609015
Step     20: eval  CrossEntropyLoss |  0.30403709
Step     20: eval          Accuracy |  0.96875000
Did not save checkpoint as output_dir is None

Step     30: Ran 10 train steps in 0.89 secs
Step     30: train CrossEntropyLoss |  0.26376367
Step     30: ev

In [250]:
# Function to predict the sentiment of a tweet
def predict(sentence):
    inputs = np.array(tweet_to_tensor(sentence, vocab_dict=vocab))
    inputs = inputs[None, :]  
    preds_probs = model(inputs)
    preds = int(preds_probs[0, 1] > preds_probs[0, 0])
    sentiment = "negative"
    if preds == 1:
        sentiment = 'positive'
    return preds, sentiment

In [271]:
# Predict the sentiment of your own sentence
sentence = "That was horrible."
tmp_pred, tmp_sentiment = predict(sentence)
print(f"The sentiment of the sentence \n***\n\"{sentence}\"\n***\nis {tmp_sentiment}.")

The sentiment of the sentence 
***
"That was horrible."
***
is negative.
