# Activity 9.02: Building an RNN for predicting Tweets Sentiments

In [2]:
import numpy as np
import pandas as pd

In [4]:
data = pd.read_csv("../Datasets/Tweets.csv")

In [5]:
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [6]:
data.tail()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
14635,569587686496825344,positive,0.3487,,0.0,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,
14636,569587371693355008,negative,1.0,Customer Service Issue,1.0,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,
14637,569587242672398336,neutral,1.0,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",
14638,569587188687634433,negative,1.0,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada)
14639,569587140490866689,neutral,0.6771,,0.0,American,,daviddtwu,,0,@AmericanAir we have 8 ppl so we need 2 know h...,,2015-02-22 11:58:51 -0800,"dallas, TX",


In [7]:
df = data[['text','airline_sentiment']]

In [8]:
df = df[df['airline_sentiment'] != 'neutral']

In [9]:
y = df['airline_sentiment'].map({'negative':0, 'positive':1})\
                           .values

In [10]:
X = df['text']

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence \
    import pad_sequences

In [12]:
tokenizer = Tokenizer(num_words=10000)

In [13]:
tokenizer.fit_on_texts(X)

In [14]:
tokenizer.word_index

{'to': 1,
 'the': 2,
 'i': 3,
 'a': 4,
 'united': 5,
 'you': 6,
 'for': 7,
 'flight': 8,
 'and': 9,
 'on': 10,
 'my': 11,
 'usairways': 12,
 'americanair': 13,
 'is': 14,
 'in': 15,
 'southwestair': 16,
 'of': 17,
 'jetblue': 18,
 'me': 19,
 'your': 20,
 'it': 21,
 'was': 22,
 'not': 23,
 'no': 24,
 'have': 25,
 'at': 26,
 'with': 27,
 'that': 28,
 'this': 29,
 'get': 30,
 'but': 31,
 'be': 32,
 'cancelled': 33,
 'thanks': 34,
 'now': 35,
 'service': 36,
 'are': 37,
 'we': 38,
 'from': 39,
 'an': 40,
 'been': 41,
 'just': 42,
 '2': 43,
 'so': 44,
 'customer': 45,
 'help': 46,
 't': 47,
 'can': 48,
 'time': 49,
 'co': 50,
 'up': 51,
 'hours': 52,
 'http': 53,
 'do': 54,
 'hold': 55,
 'they': 56,
 'out': 57,
 'amp': 58,
 'plane': 59,
 "i'm": 60,
 'us': 61,
 'all': 62,
 'will': 63,
 'why': 64,
 'thank': 65,
 'still': 66,
 'our': 67,
 'delayed': 68,
 'what': 69,
 'when': 70,
 'how': 71,
 'one': 72,
 "can't": 73,
 'flights': 74,
 'call': 75,
 'gate': 76,
 'hour': 77,
 'had': 78,
 'flightled

In [15]:
vocab_size = len(tokenizer.word_index) + 1

In [16]:
encoded_tweets = tokenizer.texts_to_sequences(X)

In [17]:
padded_tweets = pad_sequences(encoded_tweets, maxlen=280, \
                              padding='post')

In [18]:
padded_tweets.shape

(11541, 280)

In [19]:
indices = np.random.permutation(padded_tweets.shape[0])

In [20]:
train_idx = indices[:10000]
test_idx = indices[10000:]

In [21]:
X_train = padded_tweets[train_idx,]
X_test = padded_tweets[test_idx,]
y_train = y[train_idx,]
y_test = y[test_idx,]

In [22]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

In [23]:
embedding_vector_length = 300

In [24]:
model = Sequential()
    
model.add(Embedding(vocab_size, embedding_vector_length, \
                    input_length=280))

model.add(LSTM(units= 50, activation = 'relu', \
               return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(100, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(1, activation='sigmoid'))

In [25]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 300)          3974400   
_________________________________________________________________
lstm (LSTM)                  (None, 280, 50)           70200     
_________________________________________________________________
dropout (Dropout)            (None, 280, 50)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 4,105,101
Trainable params: 4,105,101
Non-trainable params: 0
______________________________________________

In [26]:
model.compile(optimizer='adam', loss='binary_crossentropy', \
              metrics=['accuracy'])

In [27]:
model.fit(X_train, y_train, epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1ea121ff8c8>