***Step 1 - Importing the Dataset***

In [1]:
! pip3 install wget

import wget
wget.download("https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/sentiment140-subset.csv.zip")

!unzip -n sentiment140-subset.csv.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=57531f075375320c9e50209b8db7a43edaa9727ff8d93abed6c8f9a3cccd3408
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Archive:  sentiment140-subset.csv.zip
  inflating: sentiment140-subset.csv  


***Step 2 - Loading the Dataset***

In [2]:
! pip3 install pandas
import pandas as pd

data = pd.read_csv('sentiment140-subset.csv', nrows=50000)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


***Step 3 - Reading the Dataset***

In [3]:
data.columns

Index(['polarity', 'text'], dtype='object')

In [4]:
print(len(data))
data.head()

50000


Unnamed: 0,polarity,text
0,0,@kconsidder You never tweet
1,0,Sick today coding from the couch.
2,1,"@ChargerJenn Thx for answering so quick,I was ..."
3,1,Wii fit says I've lost 10 pounds since last ti...
4,0,@MrKinetik Not a thing!!! I don't really have...


***Step 4 - Processing the Dataset***

In [5]:
import re
import tensorflow as tf

max_features = 4000

In [6]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)

In [7]:
X = tokenizer.texts_to_sequences(data['text'].values)

In [8]:
X = tf.keras.preprocessing.sequence.pad_sequences(X)

In [9]:
X.shape

(50000, 35)

***Step 5 - Create a Model***

In [10]:
embed_dim = 256
lstm_out = 196

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(tf.keras.layers.SpatialDropout1D(0.4))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_out, dropout=0.05, recurrent_dropout=0.2)))
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])



In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 35, 256)           1024000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 35, 256)          0         
 ropout1D)                                                       
                                                                 
 bidirectional (Bidirectiona  (None, 392)              710304    
 l)                                                              
                                                                 
 dense (Dense)               (None, 2)                 786       
                                                                 
Total params: 1,735,090
Trainable params: 1,735,090
Non-trainable params: 0
_________________________________________________________________


***Step 6 - Initialize Train and Test Data***

In [12]:
import numpy as np
! pip3 install sklearn
from sklearn.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
Y = pd.get_dummies(data['polarity'])

In [14]:
result_dict = {0: 'Negative', 1: 'Positive'}
y_arr = np.vectorize(result_dict.get)(Y.columns)

In [15]:
Y = Y.values

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

In [17]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(33500, 35) (33500, 2)
(16500, 35) (16500, 2)


***Step 7 - Training the Model***

In [18]:
model.fit(X_train, Y_train, epochs=20, batch_size=128, verbose=2)

Epoch 1/20
262/262 - 102s - loss: 0.5391 - accuracy: 0.7245 - 102s/epoch - 390ms/step
Epoch 2/20
262/262 - 83s - loss: 0.4473 - accuracy: 0.7932 - 83s/epoch - 316ms/step
Epoch 3/20
262/262 - 79s - loss: 0.4132 - accuracy: 0.8106 - 79s/epoch - 302ms/step
Epoch 4/20
262/262 - 80s - loss: 0.3816 - accuracy: 0.8275 - 80s/epoch - 306ms/step
Epoch 5/20
262/262 - 80s - loss: 0.3521 - accuracy: 0.8407 - 80s/epoch - 306ms/step
Epoch 6/20
262/262 - 81s - loss: 0.3250 - accuracy: 0.8534 - 81s/epoch - 308ms/step
Epoch 7/20
262/262 - 81s - loss: 0.2984 - accuracy: 0.8682 - 81s/epoch - 309ms/step
Epoch 8/20
262/262 - 81s - loss: 0.2752 - accuracy: 0.8767 - 81s/epoch - 309ms/step
Epoch 9/20
262/262 - 80s - loss: 0.2544 - accuracy: 0.8877 - 80s/epoch - 305ms/step
Epoch 10/20
262/262 - 80s - loss: 0.2320 - accuracy: 0.8992 - 80s/epoch - 305ms/step
Epoch 11/20
262/262 - 81s - loss: 0.2121 - accuracy: 0.9095 - 81s/epoch - 308ms/step
Epoch 12/20
262/262 - 81s - loss: 0.1945 - accuracy: 0.9194 - 81s/epoch 

<keras.callbacks.History at 0x7f45301690d0>

In [19]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()

NameError: ignored

***Step 8 - Computing the Accuracy***

In [20]:
score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=64)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

258/258 - 7s - loss: 1.2859 - accuracy: 0.7436 - 7s/epoch - 26ms/step
score: 1.29
acc: 0.74


***Step 9 - Perform Sentiment Analysis***

In [21]:
twt = ['I do not recommend this product']

In [22]:
twt = tokenizer.texts_to_sequences(twt)

In [23]:
twt = tf.keras.preprocessing.sequence.pad_sequences(twt, maxlen=X.shape[1], dtype='int32', value=0)

In [24]:
sentiment = model.predict(twt, batch_size=1)[0]
print(sentiment)

if(np.argmax(sentiment) == 0):
    print(y_arr[0])
elif (np.argmax(sentiment) == 1):
    print(y_arr[1])

[9.9999976e-01 1.9359808e-07]
Negative
