In [1]:
import pandas as pd

In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index. Passing index_col=0 would drop it --
# confirm nothing relies on the 6-column shape first.
df = pd.read_csv('sentiments.csv')

In [3]:
# Preview the first rows to check the column names and label format.
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(41157, 6)

## data cleaning

In [5]:
# Materialise the raw tweet texts as a plain Python list, in row order.
sentences = list(df['OriginalTweet'])

In [6]:
# Sanity check: expect one sentence per DataFrame row.
len(sentences)

41157

In [7]:
# Spot-check one raw tweet before any cleaning is applied.
sentences[2]

'Coronavirus Australia: Woolworths to give elderly, disabled dedicated shopping hours amid COVID-19 outbreak https://t.co/bInCA9Vp8P'

In [8]:
import string

# Translation table that deletes every ASCII punctuation character. It does not
# depend on the sentence being cleaned, so it is built once rather than once
# per tweet.
table = str.maketrans('', '', string.punctuation)

# Strip punctuation from every raw tweet. This also removes the ':', '/' and
# '.' inside URLs (e.g. 'https://t.co/x' becomes 'httpstcox') and the '@'/'#'
# prefixes of mentions and hashtags.
cleaned_sents = [sent.translate(table) for sent in sentences]


In [9]:
# Spot-check: punctuation is gone, but URLs survive as 'httpstco...' tokens.
cleaned_sents[0]

'MeNyrbie PhilGahan Chrisitv httpstcoiFz9FAn2Pa and httpstcoxX6ghGFzCC and httpstcoI2NlzdxNo8'

In [10]:
# Tokenise each cleaned tweet on whitespace and drop every token that begins
# with 'http' (what is left of a URL once its punctuation has been stripped).
url_free_sents = [
    [token for token in cleaned.split() if not token.startswith('http')]
    for cleaned in cleaned_sents
]
    

In [11]:
# Re-assemble each token list into a single space-separated string.
url_free_sents2 = [' '.join(tokens) for tokens in url_free_sents]
    

In [12]:
# Spot-check the tweet at index 2 again: the trailing URL token has been removed.
url_free_sents2[2]

'Coronavirus Australia Woolworths to give elderly disabled dedicated shopping hours amid COVID19 outbreak'

In [13]:
import nltk
import re
from nltk.stem.porter import PorterStemmer
# Fetch the NLTK stop-word list.
# NOTE(review): in the recorded run this download failed with a network error
# (see the output below) and the return value is not checked, so
# `stopwords.words(...)` presumably only works if the corpus is already
# available locally -- confirm before re-enabling the preprocessing cell.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [14]:
# The preprocessing loop below is disabled; `corpus` is loaded from a cached
# pickle at the bottom of this cell instead.
# NOTE(review): if the loop is re-enabled, note that the stemmed and
# stop-word-filtered tokens are assigned to `reviews`, but the following line
# joins `review` (the unfiltered tokens), so the filtering result is discarded.
# The tokenizer vocabulary displayed later ('the', 'to', 'and', ...) is
# consistent with stop words not having been removed -- confirm how
# 'corpus.pkl' was produced.
# corpus = []
# porter = PorterStemmer()
# for i in range(len(url_free_sents2)):
#     review = re.sub('[^a-zA-Z]',' ',url_free_sents2[i])
#     review = review.lower()
#     review = review.split()
    
#     reviews = [porter.stem(word) for word in review if word not in stopwords.words('english')]
#     review = ' '.join(review)
    
#     corpus.append(review)
import pickle
# pickle.load can execute arbitrary code; only load files you created yourself.
with open('corpus.pkl','rb') as f:
    corpus = pickle.load(f)

In [15]:
# Sanity check: the cached corpus should have one entry per tweet.
len(corpus)

41157

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Words that were not seen while fitting are mapped to the "<OOV>" token.
# No num_words limit is passed, so the vocabulary size is not capped here.
tokenizer = Tokenizer(oov_token = "<OOV>")

In [18]:
# Build the word index from every document in `corpus`, then persist the
# fitted tokenizer so the same word -> index mapping can be reloaded later.
# NOTE(review): fitting happens before the train/test split further down, so
# the vocabulary also covers words that occur only in test tweets.
tokenizer.fit_on_texts(corpus)
with open('tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)

In [19]:
# Convert each document into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(corpus)

In [20]:
# Sanity check: one index sequence per document.
len(sequences)

41157

In [21]:
# Spot-check the encoded form of the first document.
sequences[0]

[22650, 22651, 12756, 4, 4]

In [22]:
# No maxlen is given, so every sequence is padded to the length of the longest
# one (62 in the recorded run, see the length check further down). Padding
# zeros are added at the front, as the array preview later in the notebook shows.
padded = pad_sequences(sequences)

In [23]:
# List the distinct label values before they are encoded as integers.
df.Sentiment.unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [24]:
# Ordinal encoding of the five sentiment classes, running from the most
# negative label (0) to the most positive one (4).
# NOTE(review): the name `dict` shadows the built-in type; it is kept because
# the following cell passes `dict` to Series.map.
dict = {
    label: code
    for code, label in enumerate((
        'Extremely Negative',
        'Negative',
        'Neutral',
        'Positive',
        'Extremely Positive',
    ))
}

In [25]:
# Encode the sentiment labels as integers in place. The guard makes this cell
# safe to re-run: mapping the already-encoded integer column through the
# label -> code table a second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Sentiment']):
    df['Sentiment'] = df['Sentiment'].map(dict)

In [26]:
# Collect the encoded sentiment values as a plain list, in row order.
labels = list(df['Sentiment'])

In [27]:
# Sanity check: one label per tweet.
len(labels)

41157

In [28]:
# training_size = int(len(corpus)*0.8)

In [29]:
# training_sequences = padded[:training_size]
# training_labels = labels[:training_size]

# testing_sequences = padded[training_size:]
# testing_labels = labels[training_size:]

from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences (and their labels) for testing; the
# fixed random_state keeps the partition the same across runs.
(
    training_sequences,
    testing_sequences,
    training_labels,
    testing_labels,
) = train_test_split(
    padded,
    labels,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Size of the training split.
len(training_labels)

32925

In [31]:
# Length of one padded training sequence (the model input length).
len(training_sequences[1])

62

In [32]:
# NOTE(review): among the visible cells only the commented-out model definition
# refers to vocab_size, and embedding_vectors is not referenced at all (that
# definition hard-codes 40) -- confirm whether these constants are still needed.
vocab_size = 60000
embedding_vectors = 40
import tensorflow as tf

In [33]:
# The layer stack below is left commented out for reference; the active code
# loads an already-saved model from 'sentiment_model.h5' instead of building one.
# model = tf.keras.models.Sequential([
#     tf.keras.layers.Embedding(vocab_size,40,input_length=62),
#     tf.keras.layers.LSTM(100),
#     tf.keras.layers.Dense(5,activation='softmax')
# ])
from tensorflow.keras.models import load_model
model = load_model('sentiment_model.h5')

In [34]:
# model.compile(optimizer='adam',loss= tf.keras.losses.sparse_categorical_crossentropy,metrics=['accuracy'])

In [35]:
import numpy as np

# Make sure all four splits are ndarrays (the labels were built as a Python
# list earlier in the notebook).
training_sequences, training_labels, testing_sequences, testing_labels = map(
    np.asarray,
    (training_sequences, training_labels, testing_sequences, testing_labels),
)

In [36]:
# Preview the padded test matrix; the leading zeros are padding.
testing_sequences

array([[   0,    0,    0, ...,    2,  161,  117],
       [   0,    0,    0, ...,    3, 1568,  294],
       [   0,    0,    0, ...,    3, 1368,    5],
       ...,
       [   0,    0,    0, ...,   10,  428,    5],
       [   0,    0,    0, ...,    5,    9,  335],
       [   0,    0,    0, ...,   22,    5,  132]])

In [37]:
# model.fit(training_sequences,training_labels,epochs=5)

In [38]:
# Score the loaded model on the held-out split.
# NOTE(review): the two numbers are presumably [loss, accuracy], matching the
# commented-out compile(..., metrics=['accuracy']) call -- confirm against the
# saved model's compile configuration.
model.evaluate(testing_sequences,testing_labels)



[0.3036256432533264, 0.9141156673431396]

In [39]:
# Model outputs for every test sequence; argmax over each row is taken below
# to obtain the predicted class index.
pred = model.predict(testing_sequences)

In [40]:
# Show the predicted class index for the first ten test samples, one per line.
for sample_idx in range(10):
    print(pred[sample_idx].argmax())

1
0
3
3
1
3
3
1
2
4


In [41]:
# Ground-truth labels of the same ten samples, for comparison with the
# predictions printed above.
testing_labels[:10]

array([2, 0, 3, 3, 1, 3, 3, 1, 2, 4])

In [42]:
# import pickle
# with open('corpus.pkl','wb') as f:
#     pickle.dump(corpus,f)

In [43]:
# Predict for a single test sample. Slicing with 1:2 keeps the batch axis, so
# the input has shape (1, sequence_length) -- the same 2-D layout used for the
# full test-set predict call -- without hard-coding the padded length (62) or
# adding an extra trailing axis.
p = model.predict(testing_sequences[1:2])

In [44]:
# Index of the highest-scoring class for this single sample.
np.argmax(p)

0

In [45]:
# Ground-truth label of the same sample, for comparison.
testing_labels[1]

0

In [46]:
# NOTE(review): this writes the model back to the same file it was loaded from.
# The fit call is commented out in the active cells, so this is effectively a
# re-save of the loaded model -- confirm it is still wanted.
tf.keras.models.save_model(model,'sentiment_model.h5')

In [47]:
# Reload the tokenizer that was pickled earlier in this notebook.
# pickle.load can execute arbitrary code; only use it on files you created yourself.
with open('tokenizer.pkl','rb') as f:
    tk = pickle.load(f)

In [48]:
# Inspect the word -> index mapping of the reloaded tokenizer.
# NOTE(review): this displays the whole vocabulary; consider showing only a
# slice to keep the saved output small.
tk.word_index


{'<OOV>': 1,
 'the': 2,
 'to': 3,
 'and': 4,
 'covid': 5,
 'of': 6,
 'a': 7,
 'in': 8,
 'coronavirus': 9,
 'for': 10,
 'is': 11,
 'are': 12,
 'i': 13,
 'you': 14,
 'on': 15,
 'this': 16,
 'prices': 17,
 'at': 18,
 'food': 19,
 'supermarket': 20,
 'store': 21,
 'with': 22,
 'we': 23,
 'it': 24,
 'that': 25,
 'grocery': 26,
 'have': 27,
 'as': 28,
 'be': 29,
 'people': 30,
 'from': 31,
 'amp': 32,
 'all': 33,
 's': 34,
 'your': 35,
 'not': 36,
 'will': 37,
 'consumer': 38,
 'my': 39,
 'can': 40,
 'our': 41,
 'they': 42,
 'out': 43,
 'has': 44,
 'up': 45,
 'by': 46,
 'more': 47,
 'or': 48,
 'shopping': 49,
 'if': 50,
 'but': 51,
 'online': 52,
 'how': 53,
 'pandemic': 54,
 'their': 55,
 'during': 56,
 'so': 57,
 'now': 58,
 't': 59,
 'no': 60,
 'get': 61,
 'about': 62,
 'what': 63,
 'who': 64,
 'need': 65,
 'us': 66,
 'workers': 67,
 'just': 68,
 'panic': 69,
 'do': 70,
 'sanitizer': 71,
 'like': 72,
 'was': 73,
 'an': 74,
 'time': 75,
 'when': 76,
 'demand': 77,
 'there': 78,
 'go': 79,


In [51]:
# Class scores for the first test sample. Slicing with :1 keeps the batch axis,
# giving a (1, sequence_length) input -- the same 2-D layout used for the full
# test-set predict call -- without hard-coding the padded length (62) or adding
# an extra trailing axis.
model.predict(testing_sequences[:1])

array([[1.5562946e-03, 9.4710463e-01, 4.2617016e-02, 8.3322711e-03,
        3.8972634e-04]], dtype=float32)

In [67]:
# Inspect a raw tweet that still contains line-break characters and URLs.
sentences[15]

'Lines at the grocery store have been unpredictable, but is eating out a safe alternative? \r\r\n\r\r\nFind out more about whether you should be avoiding restaurants right now:  https://t.co/9idZSis5oQ\r\r\n\r\r\n#coronavirus #covid19 https://t.co/ZHbh898lf6'