

***

## COVID-19 Tweet Sentiment Prediction  

Given *tweets about the COVID-19 pandemic*, let's try to predict the **sentiment** of a given tweet.  
  
We will use a TensorFlow RNN to make our predictions.

# Let's GO

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import re
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf

In [None]:
import spacy

# Load spaCy's small English pipeline; the `en_core_web_sm` model package
# must be installed for this call to succeed.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the train and test splits; both files are decoded as Latin-1.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ("Corona_NLP_train.csv", "Corona_NLP_test.csv")
)

In [None]:
# Preview the training frame (rendered as the cell's output).
train_df

In [None]:
# Preview the test frame (rendered as the cell's output).
test_df

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test_df.info()

# EDA

In [None]:
# Number of tweets per calendar day.
# `TweetAt` is parsed into real datetimes before grouping: grouping on the raw
# strings orders the days lexicographically, which scrambles the time axis for
# day-first dates (e.g. "01-04-2020" would sort before "16-03-2020").
# NOTE(review): assumes `TweetAt` holds day-first date strings — confirm against the CSV.
tweet_dates = pd.to_datetime(train_df['TweetAt'], dayfirst=True)
time = tweet_dates.groupby(tweet_dates).size()

# x and y are passed as arrays, so no data_frame argument is needed; the axis
# titles are set explicitly because bare arrays carry no column names.
fig = px.line(x=time.index,
              y=time.values,
              title='Date of tweets',
              labels={'x': 'TweetAt', 'y': 'tweets'},
              template='simple_white')

fig.update_layout(
    font=dict(size=15, family="Times New Roman"),
)

fig.show()

In [None]:
# Bar chart of how many training tweets carry each raw sentiment label.
plt.figure(figsize=(10, 5))
sns.countplot(data=train_df, x="Sentiment")

### There are 5 sentiments in the dataset: Positive, Negative, Neutral, Extremely Positive and Extremely Negative. Positive and Negative account for 27.8% and 24.1% of the tweets, respectively.

In [None]:
# Ten most frequent values of the `Location` column, drawn as a vertical bar
# chart with one colour per location and the count printed on each bar.
location = train_df['Location'].value_counts().nlargest(n=10)

top_places, top_counts = location.index, location.values
fig = px.bar(
    x=top_places,
    y=top_counts,
    text=top_counts,
    color=top_places,
    orientation='v',
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig.show()

### We can see that the UK and the US are the top two locations for COVID-19 tweets, followed by India and Australia.

In [None]:
# One frame per raw sentiment label.
# Rows are selected with boolean masks rather than `.loc[np.where(...)]`:
# `np.where` yields positional indices while `.loc` is label-based, so the old
# form is only correct when the frame has a default 0..n-1 index.
# `reset_index()` keeps the original row label in an `index` column, as before.
positive = train_df[train_df['Sentiment'] == 'Positive'].reset_index()
negative = train_df[train_df['Sentiment'] == 'Negative'].reset_index()
neutral = train_df[train_df['Sentiment'] == 'Neutral'].reset_index()
extremely_positive = train_df[train_df['Sentiment'] == 'Extremely Positive'].reset_index()
extremely_negative = train_df[train_df['Sentiment'] == 'Extremely Negative'].reset_index()

In [None]:
# Character length of every training tweet.
train_df['Length'] = train_df['OriginalTweet'].apply(len)

group_labels = ['extremely_positive','positive','neutral','negative','extremely_negative']

colors = ['navy', 'blue', 'lightblue','lightsalmon','red']

# Raw `Sentiment` values, in the same order as `group_labels`.
sentiment_names = ['Extremely Positive', 'Positive', 'Neutral',
                   'Negative', 'Extremely Negative']

# Lengths are read straight from `train_df`. The per-sentiment frames built in
# the previous cell are copies taken before the `Length` column existed, so
# indexing them with 'Length' raises KeyError.
hist_data = [
    train_df.loc[train_df['Sentiment'] == name, 'Length'].reset_index(drop=True)
    for name in sentiment_names
]

In [None]:
# Distribution curves of tweet length, one per sentiment group; the histogram
# bars are switched off so only the curves are drawn.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False, colors = colors)
fig.show()

### Tweets labelled 'extremely negative' and 'extremely positive' tend to be the longest, followed by 'negative' and 'positive' tweets.

# Preprocessing

In [None]:
# Copy the text and label columns so later cleaning and encoding steps never
# write back into the source frames.
train_inputs, train_labels = (
    train_df[column].copy() for column in ('OriginalTweet', 'Sentiment')
)
test_inputs, test_labels = (
    test_df[column].copy() for column in ('OriginalTweet', 'Sentiment')
)

In [None]:
# Collapse the five raw sentiment labels into three integer classes:
# 0 = negative, 1 = neutral, 2 = positive.
sentiment_encoding = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2
}

# The explicit cast guarantees integer class ids without relying on `replace`
# silently downcasting an object column (deprecated pandas behaviour). It also
# fails right here, with a clear error, if a label is missing from the mapping.
train_labels = train_labels.replace(sentiment_encoding).astype('int64')
test_labels = test_labels.replace(sentiment_encoding).astype('int64')

In [None]:
# Preview the raw tweet texts before cleaning.
train_inputs

In [None]:
#stop_words = set(stopwords.words('english'))


In [None]:


def process_tweet(tweet, stop_words=None):
    """Clean a raw tweet before tokenization.

    URLs, HTML tags, digit runs, hashtags and mentions are each replaced by a
    space; the text is then split on whitespace, stop words are dropped and
    the remaining words are re-joined with single spaces.

    Args:
        tweet: Raw tweet text.
        stop_words: Optional collection of lowercase stop words. When omitted,
            the stop-word set of the module-level spaCy pipeline ``nlp`` is
            used.

    Returns:
        The cleaned tweet as one space-separated string (possibly empty).
    """
    if stop_words is None:
        # `str.split()` yields plain strings, which have no `is_stop`
        # attribute (that lives on spaCy tokens), so the stop-word check is
        # done against spaCy's stop-word set instead.
        stop_words = nlp.Defaults.stop_words

    # remove urls
    tweet = re.sub(r'http\S+', ' ', tweet)

    # remove html tags
    tweet = re.sub(r'<.*?>', ' ', tweet)

    # remove digits
    tweet = re.sub(r'\d+', ' ', tweet)

    # remove hashtags
    tweet = re.sub(r'#\w+', ' ', tweet)

    # remove mentions
    tweet = re.sub(r'@\w+', ' ', tweet)

    # remove stop words; compare case-insensitively because the stop-word
    # list is lowercase
    words = tweet.split()
    return " ".join(word for word in words if word.lower() not in stop_words)

In [None]:
# Clean every tweet in both splits with the same function.
train_inputs, test_inputs = (
    tweets.map(process_tweet) for tweets in (train_inputs, test_inputs)
)

In [None]:
# Preview the tweets after cleaning.
train_inputs

In [None]:
# Longest training tweet measured in word tokens, the unit the padded
# sequences are expressed in. Counting characters with `len(tweet)` sets the
# padded length far beyond the real token count.
# `text_to_word_sequence` is the splitter a default-configured Keras
# `Tokenizer` applies, so the count matches the sequences it will produce.
max_seq_length = int(
    train_inputs.apply(
        lambda tweet: len(tf.keras.preprocessing.text.text_to_word_sequence(tweet))
    ).max()
)

In [None]:
# Build the vocabulary from the training tweets only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_inputs)

# +1 because the tokenizer's word indices start at 1, leaving 0 for padding.
vocab_length = 1 + len(tokenizer.word_index)

# Turn each tweet into a list of word indices, then pad at the end (post
# padding) to a common length of `max_seq_length`.
train_inputs, test_inputs = (
    pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_seq_length,
        padding='post',
    )
    for texts in (train_inputs, test_inputs)
)

In [None]:
# Report the two sizes that determine the model's input and embedding shapes.
print(f"Vocab length: {vocab_length}")
print(f"Max sequence length: {max_seq_length}")

# Modeling

In [None]:
# Shape of the padded training matrix.
train_inputs.shape

In [None]:
embedding_dim = 16    # size of each learned word vector

# One padded sequence of word indices per example.
inputs = tf.keras.Input(shape=(max_seq_length,), name='input_layer')

# `input_length` is not passed: the Input layer above already fixes the
# sequence length, and the argument is deprecated in recent Keras releases.
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_length,
    output_dim=embedding_dim,
    name='word_embedding'
)(inputs)

# return_sequences=True keeps the per-timestep outputs so they can be pooled.
gru_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(units=256, return_sequences=True, name='gru_layer'),
    name='bidirectional_layer'
)(embedding)

# Reduce the time axis by taking the maximum of each feature.
max_pooling = tf.keras.layers.GlobalMaxPool1D(name='max_pooling')(gru_layer)

dropout_1 = tf.keras.layers.Dropout(0.4, name='dropout_1')(max_pooling)

dense = tf.keras.layers.Dense(64, activation='relu', name='dense')(dropout_1)

dropout_2 = tf.keras.layers.Dropout(0.4, name='dropout_2')(dense)

# Three output units: one per encoded sentiment class.
outputs = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(dropout_2)


model = tf.keras.Model(inputs=inputs, outputs=outputs)

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

tf.keras.utils.plot_model(model)

# Training

In [None]:
# Training hyper-parameters.
epochs = 2
batch_size = 32

# Targets are integer class ids, hence the sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 12% of the training data is held out for validation.
history = model.fit(
    x=train_inputs,
    y=train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.12,
    verbose=2,
)

# Results

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
loss_curves = ['loss', 'val_loss']
axis_titles = {'index': "epoch", 'value': "loss"}

fig = px.line(history.history, y=loss_curves, labels=axis_titles)
fig.show()

In [None]:
# Loss and accuracy (the metric set at compile time) on the test split.
model.evaluate(test_inputs, test_labels)

# Conclusion  

I used standard data-preparation and visualization libraries (Pandas, NumPy, Plotly, Seaborn and Matplotlib), together with regular expressions, spaCy and the Keras preprocessing tokenizer for text cleaning and preprocessing. The model is a sequence model built from a bidirectional GRU followed by global max pooling for downsampling.
It reached an accuracy of 87 percent.

***

