In [40]:
import numpy as np 
import pandas as pd

#For data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


#For text preprocessing
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as Layers
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


In [3]:
# Load the two splits of the Kaggle "COVID-19 NLP text classification" dataset.
# The training frame must come from the *train* CSV: reading the test file for
# both frames made the model train and evaluate on identical rows.
train = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',encoding='latin1')
test = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_test.csv',encoding='latin1')

In [4]:
# Preview the first five rows of the training frame
train.head(n=5)

In [6]:
# Summary of the training frame (columns, dtypes, non-null counts)
train.info()

In [8]:
# Number of missing values per column of the training frame
train.isna().sum()

# Data visualization of the sentiment classes




In [9]:
# Count how often each sentiment label occurs, first in train, then in test
dist_train = train['Sentiment'].value_counts()
print("Train", dist_train, sep="\n")

dist_test = test['Sentiment'].value_counts()
print("Test", dist_test, sep="\n")

In [10]:
# Set the figure size BEFORE drawing: seaborn/matplotlib rc settings only
# affect figures created afterwards, so calling sns.set() after barplot()
# left this chart at the default size on a fresh kernel (and only "worked"
# when the cell was run a second time).
sns.set(rc = {'figure.figsize':(20,7)})

# Bar chart of the per-class tweet counts computed in dist_train
sns.barplot(x= dist_train.index, y= dist_train.values, palette = 'magma')

# Title and axis labels so the figure can be read on its own
plt.title('Train Dataset: Sentiment Class Distribution')
plt.xlabel('Sentiments')
plt.ylabel('Count')

In [11]:
# Ten most frequent values of the 'Location' column
t = train['Location'].value_counts().head(10)

# Turn the counts into an explicit two-column frame. The original call passed
# the bare Series with y='Location', which depends on how value_counts()
# names its result (the Series name changed from the column name to 'count'
# in pandas 2.x) and labelled the count axis "Location".
top_locations = t.rename_axis('Location').reset_index(name='Count')

fig = px.bar(top_locations, x='Location', y='Count', color='Location')
fig.show()

In [12]:

# Display the raw tweet text column
train.loc[:, 'OriginalTweet']

# Define features and target variables

In [13]:
# Features are the raw tweet texts, targets the sentiment labels.
# Both are copies, so later edits to X / y do not touch `train`.
X, y = (train[column].copy() for column in ('OriginalTweet', 'Sentiment'))

In [14]:
def data_cleaner(tweet, stopword_list=None):
    """Strip noise from a raw tweet and drop stop words.

    The following are each replaced by a space, in this order: URLs
    (``http...``), HTML tags, digit runs, hashtags and @-mentions. The
    remaining text is split on whitespace, stop words are removed and the
    surviving words are re-joined with single spaces. The case of the
    surviving words is left untouched.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Anything that is not a string (e.g. a NaN from a
        missing value) is treated as an empty tweet.
    stopword_list : iterable of str, optional
        Words to drop. When omitted, the module-level ``stop_words`` list is
        used, so existing ``Series.apply(data_cleaner)`` calls keep working.

    Returns
    -------
    str
        The cleaned tweet; ``""`` when nothing is left.
    """
    if not isinstance(tweet, str):
        return ""

    # Order matters: URLs go first so their digits / '#' / '@' characters are
    # not half-removed by the later patterns.
    noise_patterns = (
        r'http\S+',   # urls
        r'<.*?>',     # html tags
        r'\d+',       # digits
        r'#\w+',      # hashtags
        r'@\w+',      # mentions
    )
    for pattern in noise_patterns:
        tweet = re.sub(pattern, ' ', tweet)

    # Fall back to the notebook-level stop word list when none is passed in.
    if stopword_list is None:
        stopword_list = stop_words

    # Compare case-insensitively: the stop word list is lowercase, so a plain
    # membership test let capitalised words such as "The" or "I" through.
    # A set also makes each lookup O(1).
    lowered_stopwords = {word.lower() for word in stopword_list}

    kept = [word for word in tweet.split() if word.lower() not in lowered_stopwords]
    return " ".join(kept)

In [15]:
# Replace the five sentiment strings in `train` with integer codes 0-4.
# These codes are what the per-class word-cloud cells filter on.
train['Sentiment'] = train['Sentiment'].map({
    'Positive': 0,
    'Negative': 1,
    'Neutral': 2,
    'Extremely Positive': 3,
    'Extremely Negative': 4,
})

In [16]:
# Fetch the NLTK 'stopwords' resource and keep its English list in the
# module-level name `stop_words`. data_cleaner looks this name up when it is
# called, so this cell has to run before data_cleaner is applied.
import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [17]:
# Run data_cleaner over every training tweet, then preview the first rows
X_cleaned = X.apply(func=data_cleaner)
X_cleaned.head(n=5)

In [18]:
from wordcloud import WordCloud

# One word cloud built from all cleaned training tweets joined into one string
all_words = ' '.join(X_cleaned)
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [19]:
# Word cloud of the raw (uncleaned) tweets whose Sentiment code is 0 ('Positive')
Positive = ' '.join(train.loc[train['Sentiment'] == 0, 'OriginalTweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(Positive)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [20]:
# Word cloud of the raw (uncleaned) tweets whose Sentiment code is 1 ('Negative')
Negative = ' '.join(train.loc[train['Sentiment'] == 1, 'OriginalTweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(Negative)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [21]:
# Word cloud of the raw (uncleaned) tweets whose Sentiment code is 2 ('Neutral')
Neutral = ' '.join(train.loc[train['Sentiment'] == 2, 'OriginalTweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(Neutral)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [22]:
# Word cloud of the raw (uncleaned) tweets whose Sentiment code is 3 ('Extremely Positive')
Extremely_Positive = ' '.join(train.loc[train['Sentiment'] == 3, 'OriginalTweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(Extremely_Positive)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [23]:
# Word cloud of the raw (uncleaned) tweets whose Sentiment code is 4 ('Extremely Negative')
Extremely_Negative = ' '.join(train.loc[train['Sentiment'] == 4, 'OriginalTweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(Extremely_Negative)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [24]:
# Fit the Keras tokenizer on the cleaned training tweets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_cleaned)

# One more than the number of distinct words seen by the tokenizer
vocab_size = len(tokenizer.word_index) + 1

# Replace every tweet by its list of integer word indices
X = tokenizer.texts_to_sequences(X_cleaned)

print(f"Vocabulary size: {vocab_size}")
print("\nFor Example:\n")
print(f"Sentence:\n{X_cleaned[1]}")
print(f"\nAfter tokenizing :\n{X[1]}")

# Pad at the end so all sequences share the length of the longest tweet
X = pad_sequences(X, padding='post')
print(f"\nAfter padding :\n{X[1]}")

In [25]:
# Collapse the five sentiment strings into three integer classes
encoding = {'Extremely Negative': 0,'Negative': 0,'Neutral': 1,'Positive':2,'Extremely Positive': 2}

# Class names in the order of their integer ids (0, 1, 2)
labels = ['Negative', 'Neutral', 'Positive']

# Encode without the in-place `replace`, which relies on pandas silently
# downcasting an object column to integers (deprecated behaviour).
# Values not found in `encoding` (e.g. ints from an earlier run of this cell)
# pass through unchanged, and astype() then fails loudly if anything
# non-integer is left instead of feeding bad labels to the model.
y = y.map(lambda label: encoding.get(label, label)).astype('int64')
y

In [26]:
# Start from a clean Keras state
tf.keras.backend.clear_session()

# hyper parameters
EPOCHS, BATCH_SIZE = 2, 32
embedding_dim, units = 16, 256

# Embedding -> bidirectional LSTM -> max over time -> small dense head.
# The last Dense layer has no activation, so the model outputs 3 logits.
model = tf.keras.Sequential()
model.add(Layers.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]))
model.add(Layers.Bidirectional(Layers.LSTM(units, return_sequences=True)))
model.add(Layers.GlobalMaxPool1D())
model.add(Layers.Dropout(0.4))
model.add(Layers.Dense(64, activation="relu"))
model.add(Layers.Dropout(0.4))
model.add(Layers.Dense(3))

# from_logits=True matches the activation-free output layer above
model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

model.summary()

In [27]:
# Train on the padded sequences, holding out 12% of the rows for validation
history = model.fit(
    X,
    y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.12,
)

In [28]:
X_test = test['OriginalTweet'].copy()
y_test = test['Sentiment'].copy()

# Apply the same cleaning and the tokenizer fitted on the training tweets
X_test = X_test.apply(data_cleaner)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad (or truncate) to the width of the training matrix. Without maxlen the
# test set was padded to its own longest tweet, which only matches the
# sequence length the model was built with (X.shape[1]) by coincidence.
X_test = pad_sequences(X_test, padding='post', maxlen=X.shape[1])

# Same 5 -> 3 class encoding as for the training labels. map + astype avoids
# the in-place `replace`, which relies on pandas' deprecated silent
# downcasting, and fails loudly on any label that could not be encoded.
y_test = y_test.map(lambda label: encoding.get(label, label)).astype('int64')

In [31]:
# Raw model outputs for every test tweet (one row of 3 logits per tweet)
pred = model.predict(x=X_test)
pred

In [32]:
# Loss and accuracy on the held-out test set
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss}')
print(f'Test Accuracy: {acc}')

In [35]:
# y_test already holds one integer class id per tweet (it is not one-hot),
# so there is nothing to arg-max: np.argmax(y_test, axis=0) returned a single
# position instead of per-sample labels. Keep the name, but make it the
# label vector itself.
y_test_arg = np.asarray(y_test)

# Predicted class = index of the largest logit in each output row
Y_pred = np.argmax(model.predict(X_test), axis=1)

In [37]:
# Pin the class ids explicitly so the matrix is always len(labels) x len(labels),
# even if a class never occurs in y_test or Y_pred; otherwise the DataFrame
# construction below fails with a shape mismatch.
conf = confusion_matrix(y_test, Y_pred, labels=list(range(len(labels))))

# Rows are the true classes, columns the predicted classes
cm = pd.DataFrame(conf, index=labels, columns=labels)

plt.figure(figsize = (12,7))
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [39]:
# Per-class precision / recall / F1. Passing the class ids explicitly keeps
# them aligned with target_names even when a class is absent from the data
# (otherwise the number of names and classes can disagree and sklearn raises).
print(classification_report(
    y_test,
    Y_pred,
    labels=list(range(len(labels))),
    target_names=labels,
))