# Link to the dataset:
https://www.kaggle.com/datatattle/covid-19-nlp-text-classification

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from textblob import TextBlob
from wordcloud import WordCloud

## Read the data

In [None]:
# Load the training CSV, decoding it as ISO-8859-1.
df = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding="ISO-8859-1",
)

## Explore the data

In [None]:
# Cell output: a one-line summary of the frame's dimensions (rows, columns).
f'The data has {df.shape[0]} Rows and {df.shape[1]} Columns'

In [None]:
# Cell output: preview the first few rows of the raw data.
df.head()

In [None]:
# Cell output: number of missing values in each column.
df.isna().sum()

## Plot the Sentiment column

In [None]:
# Horizontal bar chart with one bar per Sentiment label, showing its row count.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, y='Sentiment')
plt.yticks(fontsize=13)
plt.show()

## Remove the unnecessary columns

In [None]:
# Keep only the tweet text and its label. Selecting by name (as the Keras
# section of this notebook does) is robust to column reordering, unlike a
# positional slice.
# NOTE(review): assumes 'OriginalTweet' and 'Sentiment' are exactly the columns
# from position 4 onward in the CSV — confirm against df.head().
df = df[['OriginalTweet', 'Sentiment']]

In [None]:
# Cell output: the distinct labels present in the Sentiment column.
df['Sentiment'].unique()

## Text Preprocessing

In [None]:
# Create a function to clean the tweets
def cleanTxt(text):
    """Strip Twitter-specific noise from a tweet.

    Removes, in order: @mentions, '#' characters (the hashtag word itself is
    kept), the retweet marker 'RT' with its trailing whitespace, and
    http/https links.

    Args:
        text: The raw tweet text.

    Returns:
        The tweet with the elements above removed. Surrounding whitespace is
        left as-is.
    """
    # ASCII hyphen in the digit range (an en dash would restrict the class to
    # the literal characters '0', '–' and '9'); '_' is valid in handles.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Removing @mentions
    text = re.sub(r'#', '', text)  # Removing '#' hash tag
    # \b keeps words that merely end in "RT" (e.g. "START ") intact.
    text = re.sub(r'\bRT\s+', '', text)  # Removing RT
    text = re.sub(r'https?://\S+', '', text)  # Removing hyperlink

    return text


# Drop the 'Neutral' and both 'Extremely ...' classes in a single pass.
# .copy() makes the result an independent frame, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
df = df[~df['Sentiment'].isin(
    ["Neutral", "Extremely Negative", "Extremely Positive"]
)].copy()

# apply the function (Clean Text)
df['OriginalTweet'] = df['OriginalTweet'].apply(cleanTxt)

## Find subjectivity and polarity

In [None]:
# Subjectivity helper built on TextBlob
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Polarity helper built on TextBlob
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


# Create two new columns 'Subjectivity' & 'Polarity'
# Each tweet is scored independently; the columns are appended in this order,
# which later cells rely on when they index columns by position.
df['Subjectivity'] = df['OriginalTweet'].apply(getSubjectivity)
df['Polarity'] = df['OriginalTweet'].apply(getPolarity)

## Plot a Word Cloud

In [None]:
# Concatenate every tweet into one space-separated string and render it as a
# word cloud; random_state is fixed so the layout is the same on each run.
allWords = ' '.join(df['OriginalTweet'])
wordCloud = WordCloud(
    width=500,
    height=300,
    random_state=21,
    max_font_size=110,
).generate(allWords)

plt.figure(figsize=(10, 6))
plt.imshow(wordCloud, interpolation="bilinear")
plt.axis('off')
plt.show()

## Convert the polarity to Negative or Positive

In [None]:
def getAnalysis(score):
    """Label a polarity score.

    Args:
        score: Numeric polarity value.

    Returns:
        'Negative' when ``score`` is below zero, otherwise 'Positive'
        (a score of exactly zero counts as 'Positive').
    """
    return 'Negative' if score < 0 else 'Positive'
# Add an 'Analysis' column holding the label derived from each row's Polarity.
df['Analysis'] = df['Polarity'].apply(getAnalysis)

## Show the top 10 Positive Tweets

In [None]:
# Print the 10 tweets with the highest polarity among those labelled 'Positive'.
print('Printing positive tweets:\n')
# Sort descending so the most positive tweets come first; an ascending sort
# would surface the weakest positives (polarity close to 0) instead.
sortedDF = df.sort_values(by=['Polarity'], ascending=False) #Sort the tweets
# Columns are addressed by name rather than by position, so this keeps working
# if columns are added or reordered.
topPositive = sortedDF.loc[sortedDF['Analysis'] == 'Positive', 'OriginalTweet'].head(10)
for j, tweet in enumerate(topPositive, start=1):
    print(str(j) + ') ' + tweet)
    print()

## Show the top 10 Negative Tweets

In [None]:
# Print the 10 tweets with the lowest polarity among those labelled 'Negative'.
print('Printing Negative tweets:\n')
# Sort ascending so the most negative tweets come first; a descending sort
# would surface the weakest negatives (polarity just below 0) instead.
sortedDF = df.sort_values(by=['Polarity']) #Sort the tweets
# Columns are addressed by name rather than by position, so this keeps working
# if columns are added or reordered.
topNegative = sortedDF.loc[sortedDF['Analysis'] == 'Negative', 'OriginalTweet'].head(10)
for j, tweet in enumerate(topNegative, start=1):
    print(str(j) + ') ' + tweet)
    print()

## Plot the Subjectivity and the Polarity of the first 100 rows

In [None]:
# Scatter Polarity (x) against Subjectivity (y) for the first 100 rows.
plt.figure(figsize=(10,6))
# head(100) returns fewer rows when the frame is shorter, so this cannot raise
# IndexError; one vectorised scatter call replaces 100 per-point calls.
firstRows = df.head(100)
plt.scatter(firstRows['Polarity'], firstRows['Subjectivity'], color='skyblue')
plt.title('Sentiment Analysis', size=20)
plt.xlabel('Polarity', size=15)
plt.ylabel('Subjectivity', size=15)
plt.show()

# Classify with Keras and TensorFlow

## Import the libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

## Read the data

In [None]:
# Reload the raw training CSV (discarding the earlier filtering/cleaning) and
# keep only the tweet text and its label.
df = pd.read_csv(
    '../input/covid-19-nlp-text-classification/Corona_NLP_train.csv',
    encoding="ISO-8859-1",
).loc[:, ['OriginalTweet', 'Sentiment']]

## Map Negative and Extremely Negative to 0 else 1

In [None]:
def getAnalysis(score):
    """Map a sentiment label to a binary class.

    Args:
        score: The sentiment label (despite the parameter name, a string here).

    Returns:
        0 for 'Negative' or 'Extremely Negative'; 1 for any other value.
    """
    return 0 if score in ('Negative', 'Extremely Negative') else 1
# Overwrite the Sentiment labels in place with their 0/1 class.
df['Sentiment'] = df['Sentiment'].apply(getAnalysis)

## Tokenizer

In [None]:
# Fit a whitespace-splitting tokenizer on the tweets. num_words=1500 is the
# same value the model cell passes as the Embedding input size.
tokenizer = Tokenizer(num_words=1500, split=' ')
tokenizer.fit_on_texts(df['OriginalTweet'].values)

# Encode each tweet as a sequence of word indices, then pad the sequences into
# a single 2-D array X.
X = pad_sequences(tokenizer.texts_to_sequences(df['OriginalTweet']))

## Build the model

In [None]:
# Binary sentiment classifier: Embedding -> LSTM -> single sigmoid unit.
embed_dim = 128
lstm_out = 196

model = Sequential()
# 1500 matches the tokenizer's num_words. input_length is taken from the padded
# matrix X rather than hard-coded, because pad_sequences(X) pads to the longest
# sequence in the data.
model.add(Embedding(1500, embed_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# A single output unit needs sigmoid: softmax over one unit always yields 1.0,
# so the model would predict class 1 for every input and could not learn.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Label encoder for the Sentiment column

In [None]:
# Encode the Sentiment column as integer class ids for training.
# NOTE(review): Sentiment was already mapped to 0/1 in the earlier cell, so this
# step presumably leaves the values unchanged — confirm whether it is needed.
Le = LabelEncoder()
y = Le.fit_transform(df['Sentiment'])

## Split the data

In [None]:
# Hold out 15% of the rows for evaluation. random_state makes the split (and
# therefore the reported metrics) reproducible across runs; stratify=y keeps
# the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

## Train the model

In [None]:
# Train for 5 epochs with a batch size of 32.
# NOTE(review): the held-out split is passed as validation_data here and is also
# what model.evaluate() is called on afterwards, so that final score is not
# measured on data unseen during training — confirm this is intended.
model.fit(X_train, y_train, validation_data=(X_test,y_test), epochs=5, batch_size=32)

In [None]:
# Cell output: evaluate the trained model on the held-out split.
model.evaluate(X_test,y_test)