In [None]:
# This Python 3 environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## 1. Loading Dataframes

In [None]:
train = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_train.csv', encoding='ISO-8859-1')
test = pd.read_csv('../input/covid-19-nlp-text-classification/Corona_NLP_test.csv', encoding='ISO-8859-1')

## 2. Exploratory Data Analysis

### checking out dfs

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.isnull().sum()

In [None]:
test.head()

In [None]:
test.info()  # same dtypes with train df

In [None]:
test.isnull().sum()  # there are null values again in location column

### tweet locations

In [None]:
train.Location.value_counts(dropna = False)[:20]

In [None]:
# Keep only the part of each location before the first comma.
location_parts = train["Location"].str.split(",")
train["Location"] = location_parts.str[0]

In [None]:
# Configure seaborn in a single call: sns.set() re-applies its default style,
# so a preceding sns.set_style("whitegrid") would be overwritten.
sns.set(style="whitegrid", rc={'figure.figsize': (11, 4)})

# Ten most frequent values of the Location column (computed once, used twice).
top_locations = train["Location"].value_counts()[:10]

plt.figure(figsize=(12, 6))
# x/y are passed by keyword: recent seaborn releases no longer accept the
# data vectors as positional arguments.
sns.barplot(x=top_locations.values, y=top_locations.index)
plt.title("Top 10 Countries with maximum Covid-19 tweets", fontsize=14)
plt.xlabel("Number of tweets", fontsize=14)
plt.ylabel("Country Name", fontsize=14)
plt.show()

### tweet sentiment values

In [None]:
train['Sentiment'].value_counts() 

In [None]:
sns.countplot(x = "Sentiment", data = train)

### regrouping train and test dfs

In [None]:
# Collapse the five original sentiment classes into three integer codes.
encoding = {
    'Extremely Negative': 0,
    'Negative': 0,
    'Neutral': 1,
    'Positive': 2,
    'Extremely Positive': 2,
}

# Display names for the integer codes, indexed by code (0, 1, 2).
labels = ['Negative', 'Neutral', 'Positive']

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and is not
# guaranteed to update the frame. astype makes the integer dtype explicit and
# fails loudly if an unmapped label is left over. Re-running the cell is safe
# because already-encoded integers do not match any key of `encoding`.
train["Sentiment"] = train["Sentiment"].replace(encoding).astype("int64")
test["Sentiment"] = test["Sentiment"].replace(encoding).astype("int64")

In [None]:
sns.countplot(x = "Sentiment", data = train)

In [None]:
sns.countplot(x = "Sentiment", data = test)

### analysis of locations with sentiment

In [None]:
# Select the two columns by name rather than by position, so the selection
# still picks Location and Sentiment if columns are added or reordered.
loc_with_sentiment = train[["Location", "Sentiment"]]


In [None]:
# Sentiment counts for the ten most frequent locations, one bar per sentiment.
plt.figure(figsize=(15, 6))
top_locations = train.Location.value_counts()[:10].index
# No orient="h" here: the categories are on the x axis and the annotations
# below read bar heights, so the bars are meant to be vertical.
ax = sns.countplot(x="Location", hue="Sentiment", data=loc_with_sentiment,
                   order=top_locations, palette="Paired")

# Write each bar's count just below its top edge.
for bar in ax.patches:
    height = bar.get_height()
    # Skip empty or NaN-height patches so no stray "0"/"nan" labels are drawn.
    if not height > 0:
        continue
    ax.annotate(format(height, '.0f'),
                (bar.get_x() + bar.get_width() / 2., height),
                ha='center', va='center',
                size=15,
                xytext=(0, -12),
                textcoords='offset points')
plt.show()

## hashtags

In [None]:
import regex as re

def extract_hash_tags(s):
    """Return the hashtags found in ``s`` as one space-separated string.

    Each hashtag is reported without its leading ``#``; the result is an
    empty string when ``s`` contains no hashtag.
    """
    tag_matches = re.finditer(r"#(\w+)", s)
    return " ".join(match.group(1) for match in tag_matches)
    
# Temporary column holding the space-joined hashtags of every tweet.
train['hashtags'] = train['OriginalTweet'].apply(extract_hash_tags)

In [None]:
from collections import Counter

# One entry per hashtag occurrence. Each row of train['hashtags'] is a
# space-joined string of tags, so it is split back into individual tags
# before counting; counting the joined strings would treat every distinct
# combination of tags as a single "word". Rows without hashtags are empty
# strings and contribute nothing.
allHashTags = [tag.lower() for tags in train['hashtags'] for tag in tags.split()]
hash_df = dict(Counter(allHashTags))

# Fifteen most frequent hashtags, most frequent first.
top_hash_df = (
    pd.DataFrame(list(hash_df.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)[:15]
)
top_hash_df.head()

In [None]:
import plotly.express as px

# Vertical bar chart of the top hashtags, one colour per bar, counts as labels.
bar_kwargs = dict(
    x=top_hash_df['word'],
    y=top_hash_df['count'],
    orientation='v',
    color=top_hash_df['word'],
    text=top_hash_df['count'],
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig = px.bar(**bar_kwargs)

# Bar appearance: abbreviated count above each bar, dark outline, slight transparency.
trace_style = {
    'texttemplate': '%{text:.2s}',
    'textposition': 'outside',
    'marker_line_color': 'rgb(8,48,107)',
    'marker_line_width': 1.5,
    'opacity': 0.7,
}
fig.update_traces(**trace_style)

# Figure-level layout: fixed width, no legend, axis titles and chart title.
layout_style = {
    'width': 1000,
    'showlegend': False,
    'xaxis_title': "Word",
    'yaxis_title': "Count",
    'title': "Top #hashtags in Covid19 Tweets",
}
fig.update_layout(**layout_style)
fig.show()

## mentions

In [None]:
def get_mentions(s):
    """Return the @-mentions found in ``s`` as one space-separated string.

    A mention is ``@`` followed by 1 to 25 word characters, and it must not be
    preceded by another ``@`` or by a word character (so the ``@`` inside an
    e-mail address is ignored). Names are returned without the leading ``@``;
    the result is an empty string when there is no mention.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    mentions = re.findall(r"(?<![@\w])@(\w{1,25})", s)
    return " ".join(mentions)
# Temporary column holding the space-joined mentions of every tweet.
train['mentions'] = train['OriginalTweet'].apply(get_mentions)

In [None]:
# One entry per mention occurrence. Each row of train['mentions'] is a
# space-joined string of account names, so it is split back into individual
# names before counting; counting the joined strings would treat every
# distinct combination of mentions as a single "word". Rows without mentions
# are empty strings and contribute nothing.
allMentions = [name.lower() for names in train['mentions'] for name in names.split()]
mentions_df = dict(Counter(allMentions))

# Fifteen most frequently mentioned accounts, most frequent first.
top_mentions_df = (
    pd.DataFrame(list(mentions_df.items()), columns=['word', 'count'])
    .sort_values('count', ascending=False)[:15]
)
top_mentions_df.head()

In [None]:
# Vertical bar chart of the most mentioned accounts, one colour per bar.
fig = px.bar(x=top_mentions_df['word'], y=top_mentions_df['count'],
             orientation='v',
             color=top_mentions_df['word'],
             text=top_mentions_df['count'],
             color_discrete_sequence=px.colors.qualitative.Bold)

# Abbreviated count above each bar, dark outline, slight transparency.
fig.update_traces(texttemplate='%{text:.2s}',
                  textposition='outside',
                  marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5,
                  opacity=0.7)

# The title uses "@" because this chart shows mentions, not hashtags.
fig.update_layout(width=1000,
                  showlegend=False,
                  xaxis_title="Word",
                  yaxis_title="Count",
                  title="Top @mentions in Covid19 Tweets")
fig.show()

In [None]:
# Remove the temporary analysis columns. errors="ignore" keeps the cell
# re-runnable: a second run finds the columns already gone and does nothing
# instead of raising KeyError.
train = train.drop(columns=["hashtags", "mentions"], errors="ignore")

### tweet times

In [None]:
# Parse the tweet dates.
# NOTE(review): the Kaggle CSV is expected to store dates as day-month-year
# (e.g. 16-03-2020); dayfirst=True stops ambiguous values such as 02-04-2020
# from being read month-first. Confirm against train.head().
train["TweetAt"] = pd.to_datetime(train["TweetAt"], dayfirst=True)

# Tweets per day of week, using the vectorised .dt accessor.
train["TweetAt"].dt.dayofweek.value_counts()

In [None]:
# Horizontal bar chart of tweet counts per day of week.
train["TweetAt"].dt.dayofweek.value_counts().plot.barh()
# The title describes what is plotted; 0 is Monday and 6 is Sunday.
plt.title("Number of tweets per day of week (0 = Monday, 6 = Sunday)")
plt.xlabel("Number of tweets")
plt.ylabel("Day of week")
plt.show()

In [None]:
# No-op when TweetAt is already datetime; otherwise parse it.
# NOTE(review): dates are expected to be day-month-year — confirm.
train["TweetAt"] = pd.to_datetime(train["TweetAt"], dayfirst=True)

# Temporary weekday-name column (removed again in the next cell).
dmap = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}
train["day"] = train["TweetAt"].dt.dayofweek.map(dmap)

# x= is passed by keyword (a positional Series would be taken as `data` by
# recent seaborn), and the bars follow calendar order rather than first
# appearance in the data.
ax = sns.countplot(x="day", data=train, order=list(dmap.values()))
ax.set_title("Day with maximum tweets")
plt.show()


In [None]:
# Remove the temporary weekday column. errors="ignore" keeps the cell
# re-runnable instead of raising KeyError on a second run.
train = train.drop(columns="day", errors="ignore")

### making checkpoints and dropping duplicated rows

In [None]:
# Remove fully duplicated rows from both frames (original index labels are kept).
train = train.drop_duplicates()
test = test.drop_duplicates()

In [None]:
# Checkpoint: the text-mining and modelling steps work on these copies, so the
# cleaned tweets are written to train_df/test_df and not to train/test.
train_df = train.copy()
test_df = test.copy()

## 3. Text Mining

In [None]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import TweetTokenizer

In [None]:
import nltk
# Fetch the NLTK resources used further down in this notebook.
nltk.download('punkt')      # tokenizer models; presumably needed by word_tokenize — TODO confirm
nltk.download('stopwords')  # corpus read through stopwords.words('english')
nltk.download('wordnet')    # dictionary data; presumably needed by WordNetLemmatizer — TODO confirm

In [None]:
# Show how three tokenizers split the same tweet (the fourth of the first five).
sentences = train['OriginalTweet'][:5]

for tweet in sentences[3:4]:
    print("Original:\n")
    print(tweet)

    # Keras tokenizer: fit on the single tweet and show its word -> index map.
    print('\nTensorflow Tokenizer\n:')
    keras_tokenizer = Tokenizer()
    keras_tokenizer.fit_on_texts([tweet])
    print(keras_tokenizer.word_index)

    # NLTK tokenizer designed for tweets.
    print("\nTweet Tokenizer:\n")
    tweet_tokens = TweetTokenizer().tokenize(tweet)
    print(tweet_tokens)

    # Generic NLTK word tokenizer.
    print('\nNLTK word_tokenizer:\n')
    word_tokens = word_tokenize(tweet)
    print(word_tokens)

#### As you can see these all yield different results and you have to see which works best for your use case. 
#### For now we will use NLTK Tweet-Tokenizer.

In [None]:
# A set gives constant-time stop-word lookups (the list form is scanned for
# every token of every tweet).
stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()
# Built once and reused instead of constructing a tokenizer for every tweet.
_tweet_tokenizer = TweetTokenizer()


def cleaning(data):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps: strip URLs, hashtags and mentions; replace every run of
    non-English-alphabet characters with a space; lowercase; tokenize;
    drop stop words; lemmatize; join with single spaces.

    Parameters
    ----------
    data : str
        Raw tweet text. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty string.

    Returns
    -------
    str
        The cleaned, lowercased tweet; empty if nothing survives cleaning.
    """
    if not isinstance(data, str):
        return ""

    # 1. Remove URLs
    text = re.sub(r'http\S+', ' ', data)

    # 2. Remove hashtags
    text = re.sub(r'#\w+', ' ', text)

    # 3. Remove mentions
    text = re.sub(r'@\w+', ' ', text)

    # 4. Replace everything that is not an English letter with a space
    text = re.sub('[^A-Za-z]+', ' ', text)

    # 5. Lowercase and tokenize. The NLTK stop-word list is lowercase, so
    #    without lowercasing capitalised stop words ("The", "And", ...) would
    #    slip through the filter in step 7.
    tokens = _tweet_tokenizer.tokenize(text.lower())

    # 6. Keep purely alphabetic tokens (defensive; step 4 already removed the rest)
    tokens = [tok for tok in tokens if tok.isalpha()]

    # 7. Remove stop words
    tokens = [tok for tok in tokens if tok not in stop_words]

    # 8. Lemmatize
    lemmas = [lem.lemmatize(tok) for tok in tokens]

    # 9. Join back into a single string
    return " ".join(lemmas)


In [None]:
# Replace every training tweet with its cleaned form.
train_df['OriginalTweet'] = train_df['OriginalTweet'].apply(cleaning)

In [None]:
# Show full tweet text without truncation. None is the supported "no limit"
# value; the old sentinel -1 is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)
train_df.iloc[:, [4,5]].head()

In [None]:
# Replace every test tweet with its cleaned form.
test_df['OriginalTweet'] = test_df['OriginalTweet'].apply(cleaning)

In [None]:
# Show full tweet text without truncation. None is the supported "no limit"
# value; the old sentinel -1 is deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', None)
test_df.iloc[:, [4,5]].head()

## uni grams for train tweets

In [None]:
# Every word of every cleaned training tweet, in order.
HQ_words = ' '.join(train_df['OriginalTweet']).split()

# Fifteen most frequent 1-grams.
unigram_counts = pd.Series(list(nltk.ngrams(HQ_words, 1))).value_counts()[:15]

# Name the counts column 0 explicitly: the plotting cell reads unigram_HQ[0],
# and the default column name of a value_counts() result differs between
# pandas versions.
unigram_HQ = unigram_counts.to_frame(name=0)

# Human-readable label for each n-gram, e.g. "(covid)".
unigram_HQ['idx'] = ['(' + ', '.join(gram) + ')' for gram in unigram_HQ.index]

In [None]:
import plotly.graph_objs as go
import plotly.offline as pyoff

# Blue bar chart of the uni-gram counts on a transparent background.
unigram_bars = go.Bar(
    x=unigram_HQ['idx'],
    y=unigram_HQ[0],
    marker={'color': 'Blue'},
)
plot_data = [unigram_bars]

layout_options = {
    'title': 'Top 15 uni-grams from Covid-19 Tweets',
    'yaxis_title': 'Count',
    'xaxis_title': 'Uni-gram',
    'plot_bgcolor': 'rgba(0,0,0,0)',
}
plot_layout = go.Layout(**layout_options)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

## Bi-grams for Tweets

In [None]:
# Fifteen most frequent 2-grams over the cleaned training words.
bigram_counts = pd.Series(list(nltk.ngrams(HQ_words, 2))).value_counts()[:15]

# Name the counts column 0 explicitly: the plotting cell reads bigram_HQ[0],
# and the default column name of a value_counts() result differs between
# pandas versions.
bigram_HQ = bigram_counts.to_frame(name=0)

# Human-readable label for each n-gram, e.g. "(grocery, store)".
bigram_HQ['idx'] = ['(' + ', '.join(gram) + ')' for gram in bigram_HQ.index]

In [None]:
# Red bar chart of the bi-gram counts on a transparent background.
bigram_bars = go.Bar(
    x=bigram_HQ['idx'],
    y=bigram_HQ[0],
    marker={'color': 'Red'},
)
plot_data = [bigram_bars]

layout_options = {
    'title': 'Top 15 bi-grams from Covid 19 Tweets',
    'yaxis_title': 'Count',
    'xaxis_title': 'bi-gram',
    'plot_bgcolor': 'rgba(0,0,0,0)',
}
plot_layout = go.Layout(**layout_options)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

## Tri-grams for Tweets

In [None]:
# Fifteen most frequent 3-grams over the cleaned training words. The cut-off
# is 15, matching the uni-/bi-gram cells and the "Top 15" chart title.
trigram_counts = pd.Series(list(nltk.ngrams(HQ_words, 3))).value_counts()[:15]

# Name the counts column 0 explicitly: the plotting cell reads trigram_HQ[0],
# and the default column name of a value_counts() result differs between
# pandas versions.
trigram_HQ = trigram_counts.to_frame(name=0)

# Human-readable label for each n-gram, e.g. "(covid, grocery, store)".
trigram_HQ['idx'] = ['(' + ', '.join(gram) + ')' for gram in trigram_HQ.index]

In [None]:
# Green bar chart of the tri-gram counts on a transparent background.
trigram_bars = go.Bar(
    x=trigram_HQ['idx'],
    y=trigram_HQ[0],
    marker={'color': 'Green'},
)
plot_data = [trigram_bars]

layout_options = {
    'title': 'Top 15 Tri-grams from Covid 19 Tweets',
    'yaxis_title': 'Count',
    'xaxis_title': 'Tri-gram',
    'plot_bgcolor': 'rgba(0,0,0,0)',
}
plot_layout = go.Layout(**layout_options)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

## 4. WordCloud - Repetition of Words

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud of the positive tweets. Sentiment code 2 is the merged
# "Positive"/"Extremely Positive" class; code 0 is the negative class.
Positive = ' '.join(train_df.loc[train_df['Sentiment'] == 2, 'OriginalTweet'])

wordcloud = WordCloud(background_color = "white", width = 800, height = 500,
                      random_state = 21, max_font_size = 110).generate(Positive)
plt.figure(figsize = (10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Most frequent words in positive tweets")
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the negative tweets. Sentiment code 0 is the merged
# "Negative"/"Extremely Negative" class; code 1 is the neutral class.
Negative = ' '.join(train_df.loc[train_df['Sentiment'] == 0, 'OriginalTweet'])

wordcloud = WordCloud(width = 800, height = 500, random_state = 21, max_font_size = 110).generate(Negative)
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Most frequent words in negative tweets")
plt.axis('off')
plt.show()

## 5. Sentiment Analysis Models 




In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as Layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from keras.layers import Dense, Dropout, Embedding, LSTM, Conv1D, GlobalMaxPooling1D, Bidirectional, SpatialDropout1D

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Build the vocabulary from the cleaned training tweets only.
train_texts = train_df["OriginalTweet"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Features: each tweet as a list of word indices. Targets: integer sentiment codes.
X = tokenizer.texts_to_sequences(train_texts)
y = train_df["Sentiment"]

# One more than the number of distinct words in the tokenizer's word index.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
vocab_size

In [None]:
print("Vocabulary size: {}".format(vocab_size))
print("\n----------Example----------\n")
# .iloc[6] selects the seventh row by position, matching X[6]. A label lookup
# ([6]) can point at a different row, or raise KeyError, because the frame
# keeps its original index labels after drop_duplicates.
print("Sentence:\n{}".format(train_df["OriginalTweet"].iloc[6]))
print("\nAfter tokenizing :\n{}".format(X[6]))

# Pad with trailing zeros so that all sequences have the same length.
X = pad_sequences(X, padding='post')
print("\nAfter padding :\n{}".format(X[6]))

In [None]:
X.shape

### a. Modeling with LSTM

In [None]:
# hyper parameters
EPOCHS = 3
BATCH_SIZE = 32 
embedding_dim = 16
units = 256

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))  # This version performs the same function as Dropout, however, it drops entire 1D feature maps instead of individual elements.
model.add(LSTM(units, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(3,activation = 'softmax'))  # we have 3 categories so we have to use softmax 
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())

In [None]:
# One-hot targets for the models compiled with categorical_crossentropy.
# They are built from train_df, the same frame X was tokenized from, so the
# rows are guaranteed to line up. An explicit float dtype avoids the boolean
# dummies that recent pandas versions produce by default.
Y = pd.get_dummies(train_df['Sentiment'], dtype='float32').values

print(X.shape, Y.shape)


In [None]:
# Train the LSTM on the padded sequences X and the one-hot targets Y, with
# validation_split = 0.12. The epoch count is hard-coded to 5 here, so the
# EPOCHS constant defined with the hyper-parameters is not used for this model.
model.fit(X, Y, epochs = 5, validation_split = 0.12, batch_size = BATCH_SIZE)

#### results of LSTM model

In [None]:
model_loss = pd.DataFrame(model.history.history)
model_loss.plot()

## b. Modeling with Bidirectional LSTM

In [None]:
# Reset Keras' global state before building the next model.
tf.keras.backend.clear_session()

# Embedding -> bidirectional LSTM (full sequence) -> max over time ->
# dropout -> dense ReLU -> dropout -> 3-way softmax.
blstm_layers = [
    Layers.Embedding(vocab_size, embedding_dim, input_length = X.shape[1]),
    # return_sequences=True keeps one output per time step for the pooling layer.
    Layers.Bidirectional(Layers.LSTM(units, return_sequences = True)),
    # Keeps the maximum value of each feature across the time dimension.
    Layers.GlobalMaxPool1D(),
    Layers.Dropout(0.2),
    Layers.Dense(64, activation = "relu"),
    Layers.Dropout(0.2),
    Layers.Dense(3, activation = 'softmax'),
]
model_blstm = tf.keras.Sequential(blstm_layers)

In [None]:
# Sparse categorical cross-entropy takes the integer class codes directly.
# from_logits must be False: the model's last layer already applies softmax,
# so its outputs are probabilities, not raw logits.
model_blstm.compile(loss = SparseCategoricalCrossentropy(from_logits = False),
              optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model_blstm.summary()

In [None]:
model_blstm.fit(X, y, epochs = EPOCHS, validation_split = 0.12, batch_size = BATCH_SIZE)

#### Results of Bi-LSTM model

In [None]:
model_blstm_loss = pd.DataFrame(model_blstm.history.history)
model_blstm_loss.plot()

## c. Modeling with CNN

In [None]:
# Embedding -> 1D convolution -> max over time -> dense ReLU -> dropout ->
# 3-way softmax.
cnn_layers = [
    Embedding(vocab_size,embedding_dim,input_length=X.shape[1]),
    # Convolves 64 kernels of width 3 along the sequence dimension;
    # padding='same' with stride 1 keeps the sequence length unchanged.
    Conv1D(64, kernel_size=3, padding='same', activation='relu', strides=1),
    # Keeps the maximum value of each feature map across the sequence.
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(3,activation='softmax'),
]
model_cnn = Sequential(cnn_layers)

# categorical_crossentropy expects one-hot encoded targets.
model_cnn.compile(loss='categorical_crossentropy',optimizer= 'adam',metrics=['accuracy'])

model_cnn.summary()

In [None]:
# Train the CNN on the padded sequences X and the one-hot targets Y, with
# validation_split = 0.12. The epoch count is hard-coded to 2 here, so the
# EPOCHS constant defined with the hyper-parameters is not used for this model.
model_cnn.fit(X, Y, validation_split = 0.12,epochs=2, batch_size=BATCH_SIZE)

In [None]:
model_cnn_loss = pd.DataFrame(model_cnn.history.history)
model_cnn_loss.plot()

## 6. Evaluation

### Preprocessing test data 

In [None]:
# Raw test tweets and their integer sentiment codes.
X_test = test['OriginalTweet'].copy()
y_test = test['Sentiment'].copy()

# Same cleaning and the same (training-fitted) tokenizer as the training data.
X_test = X_test.apply(cleaning)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad/truncate to the training sequence length. Without maxlen the test set
# would be padded to its own longest tweet, which generally differs from the
# input length the models were built with (X.shape[1]).
X_test = pad_sequences(X_test, padding='post', truncating='post', maxlen=X.shape[1])

### Making Predictions with CNN

In [None]:
# Predicted class = index of the highest softmax probability for each tweet.
# Sequential.predict_classes was removed from Keras, so the argmax is taken
# explicitly over the output of predict().
pred = np.argmax(model_cnn.predict(X_test), axis=1)

### Model Results

In [None]:
# Report per-class precision/recall/F1 with readable class names. The label
# codes are pinned to 0..len(labels)-1 so the names always line up with them.
print(classification_report(y_test, pred,
                            labels=list(range(len(labels))),
                            target_names=labels))

In [None]:
# Confusion matrix with the class codes pinned to 0..len(labels)-1, so the
# matrix always has one row and column per class name even if a class is
# missing from the predictions.
conf = confusion_matrix(y_test, pred, labels=list(range(len(labels))))

cm = pd.DataFrame(conf, index=labels, columns=labels)

plt.figure(figsize = (12,7))
ax = sns.heatmap(cm, annot=True, fmt="d")
# scikit-learn puts the true classes on rows and the predictions on columns.
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()