In [None]:
from nltk.corpus import stopwords
import nltk
import re
from wordcloud import WordCloud, STOPWORDS , ImageColorGenerator
import matplotlib.pyplot as plt
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import keras
import seaborn as sns
from sklearn.metrics import confusion_matrix , classification_report
import numpy as np
import pandas as pd
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the training and test splits with the Python parsing engine.
train_csv = '/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_train.csv'
test_csv = "/kaggle/input/covid-19-nlp-text-classification/Corona_NLP_test.csv"
df, df_t = (pd.read_csv(csv_path, engine="python") for csv_path in (train_csv, test_csv))

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Missing locations get the placeholder label 'unknown' in both splits.
for frame in (df, df_t):
    frame['Location'] = frame['Location'].fillna('unknown')

In [None]:
# Rank locations by tweet count and keep ranks 2-20; the top-ranked entry is
# skipped (presumably the 'unknown' placeholder -- confirm against the data).
top_locations = df.Location.value_counts().iloc[1:20].index

plt.figure(figsize=(10, 10))
ax = sns.countplot(y='Location', data=df, order=top_locations)
ax.set_title("Twitted locations")

In [None]:
# English stop words used by clean_data. Stored as a set so that each
# membership test is a hash lookup instead of a linear scan of the list.
stop_words = set(stopwords.words('english'))
def clean_data(twit, stop_word_list=None):
    """Strip noise from a raw tweet and drop stop words.

    Processing order: URLs, HTML tags, hashtags, mentions, digits, currency
    symbols, then stop-word filtering on whitespace-separated words.

    Args:
        twit: Raw tweet text (str).
        stop_word_list: Optional iterable of lower-case stop words. When
            omitted, the notebook-level ``stop_words`` collection is used.

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    # Fall back to the notebook-level collection; only build a set when the
    # caller (or the global) is not already one.
    active = stop_words if stop_word_list is None else stop_word_list
    blocked = active if isinstance(active, (set, frozenset)) else set(active)

    # Remove URLs
    twit = re.sub(r'http\S+', ' ', twit)

    # Remove HTML tags
    twit = re.sub(r'<.*?>', ' ', twit)

    # Remove hashtags and mentions BEFORE digits: stripping digits first turns
    # "#2020" into a bare "#" that the hashtag pattern can no longer match.
    twit = re.sub(r'#\w+', ' ', twit)
    twit = re.sub(r'@\w+', ' ', twit)

    # Remove digits
    twit = re.sub(r'\d+', ' ', twit)

    # Replace currency symbols with the token "money"; padded with spaces so
    # it never fuses with an adjacent word ("costs$" -> "costs money").
    twit = re.sub(r'£|\$', ' money ', twit)

    # Drop stop words. The comparison is case-insensitive so capitalised
    # forms such as "The" are removed as well.
    return " ".join(word for word in twit.split() if word.lower() not in blocked)

In [None]:
# Add a cleaned "text" column, derived from the raw tweet, to both splits.
for frame in (df, df_t):
    frame["text"] = frame["OriginalTweet"].apply(clean_data)

In [None]:
# Word cloud over every cleaned training tweet.
# The tweets are joined into one string: str(series) would only give the
# truncated Series repr (a handful of rows plus index numbers and the
# "Name/Length/dtype" footer), not the actual corpus.
ws = WordCloud()
ws.generate(" ".join(df["text"]))
plt.imshow(ws, interpolation='bilinear')

In [None]:
# Collapse the five sentiment labels into three classes:
# 0 = neutral, 1 = negative, 2 = positive.
sentiment_to_class = {
    'Neutral': 0,
    'Extremely Negative': 1,
    'Negative': 1,
    'Positive': 2,
    'Extremely Positive': 2,
}

# Assign the mapped column directly. Calling .replace(..., inplace=True) on
# df['target'] is chained in-place assignment, which pandas warns about and
# which does not update the frame under Copy-on-Write.
df['target'] = df['Sentiment'].map(sentiment_to_class)
df_t['target'] = df_t['Sentiment'].map(sentiment_to_class)

In [None]:
# Word cloud for the neutral class (target == 0).
df_1 = df[df['target']==0]
ws = WordCloud()
# Join the tweets; str(series) would only yield the truncated Series repr.
ws.generate(" ".join(df_1["text"]))
plt.imshow(ws, interpolation='bilinear')
plt.title('Neutral Twits')

In [None]:
# Word cloud for the negative class (target == 1).
df_1 = df[df['target']==1]
ws = WordCloud()
# Join the tweets; str(series) would only yield the truncated Series repr.
ws.generate(" ".join(df_1["text"]))
plt.imshow(ws, interpolation='bilinear')
plt.title('Negative Twits')

In [None]:
# Word cloud for the positive class (target == 2).
df_1 = df[df['target']==2]
ws = WordCloud()
# Join the tweets; str(series) would only yield the truncated Series repr.
ws.generate(" ".join(df_1["text"]))
plt.imshow(ws, interpolation='bilinear')
plt.title('Positive Twits')

In [None]:
# Features are the cleaned text; labels are the encoded target column.
X_train, y_train = df['text'], df['target']
X_test, y_test = df_t['text'], df_t['target']

In [None]:
# Longest training tweet measured in TOKENS. len(text) counts characters,
# which pads every sequence far beyond any real tweet length.
# text_to_word_sequence is the splitter the Keras Tokenizer applies, so this
# matches the sequence lengths produced by texts_to_sequences on X_train.
max_len = max(
    len(tf.keras.preprocessing.text.text_to_word_sequence(text))
    for text in X_train
)
print(max_len)

In [None]:
# Keras text tokenizer created with its default settings.
tokenizer = Tokenizer()

In [None]:
# Learn the vocabulary from the training texts only.
tokenizer.fit_on_texts(X_train)
# One extra slot on top of the learned word indices.
vocab_length = 1 + len(tokenizer.word_index)

# Encode each split as integer sequences, then pad at the end up to max_len.
X_train, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_len, padding='post')
    for texts in (X_train, X_test)
)

In [None]:
# Import from the Keras bundled with TensorFlow, matching the other
# tensorflow.keras imports in this notebook, instead of standalone keras.
from tensorflow.keras.utils import to_categorical

# One-hot encode the three sentiment classes.
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [None]:
# Size of each learned word-embedding vector.
embedding_dim = 16

In [None]:
# Embedding -> bidirectional LSTM (full sequence output) -> average pooling
# over time -> dense head with dropout -> 3-way softmax.
# NOTE(review): no mask_zero is passed to the embedding, so padded positions
# are presumably fed to the LSTM and the pooling -- confirm this is intended.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_length, embedding_dim, input_length=max_len))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))
model.add(tf.keras.layers.GlobalAvgPool1D())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(3, activation='softmax'))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs; the test split is also used as the validation set.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=64,
    epochs=10,
)

In [None]:
# Predicted class = index of the largest softmax probability.
# Sequential.predict_classes was removed from tf.keras (TF 2.6+), so take the
# argmax of predict() instead.
pred = np.argmax(model.predict(X_test), axis=-1)

In [None]:
# Turn the one-hot test labels back into class indices before scoring.
print(classification_report(y_test.argmax(axis=1), pred))

In [None]:
# Confusion matrix of true vs. predicted class indices, drawn as a heatmap.
cm = tf.math.confusion_matrix(predictions=pred, labels=np.argmax(y_test, 1))

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')