NLP text classification for corona virus tweets

In [1]:
from pyspark.sql import SparkSession

In [2]:
park= SparkSession.builder.appName('nlp').getOrCreate()

In [3]:
import numpy as np

from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline

#ML Logisitic Regression
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator



In [4]:
dataset= spark.read.csv('Corona_NLP_train.csv', header=True, inferSchema=True)

In [5]:
dataset.head()

Row(UserName='3799', ScreenName='48751', Location='London', TweetAt='16-03-2020', OriginalTweet='@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8', Sentiment='Neutral')

In [6]:
dataset.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|
|           Stay calm|          stay safe.|                null|

In [7]:
dataset.select("OriginalTweet", "Sentiment").show(5)

+--------------------+---------+
|       OriginalTweet|Sentiment|
+--------------------+---------+
|@MeNyrbie @Phil_G...|  Neutral|
|advice Talk to yo...| Positive|
|Coronavirus Austr...| Positive|
|My food stock is ...|     null|
|                null|     null|
+--------------------+---------+
only showing top 5 rows



In [8]:
drop_list= ['UserName', 'ScreenName', 'Location', 'TweetAt']

In [9]:
dataset= dataset.select([column for column in dataset.columns if column not in drop_list])

In [10]:
dataset.show()

+--------------------+---------+
|       OriginalTweet|Sentiment|
+--------------------+---------+
|@MeNyrbie @Phil_G...|  Neutral|
|advice Talk to yo...| Positive|
|Coronavirus Austr...| Positive|
|My food stock is ...|     null|
|                null|     null|
|                null|     null|
|                null|     null|
|Me, ready to go a...|     null|
|                null|     null|
|                null|     null|
|As news of the re...| Positive|
|"Cashier at groce...| Positive|
|Was at the superm...|     null|
|                null|     null|
|Due to COVID-19 o...| Positive|
|For corona preven...| Negative|
|All month there h...|  Neutral|
|Due to the Covid-...|     null|
|                null|     null|
|                null|     null|
+--------------------+---------+
only showing top 20 rows



In [11]:
dataset = dataset.dropna()
dataset.count()

28617

In [12]:
dataset = dataset.dropna()
dataset.count()

28617

In [13]:
#Checking the type of sentiments
from pyspark.sql.functions import col
dataset.groupBy("Sentiment").count().orderBy(col("count").desc()).show()

+--------------------+-----+
|           Sentiment|count|
+--------------------+-----+
|            Positive| 7718|
|            Negative| 6857|
|             Neutral| 5224|
|  Extremely Positive| 4412|
|  Extremely Negative| 3751|
|   social distancing|    5|
|    N. Y. - April 10|    3|
|        Corona Virus|    2|
|        Stay with us|    2|
| supermarket workers|    2|
|           of course|    2|
| but we also need...|    2|
| or click the lin...|    2|
|            delivery|    2|
| state governors ...|    2|
| not going to the...|    2|
| ecological collapse|    2|
|             however|    2|
| just ""selfish p...|    2|
| as lower oil pri...|    1|
+--------------------+-----+
only showing top 20 rows



In [14]:
#Filtering the data to get Sentiment in terms of Positive, Negative,Neutral,Extremely Positive and Extremely Negative values
import pyspark.sql.functions as fn
data = dataset.where(fn.col("Sentiment").isin(["Positive", "Negative", "Neutral","Extremely Positive","Extremely Negative"]))
data.groupBy("Sentiment").count().orderBy(col("count").desc()).show()

+------------------+-----+
|         Sentiment|count|
+------------------+-----+
|          Positive| 7718|
|          Negative| 6857|
|           Neutral| 5224|
|Extremely Positive| 4412|
|Extremely Negative| 3751|
+------------------+-----+



Tokenizers, stopwords has been used to transform the data to find the count vector of the feature column for the  modelling purpose.

In [15]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# using tokenizer
regexTokenizer = RegexTokenizer(inputCol="OriginalTweet", outputCol="words", pattern="\\W")
# checking the stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# checking the bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=1000, minDF=5)

To convert the text to label, String indexing has been used

In [16]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "Sentiment", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(10)

+--------------------+------------------+--------------------+--------------------+--------------------+-----+
|       OriginalTweet|         Sentiment|               words|            filtered|            features|label|
+--------------------+------------------+--------------------+--------------------+--------------------+-----+
|@MeNyrbie @Phil_G...|           Neutral|[menyrbie, phil_g...|[menyrbie, phil_g...|(1000,[1,5],[2.0,...|  2.0|
|advice Talk to yo...|          Positive|[advice, talk, to...|[advice, talk, to...|(1000,[0,2,25,34,...|  0.0|
|Coronavirus Austr...|          Positive|[coronavirus, aus...|[coronavirus, aus...|(1000,[0,5,6,8,9,...|  0.0|
|As news of the re...|          Positive|[as, news, of, th...|[as, news, of, re...|(1000,[0,1,2,5,8,...|  0.0|
|"Cashier at groce...|          Positive|[cashier, at, gro...|[cashier, at, gro...|(1000,[0,4,5,11,1...|  0.0|
|Due to COVID-19 o...|          Positive|[due, to, covid, ...|[due, to, covid, ...|(1000,[0,1,4,5,7,...|  0.0|
|

Splitting the dataset into train data and test data

In [17]:
(trainData, testData) = dataset.randomSplit([0.75, 0.25], seed=120)
print("Training Data Count: " + str(trainData.count()))
print("Test Data Count: " + str(testData.count()))

Training Data Count: 21023
Test Data Count: 6939


# Random Forest Classifier Model

In [18]:
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(labelCol="label", 
                            featuresCol="features",
                            numTrees = 100, 
                            maxDepth = 4, 
                            maxBins = 32)
# Training the model
rfcModel = rfc.fit(trainData)
# Making prediction
predictions = rfcModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) .select("OriginalTweet","Sentiment","probability","label","prediction") .orderBy("probability", ascending=False)  .show(n = 10, truncate = 30)

+------------------------------+------------------+------------------------------+-----+----------+
|                 OriginalTweet|         Sentiment|                   probability|label|prediction|
+------------------------------+------------------+------------------------------+-----+----------+
|Everyone stay safe and be c...|Extremely Positive|[0.31715463502639174,0.2069...|  3.0|       0.0|
|The craft-distilling indust...|Extremely Positive|[0.3170706909168704,0.21139...|  3.0|       0.0|
|We would like to thank @Bar...|Extremely Positive|[0.31471189504562613,0.1862...|  3.0|       0.0|
|Wash your hands frequently ...|Extremely Positive|[0.31271377825392316,0.2072...|  3.0|       0.0|
|Our thanks to a longtime pa...|Extremely Positive|[0.3125189388505169,0.20390...|  3.0|       0.0|
|Its pathetic that it feels...|Extremely Positive|[0.3113314710153119,0.20507...|  3.0|       0.0|
|Want a sweet happy feel goo...|Extremely Positive|[0.311301786878458,0.203847...|  3.0|       0.0|


In [19]:
#Accuracy of model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.15665602788233612

The accuracy for Random Forest Classifier model is 15.66%

# Logistic Regression model

In [20]:
logr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
logrModel = logr.fit(trainData)
# making prediction on test data 
predictions = logrModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) .select("OriginalTweet","Sentiment","probability","label","prediction") .orderBy("probability", ascending=False) .show(n = 10, truncate = 30)

+------------------------------+------------------+------------------------------+-----+----------+
|                 OriginalTweet|         Sentiment|                   probability|label|prediction|
+------------------------------+------------------+------------------------------+-----+----------+
|@berndsankara Hi. We're wor...|          Positive|[0.6257999141116156,0.15840...|  0.0|       0.0|
|@adrparsons @commerson @Mic...|          Positive|[0.5991311183043058,0.12418...|  0.0|       0.0|
|As we take stock of the pos...|Extremely Positive|[0.595381293274773,0.125248...|  3.0|       0.0|
|The #coronavirus must act l...|Extremely Positive|[0.592823084833878,0.101718...|  3.0|       0.0|
|@wearyrabbit Hi there - We'...|          Positive|[0.5926720150112601,0.18631...|  0.0|       0.0|
|Empty store shelves hurt pe...|          Negative|[0.5898156981859813,0.12496...|  1.0|       0.0|
|I told my Mom &amp; Dad tha...|          Positive|[0.5822067121323986,0.07961...|  0.0|       0.0|


In [21]:
#Accuracy of model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.48468548688085306

From our anaysis, we can see that Logistic Regression model gives the accuracy 48.46% for the dataset however, Random forest classifier model gives an accuracy of 15.66%.
