In [1]:
# import python spark library

from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('nlp').getOrCreate()
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [2]:
spark=SparkSession.builder.appName('nlpML').getOrCreate()

In [11]:
data= spark.read.csv('Corona_NLP_train.csv', header = True,inferSchema=True)

In [12]:
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|
|           Stay calm|          stay safe.|                null|

In [9]:
from pyspark.sql.functions import length

In [13]:
data=data.withColumn('length', length(data['OriginalTweet']))

In [14]:
data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|length|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|   111|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|   237|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|   131|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|    51|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|  null|
|       

In [15]:
data.groupby('Sentiment').mean().show()

+--------------------+-----------+
|           Sentiment|avg(length)|
+--------------------+-----------+
|    online education|      150.0|
| potatoes &amp; v...|       23.0|
| only a few preli...|       10.0|
| Vaccines and Tre...|       41.0|
|              #virus|       12.0|
| consumer and mor...|        5.0|
|"" as shoppers ac...|       63.0|
|      Mumbai or Pune|       26.0|
|000 tests for COV...|      144.0|
| spot the scams""...|       53.0|
| claiming the new...|       63.0|
| we are confronte...|       54.0|
| they should add ...|      247.0|
| as lower oil pri...|       72.0|
|  closed the borders|      162.0|
| IFB vice preside...|      227.0|
|            teachers|       28.0|
| but we have to a...|      142.0|
| the company prod...|       34.0|
| a stock market n...|       51.0|
+--------------------+-----------+
only showing top 20 rows



# Features Transformation

In [29]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="token_OriginalTweet")
stopremove=StopWordsRemover(inputCol="token_OriginalTweet", outputCol="stop_tokens")
count_vec=CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf=IDF(inputCol="c_vec", outputCol="tf_idf")

# we also need to convert our labels in numbers
Netural_Positive_Negative_null_to_num = StringIndexer(inputCol="Sentiment", outputCol='label')


In [30]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [31]:
clean_up = VectorAssembler(inputCols=['tf_idf','length'], outputCol='features')

 # Model

In [32]:
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier

nb=NaiveBayes()
rf=RandomForestClassifier(numTrees=200)
dtc=DecisionTreeClassifier(maxDepth=15)

# Pipeline

In [33]:
from pyspark.ml import Pipeline
data_prep_pipeline= Pipeline(stages=[Netural_Positive_Negative_null_to_num, tokenizer, stopremove,count_vec, idf,clean_up])

In [None]:
cleaner=data_prep_pipeline.fit(data)

In [None]:
clean_data=cleaner.transform(data)

In [None]:
clean_data.show()

In [None]:
clean_data=clean_data.select(['label', 'features'])

In [None]:
clean_data.show()

In [None]:
(training, testing)=clean_data.randomSplit([0.7,0.3])

In [None]:
sentiment_predictor=dtc.fit(training)

In [None]:
test_results=sentiment_predictor.transform(testing)

In [None]:
test_results.show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
acc_eval=MulticlassClassificationEvaluator()
acc=acc_eval.evaluate(test_results)

In [None]:
print ("Accuracy of the model is::", acc)