In [1]:
#Import Module
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [2]:
appName = "Sentiment analysis in Twitter"
spark = SparkSession.builder.appName('appName').getOrCreate()

In [3]:
#import spark modules

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF,Tokenizer,StopWordsRemover

In [4]:
#Read data file into Spark DataFrame

#read csv file into dataFrame with automatically inferred schema

tweets_csv = spark.read.csv('dataset/tweets.csv',inferSchema=True,header=True)

tweets_csv.show(truncate=True,n=3)

+------+---------+---------------+--------------------+
|ItemID|Sentiment|SentimentSource|       SentimentText|
+------+---------+---------------+--------------------+
|  1038|        1|   Sentiment140|that film is fant...|
|  1804|        1|   Sentiment140|this music is rea...|
|  1693|        0|   Sentiment140|winter is terribl...|
+------+---------+---------------+--------------------+
only showing top 3 rows



In [5]:
# select only sentiment and sentiment column and cast sentiment column data into integer

data = tweets_csv.select("SentimentText",col("Sentiment").cast("Int").alias("label"))

data.show(truncate=False,n=3)


+---------------------------------+-----+
|SentimentText                    |label|
+---------------------------------+-----+
|that film is fantastic #brilliant|1    |
|this music is really bad #myband |1    |
|winter is terrible #thumbs-down  |0    |
+---------------------------------+-----+
only showing top 3 rows



# Divide Data into training and testing data


In [6]:
#divide data, 70% and training data 30% for testing

dividedData = data.randomSplit([0.7,0.3])
trainingData = dividedData[0] #index 0 = data training
testingData = dividedData[1] #index 1 = data testing
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows",train_rows, ",Testing data rows",test_rows)

Training data rows 1365 ,Testing data rows 567


# Prepare training data

In [7]:
#Seperate SentimentText into individual words using tokenizer

tokenizer = Tokenizer(inputCol="SentimentText",outputCol="SentimentWords")
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(truncate=False,n=5)

+-------------------------+-----+------------------------------+
|SentimentText            |label|SentimentWords                |
+-------------------------+-----+------------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|
+-------------------------+-----+------------------------------+
only showing top 5 rows



In [8]:
#Removing stop words (not req words to be features)

swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(),outputCol="MeaningfulWords")
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(truncate=False,n=5)

+-------------------------+-----+------------------------------+---------------------------+
|SentimentText            |label|SentimentWords                |MeaningfulWords            |
+-------------------------+-----+------------------------------+---------------------------+
|I adore cheese #bestever |1    |[i, adore, cheese, #bestever] |[adore, cheese, #bestever] |
|I adore cheese #brilliant|1    |[i, adore, cheese, #brilliant]|[adore, cheese, #brilliant]|
|I adore cheese #favorite |1    |[i, adore, cheese, #favorite] |[adore, cheese, #favorite] |
|I adore cheese #loveit   |1    |[i, adore, cheese, #loveit]   |[adore, cheese, #loveit]   |
|I adore cheese #thumbs-up|1    |[i, adore, cheese, #thumbs-up]|[adore, cheese, #thumbs-up]|
+-------------------------+-----+------------------------------+---------------------------+
only showing top 5 rows



# Converting words feature into numerical feature.

In [9]:
hashTF = HashingTF(inputCol = swr.getOutputCol(),outputCol="features")
numericTrainData = hashTF.transform(SwRemovedTrain).select("label","MeaningfulWords","features")
numericTrainData.show(truncate=False,n=5)


+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[65702,69876,108823],[1.0,1.0,1.0])|
|1    |[adore, cheese, #brilliant]|(262144,[61111,65702,69876],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[65702,69876,156543],[1.0,1.0,1.0])|
|1    |[adore, cheese, #loveit]   |(262144,[65702,65728,69876],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #thumbs-up]|(262144,[3984,65702,69876],[1.0,1.0,1.0])  |
+-----+---------------------------+-------------------------------------------+
only showing top 5 rows



# Train our classifier model using the training data

In [10]:
lr = LogisticRegression(labelCol='label',featuresCol="features",maxIter=10,regParam=0.01)
model = lr.fit(numericTrainData)
print("Training is done!")


Training is done!


In [11]:
#Prepare testing data

tokenizedTest = tokenizer.transform(testingData)
SwRemovedTest = swr.transform(tokenizedTest)
numericTest = hashTF.transform(SwRemovedTrain).select("label","MeaningfulWords","features")
numericTest.show(truncate=False,n=5)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[65702,69876,108823],[1.0,1.0,1.0])|
|1    |[adore, cheese, #brilliant]|(262144,[61111,65702,69876],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #favorite] |(262144,[65702,69876,156543],[1.0,1.0,1.0])|
|1    |[adore, cheese, #loveit]   |(262144,[65702,65728,69876],[1.0,1.0,1.0]) |
|1    |[adore, cheese, #thumbs-up]|(262144,[3984,65702,69876],[1.0,1.0,1.0])  |
+-----+---------------------------+-------------------------------------------+
only showing top 5 rows



In [12]:
numericTest.show(truncate=False,n=2)

+-----+---------------------------+-------------------------------------------+
|label|MeaningfulWords            |features                                   |
+-----+---------------------------+-------------------------------------------+
|1    |[adore, cheese, #bestever] |(262144,[65702,69876,108823],[1.0,1.0,1.0])|
|1    |[adore, cheese, #brilliant]|(262144,[61111,65702,69876],[1.0,1.0,1.0]) |
+-----+---------------------------+-------------------------------------------+
only showing top 2 rows



# Predict testing and calculate the accuracy model 

In [13]:
prediction = model.transform(numericTest)
predictionFinal = prediction.select("MeaningfulWords","prediction","Label")
predictionFinal.show(truncate=False,n=5)
correctPrediction = predictionFinal.filter(predictionFinal['prediction'] == predictionFinal['Label']).count()
totalData = predictionFinal.count()

print("correct prediction:", correctPrediction, ",total data:" , totalData, ",accuracy:",correctPrediction/totalData)

+---------------------------+----------+-----+
|MeaningfulWords            |prediction|Label|
+---------------------------+----------+-----+
|[adore, cheese, #bestever] |1.0       |1    |
|[adore, cheese, #brilliant]|1.0       |1    |
|[adore, cheese, #favorite] |1.0       |1    |
|[adore, cheese, #loveit]   |1.0       |1    |
|[adore, cheese, #thumbs-up]|1.0       |1    |
+---------------------------+----------+-----+
only showing top 5 rows

correct prediction: 1344 ,total data: 1365 ,accuracy: 0.9846153846153847
