In [None]:
#import the necessary libraries
import re
import shutil

import os

import findspark
# Honour SPARK_HOME when it is set so the notebook is not tied to one
# machine; otherwise fall back to the original local installation path.
findspark.init(os.environ.get('SPARK_HOME', '/Users/swapnilsinha/spark/spark-3.0.1-bin-hadoop3.2'))

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer,StopWordsRemover,Word2Vec,StringIndexer
from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.sql.functions import udf
from pyspark.sql.types import *


In [None]:
#Start (or reuse) the Spark session and load the raw tweets into a DataFrame
spark = (
    SparkSession.builder
    .appName('SentimentClassifierCreationWithSparkML')
    .getOrCreate()
)
#extract the compressed dataset into the datasets directory
shutil.unpack_archive(
    filename='datasets/training.1600000.processed.noemoticon.csv.tar.gz',
    extract_dir='datasets',
)
print('Archive file unpacked successfully.')
#load the extracted csv, letting Spark infer the column types
df = spark.read.csv(
    path='datasets/training.1600000.processed.noemoticon.csv',
    inferSchema=True,
)
#report how many records were read
print('Total Number of records in df : ',df.count())
#discard rows containing nulls, then exact duplicate rows
df = df.dropna().dropDuplicates()
print('Total Number of records in df after deleting duplicate and null records : ',df.count())

Archive file unpacked successfully.
Total Number of records in df :  1600000
Total Number of records in df after deleting duplicate and null records :  1600000


In [None]:
#Give the two columns used by the rest of the notebook readable names.
#The result must be bound to `df`: this cell and every later cell read the
#'text' and 'sentiment' columns from `df`, so binding it only to `Ndf`
#fails on a fresh kernel because `df` would still carry '_c0' / '_c5'.
df = df.withColumnRenamed('_c0','sentiment').withColumnRenamed('_c5','text')
Ndf = df  # alias retained for any code that still refers to the old name
#select the text and its corresponding sentiment from the dataframe
df.select('text','sentiment').show(5)

+--------------------+---------+
|                text|sentiment|
+--------------------+---------+
|  I want Miley to...|        0|
|            Exams!!!|        0|
| I wanna go home....|        0|
| I will not watch...|        0|
|        No Followers|        0|
+--------------------+---------+
only showing top 5 rows



In [None]:
#Clean and Prepare the Data
def removePattern(inputText, pattern):
    """Remove every substring of ``inputText`` that matches ``pattern``.

    Args:
        inputText: The text to clean.
        pattern: Regular expression describing the substrings to delete.

    Returns:
        ``inputText`` with all non-overlapping matches of ``pattern`` removed.
    """
    # Substitute with the pattern itself. Re-using each matched string as a
    # new regex misreads metacharacters inside the match (e.g. '(' or '.')
    # and lets a short match such as '@ab' clip the front of '@abc'.
    return re.sub(pattern, '', inputText)
#create a function to cleanse the tweets
def cleanTweet(txt):
    """Strip Twitter-specific noise from a tweet and keep only letters.

    The steps run in order: retweet prefixes, user handles, URLs, and
    finally every run of non-letter characters (replaced by one space).

    Args:
        txt: Raw tweet text, or None.

    Returns:
        The cleaned text, or None when ``txt`` is None.
    """
    # A null column value reaches a Python UDF as None; pass it through
    # instead of raising TypeError inside the regex calls.
    if txt is None:
        return None
    # Remove Twitter Return Handles (RT @xxx:)
    # (raw strings avoid the invalid '\w' escape-sequence warning)
    txt = removePattern(txt, r'RT @[\w]*:')
    # Remove Twitter Handles (@xxx)
    txt = removePattern(txt, r'@[\w]*')
    # Remove URL Links (httpxxx)
    txt = removePattern(txt, r'https?://[A-Za-z0-9./]*')
    # Remove Special Characters, Numbers and Punctuations
    txt = re.sub(r'[^A-Za-z]+', ' ', txt)
    return txt

In [None]:
#Wrap cleanTweet as a string-returning UDF and add the cleaned text as a new column
udfCleanTweet = udf(cleanTweet, returnType=StringType())
df = df.withColumn('cleanTweetText', udfCleanTweet(df['text']))

In [None]:
#List the distinct values present in the sentiment column.
distinctSentiments = df.select('sentiment').distinct()
distinctSentiments.show()

+---------+
|sentiment|
+---------+
|        4|
|        0|
+---------+



In [None]:
#Count how many rows carry each distinct sentiment value
sentimentCounts = df.groupBy('sentiment').count()
sentimentCounts.show()

+---------+------+
|sentiment| count|
+---------+------+
|        4|800000|
|        0|800000|
+---------+------+



In [None]:
#Translate the raw sentiment code into the training target.
def mapTarget(sentiment):
    """Return 1 when ``sentiment`` equals 4; otherwise return it unchanged."""
    if sentiment == 4:
        return 1
    return sentiment

In [None]:
#Wrap mapTarget as an integer-returning UDF and derive the target column from sentiment
udfMapTarget = udf(mapTarget, returnType=IntegerType())
df = df.withColumn('target', udfMapTarget(df['sentiment']))

In [None]:
#Row count per target value
targetCounts = df.groupBy('target').count()
targetCounts.show()

+------+------+
|target| count|
+------+------+
|     1|800000|
|     0|800000|
+------+------+



In [None]:
#Keep only the raw text, the cleaned text and the target, then preview five rows
df = df.select(['text', 'cleanTweetText', 'target'])
df.show(n=5)

+--------------------+--------------------+------+
|                text|      cleanTweetText|target|
+--------------------+--------------------+------+
|  I want Miley to...| I want Miley to ...|     0|
|            Exams!!!|              Exams |     0|
| I wanna go home....| I wanna go home ...|     0|
| I will not watch...| I will not watch...|     0|
|        No Followers|        No Followers|     0|
+--------------------+--------------------+------+
only showing top 5 rows



In [None]:
#Train Test Split using randomSplit
#A fixed seed is passed to the split and to Word2Vec so that re-running the
#notebook does not draw fresh random numbers each time.
SEED = 42
dfTrain,dfTest = df.randomSplit([0.8,0.2], seed=SEED)
#Feature Transformations
tokenizer = Tokenizer(inputCol='cleanTweetText', outputCol='tokenTweet')
stopRemover = StopWordsRemover(inputCol='tokenTweet',outputCol='filteredTokens')
word2Vec = Word2Vec(vectorSize=100, minCount=5, seed=SEED, inputCol='filteredTokens', outputCol='features')
#Order labels alphabetically so target 0 -> label 0.0 and target 1 -> label 1.0.
#The default frequency-based ordering gives index 0 to whichever class is
#more frequent in the training split, so the mapping could flip between runs.
labelStringIdx = StringIndexer(inputCol = 'target', outputCol = 'label', stringOrderType='alphabetAsc')
#Create the Logistic Regression Model
model = LogisticRegression(maxIter=100)
#Create the pipeline
dfPrepPipe = Pipeline(stages=[tokenizer,stopRemover,word2Vec,labelStringIdx,model])

In [None]:
#Fit the pipeline on the training split, then score the held-out split
pipeline_Fit = dfPrepPipe.fit(dataset=dfTrain)
predictions = pipeline_Fit.transform(dataset=dfTest)
#show which columns the fitted pipeline produced
predictions.printSchema()

root
 |-- text: string (nullable = true)
 |-- cleanTweetText: string (nullable = true)
 |-- target: integer (nullable = true)
 |-- tokenTweet: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filteredTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [None]:
#Preview the model inputs and outputs for the first five scored rows
previewColumns = ['cleanTweetText','features','label','rawPrediction','probability','prediction']
predictions.select(previewColumns).show(5)

+--------------------+--------------------+-----+--------------------+--------------------+----------+
|      cleanTweetText|            features|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
| I want Miley to ...|[0.01353734713047...|  0.0|[0.59144533590323...|[0.64369670410457...|       0.0|
|        love you lt |[-0.0808262992650...|  1.0|[-2.8843092921010...|[0.05293468369937...|       1.0|
|      Hello Twitter |[0.08169510029256...|  1.0|[-2.8707641781935...|[0.05361786209311...|       1.0|
| I wanna go home ...|[-0.0382282886033...|  0.0|[2.43213292335975...|[0.91924500906395...|       0.0|
|        No Followers|[0.16656325850635...|  0.0|[-2.9296898778204...|[0.05070525034315...|       1.0|
+--------------------+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
#Score the predictions with the evaluator's default settings
#NOTE(review): with no metricName given this presumably reports area under
#the ROC curve rather than an accuracy - confirm before quoting the number.
evaluator = BinaryClassificationEvaluator()
roc_accuracy = evaluator.evaluate(predictions)
rocMessage = 'ROC-Accuracy of model at predicting sentiment is: {:.4f}'.format(roc_accuracy)
print(rocMessage)

ROC-Accuracy of model at predicting sentiment is: 0.8276


In [None]:
#Accuracy = rows whose prediction equals the label / non-null target rows in the test split
testCountRow = dfTest.agg({'target':'count'}).collect()[0]
number_Of_TestRecord = testCountRow['count(target)']
isCorrect = predictions['label'] == predictions['prediction']
correctPredictions = predictions.filter(isCorrect).count()
accuracy = correctPredictions/number_Of_TestRecord
accuracyMessage = 'Accuracy of model at predicting sentiment is: {:.4f}'.format(accuracy)
print(accuracyMessage)

Accuracy of model at predicting sentiment is: 0.7479


In [None]:
#Save the trained model
#write().overwrite() lets this cell be re-run: a plain save() raises when
#'W2VLogreg.model' already exists from an earlier run.
pipeline_Fit.write().overwrite().save('W2VLogreg.model')