In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
#for structured data we use Spark SQL, SparkSession acts a pipeline between data and sql statements
from pyspark.sql import SparkSession

In [3]:
# sparksession is like a class and we need to create an instance of a class to utilize
spark = SparkSession.builder.appName("NLP_5A_Data_Processing").getOrCreate()

In [4]:
#Reading the csv file data
Movie_Reviews_DF = spark.read.csv("D:/Movie_reviews.csv", inferSchema = True, header = True)

In [5]:
#Seeing the shape of the dataset
print("Shape:", (Movie_Reviews_DF.count(), len(Movie_Reviews_DF.columns)))

Shape: (7087, 2)


In [6]:
#Looking at the schema
#both columns are of string type
Movie_Reviews_DF.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [7]:
#Loding the random function
from pyspark.sql.functions import rand 
#Displaying random observations from the data
Movie_Reviews_DF.orderBy(rand()).show(10,False)  # Note Sentiment values are read in as string

+------------------------------------------------------------------------+---------+
|Review                                                                  |Sentiment|
+------------------------------------------------------------------------+---------+
|Harry Potter dragged Draco Malfoy ’ s trousers down past his hips and   |0        |
|the Da Vinci Code sucked.                                               |0        |
|Oh, and Brokeback Mountain is a TERRIBLE movie...                       |0        |
|Brokeback Mountain is fucking horrible..                                |0        |
|The Da Vinci Code is awesome!!                                          |1        |
|I want to be here because I love Harry Potter, and I really want a place|1        |
|Harry Potter dragged Draco Malfoy ’ s trousers down past his hips and   |0        |
|As I sit here, watching the MTV Movie Awards, I am reminded of how much |0        |
|I love Harry Potter..                                           

In [8]:
#Cleaning the data
#Filtering the data for data only with only either 0 or 1 sentiment value
Movie_Reviews_DF = Movie_Reviews_DF.filter(((Movie_Reviews_DF.Sentiment =='1') | (Movie_Reviews_DF.Sentiment =='0')))

In [9]:
#Checking the count to see if any rows are deleted (rows with different sentiment values)
Movie_Reviews_DF.count()

6990

In [10]:
#Grouping by sentiment values to see balance of data
#(Fairly balanced)
Movie_Reviews_DF.groupBy('Sentiment').count().show()

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [11]:
#Looking at the schema again
Movie_Reviews_DF.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [12]:
#in order to perform logistic regression 
#we should have sentiment value of numeric datatype
#Adding a column label to store converted float values 
#from string value in Sentiment (and dropping the Sentiment(String type) column)
Movie_Reviews_DF = Movie_Reviews_DF.withColumn("Label", Movie_Reviews_DF.Sentiment.cast('float')).drop('Sentiment')

In [13]:
#Looking at the schema again
Movie_Reviews_DF.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)



In [14]:
#Displaying random data
Movie_Reviews_DF.orderBy(rand()).show(10,False)

+----------------------------------------------------------------------------+-----+
|Review                                                                      |Label|
+----------------------------------------------------------------------------+-----+
|I am going to start reading the Harry Potter series again because that i    |1.0  |
|Harry Potter is AWESOME I don't care if anyone says differently!..          |1.0  |
|I want to be here because I love Harry Potter, and I really want a place    |1.0  |
|But if Crash won the academy award, Brokeback Mountain must have sucked     |0.0  |
|Because I would like to make friends who like the same things I like, an    |1.0  |
|Brokeback Mountain was an AWESOME movie.                                    |1.0  |
|the last stand and Mission Impossible 3 both were awesome movies.           |1.0  |
|Other than that, all I've heard is that the Da Vinci Code kinda sucks!      |0.0  |
|"I think the movie "" Brokeback Mountain "" was stupid and overe

In [15]:
#Checking for the values after transformation
Movie_Reviews_DF.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  1.0| 3909|
|  0.0| 3081|
+-----+-----+



In [16]:
# Adding length column to the dataframe
#Length of the review might matter because repetition of words would occur in the same review
#Loading length function 
from pyspark.sql.functions import length

In [17]:
#For each row calculating length of review and adding it to a new column
Movie_Reviews_DF = Movie_Reviews_DF.withColumn('length',length(Movie_Reviews_DF['Review']))

In [18]:
#Displaying the data
Movie_Reviews_DF.orderBy(rand()).show(5,False)

+------------------------------------------------------------------------+-----+------+
|Review                                                                  |Label|length|
+------------------------------------------------------------------------+-----+------+
|I hate Harry Potter.                                                    |0.0  |20    |
|the last stand and Mission Impossible 3 both were awesome movies.       |1.0  |65    |
|we're gonna like watch Mission Impossible or Hoot.(                     |1.0  |51    |
|I think I hate Harry Potter because it outshines much better reading mat|0.0  |72    |
|I hate Harry Potter.                                                    |0.0  |20    |
+------------------------------------------------------------------------+-----+------+
only showing top 5 rows



In [19]:
#Average length of a review for a 0 and 1 sentiment review(negative and positive)
#Fairly close
Movie_Reviews_DF.groupBy('Label').agg({'Length':'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



In [20]:
# Data Preprocessing for NLP

In [21]:
#Tokenization
#Importing the Tokenizer function
from pyspark.ml.feature import Tokenizer

In [22]:
#Taking review column and creating new column tokens for storing the tokens created from review column
tokenization = Tokenizer(inputCol='Review',outputCol='Tokens')

In [23]:
#Applying the Tokenizer function to the dataframe
Tokenized_DF = tokenization.transform(Movie_Reviews_DF)

In [24]:
#looking at the tokens columns
Tokenized_DF.select('Tokens').show(10, False)

+----------------------------------------------------------------------------------------+
|Tokens                                                                                  |
+----------------------------------------------------------------------------------------+
|[the, da, vinci, code, book, is, just, awesome.]                                        |
|[this, was, the, first, clive, cussler, i've, ever, read,, but, even, books, like, rel] |
|[i, liked, the, da, vinci, code, a, lot.]                                               |
|[i, liked, the, da, vinci, code, a, lot.]                                               |
|[i, liked, the, da, vinci, code, but, it, ultimatly, didn't, seem, to, hold, it's, own.]|
|[that's, not, even, an, exaggeration, ), and, at, midnight, we, went, to, wal-mart, to] |
|[i, loved, the, da, vinci, code,, but, now, i, want, something, better, and, different] |
|[i, thought, da, vinci, code, was, great,, same, with, kite, runner.]                   |

In [25]:
#To get a count of tokens for each row before removing the stop words
# importing size function from sql functions
from pyspark.sql.functions import size

In [26]:
#Selecting all columns from dataframe and adding a new column based on no of tokens in each observation
# Size is a sql function to count number of items in a list
Tokenized_DF = Tokenized_DF.select('*',size('Tokens').alias('Tokens_Count'))

In [27]:
#looking at the tokens and tokens count columns
#Tokenization converts sentences to lower case and then creates tokens
Tokenized_DF.select('Tokens','Tokens_Count').show(10, False)

+----------------------------------------------------------------------------------------+------------+
|Tokens                                                                                  |Tokens_Count|
+----------------------------------------------------------------------------------------+------------+
|[the, da, vinci, code, book, is, just, awesome.]                                        |8           |
|[this, was, the, first, clive, cussler, i've, ever, read,, but, even, books, like, rel] |14          |
|[i, liked, the, da, vinci, code, a, lot.]                                               |8           |
|[i, liked, the, da, vinci, code, a, lot.]                                               |8           |
|[i, liked, the, da, vinci, code, but, it, ultimatly, didn't, seem, to, hold, it's, own.]|15          |
|[that's, not, even, an, exaggeration, ), and, at, midnight, we, went, to, wal-mart, to] |14          |
|[i, loved, the, da, vinci, code,, but, now, i, want, something,

In [28]:
#Removal of stopwords
#Importing the StopWordsRemover function
from pyspark.ml.feature import StopWordsRemover

In [29]:
#Taking Tokens column and creating new column Refined Tokens for storing the tokens after removal of stopwords
stopword_removal=StopWordsRemover(inputCol='Tokens',outputCol='Refined_Tokens')

In [30]:
#Applying the StopWordsRemover function to the dataframe
Refined_DF = stopword_removal.transform(Tokenized_DF)

In [31]:
#Selecting only the refined tokens column which has tokens after stop words have been removed
Refined_DF.select(['Refined_Tokens']).show(10,False)

+-------------------------------------------------------------+
|Refined_Tokens                                               |
+-------------------------------------------------------------+
|[da, vinci, code, book, awesome.]                            |
|[first, clive, cussler, ever, read,, even, books, like, rel] |
|[liked, da, vinci, code, lot.]                               |
|[liked, da, vinci, code, lot.]                               |
|[liked, da, vinci, code, ultimatly, seem, hold, own.]        |
|[even, exaggeration, ), midnight, went, wal-mart]            |
|[loved, da, vinci, code,, want, something, better, different]|
|[thought, da, vinci, code, great,, kite, runner.]            |
|[da, vinci, code, actually, good, movie...]                  |
|[thought, da, vinci, code, pretty, good, book.]              |
+-------------------------------------------------------------+
only showing top 10 rows



In [32]:
#To get a count of tokens for each row after removing the stop words
#importing size function from sql functions
from pyspark.sql.functions import size

In [33]:
#Selecting all columns from dataframe and adding a new column based on no of refined tokens in each observation
#Size is a sql function to count number of items in a list
Refined_DF = Refined_DF.select('*',size('Refined_Tokens').alias('Refined_Tokens_Count'))

In [34]:
#Looking at the tokens,tokens count and refined tokens, refined tokens count columns
#To see if the counts vary which indicates removal of stop words in tokens 
Refined_DF.select('Tokens','Tokens_Count','Refined_Tokens','Refined_Tokens_Count').show(10, False)

+----------------------------------------------------------------------------------------+------------+-------------------------------------------------------------+--------------------+
|Tokens                                                                                  |Tokens_Count|Refined_Tokens                                               |Refined_Tokens_Count|
+----------------------------------------------------------------------------------------+------------+-------------------------------------------------------------+--------------------+
|[the, da, vinci, code, book, is, just, awesome.]                                        |8           |[da, vinci, code, book, awesome.]                            |5                   |
|[this, was, the, first, clive, cussler, i've, ever, read,, but, even, books, like, rel] |14          |[first, clive, cussler, ever, read,, even, books, like, rel] |9                   |
|[i, liked, the, da, vinci, code, a, lot.]                       

In [35]:
#Looking at random data
Refined_DF.orderBy(rand()).show(4, False)

+------------------------------------------------------------------------+-----+------+-----------------------------------------------------------------------------------------+------------+-------------------------------------------------+--------------------+
|Review                                                                  |Label|length|Tokens                                                                                   |Tokens_Count|Refined_Tokens                                   |Refined_Tokens_Count|
+------------------------------------------------------------------------+-----+------+-----------------------------------------------------------------------------------------+------------+-------------------------------------------------+--------------------+
|My dad's being stupid about brokeback mountain...                       |0.0  |49    |[my, dad's, being, stupid, about, brokeback, mountain...]                                |7           |[dad's, stupid, brokebac

In [36]:
#Looking at schema
Refined_DF.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- length: integer (nullable = true)
 |-- Tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Tokens_Count: integer (nullable = false)
 |-- Refined_Tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Refined_Tokens_Count: integer (nullable = false)



In [37]:
# Count Vectorization

In [38]:
#Creating features based on Count Vectorization in PySpark using the Refined dataframe

In [39]:
#Imprting function for CV calculation
from pyspark.ml.feature import CountVectorizer

In [40]:
#Taking refined tokens column and creating new column CV features 
count_vec=CountVectorizer(inputCol='Refined_Tokens', outputCol='CV_features')

In [41]:
CV_DF=count_vec.fit(Refined_DF).transform(Refined_DF)

In [42]:
CV_DF.select(['Refined_Tokens','CV_features']).show(10,False)

+-------------------------------------------------------------+----------------------------------------------------------------------------------+
|Refined_Tokens                                               |CV_features                                                                       |
+-------------------------------------------------------------+----------------------------------------------------------------------------------+
|[da, vinci, code, book, awesome.]                            |(2302,[0,1,4,43,236],[1.0,1.0,1.0,1.0,1.0])                                       |
|[first, clive, cussler, ever, read,, even, books, like, rel] |(2302,[11,51,229,237,275,742,824,1087,1250],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[liked, da, vinci, code, lot.]                               |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])                                       |
|[liked, da, vinci, code, lot.]                               |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])            

In [None]:
#this method takes each word in the BoW and counts how many times that word appears in each document. It is basically computing Term Frequency (TF) or the number of times each word occurs in each document.

In [43]:
CV_DF_Model = CV_DF.select(['CV_features','Label'])

In [44]:
CV_DF_Model.select(['Label','CV_features']).show(10,False)

+-----+----------------------------------------------------------------------------------+
|Label|CV_features                                                                       |
+-----+----------------------------------------------------------------------------------+
|1.0  |(2302,[0,1,4,43,236],[1.0,1.0,1.0,1.0,1.0])                                       |
|1.0  |(2302,[11,51,229,237,275,742,824,1087,1250],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1.0  |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])                                       |
|1.0  |(2302,[0,1,4,53,356],[1.0,1.0,1.0,1.0,1.0])                                       |
|1.0  |(2302,[0,1,4,53,655,1339,1427,1449],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])            |
|1.0  |(2302,[46,229,271,1150,1990,2203],[1.0,1.0,1.0,1.0,1.0,1.0])                      |
|1.0  |(2302,[0,1,22,30,111,219,389,535],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])              |
|1.0  |(2302,[0,1,4,228,1258,1716,2263],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                   |

In [45]:
#To see the schema of the dataset
CV_DF_Model.printSchema()

root
 |-- CV_features: vector (nullable = true)
 |-- Label: float (nullable = true)



In [46]:
#Splitting the data of CV model
CV_Training_DF,CV_Test_DF = CV_DF_Model.randomSplit([0.75,0.25])

In [47]:
#Checking the balance of training dataframe of CV model
CV_Training_DF.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2921|
|  0.0| 2312|
+-----+-----+



In [48]:
#Checking the balance of testing dataframe of CV model
CV_Test_DF.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  988|
|  0.0|  769|
+-----+-----+



In [49]:
#Term Frequency(TF) and Inverse Document Frequency(IDF)

In [50]:
#Creating features based on TF-IDF in PySpark using the Refined dataframe

In [51]:
#Imprting function for TF and IDF calculation
from pyspark.ml.feature import HashingTF,IDF

In [52]:
#TERM FREQUENCY
#It is the score based on the number of times the word appears in current dataframe
#Taking refined tokens column and creating new column tf features for storing the tf value created
hashing_vec=HashingTF(inputCol='Refined_Tokens',outputCol='TF_features')

In [53]:
#Applying the HashingTF function to the dataframe
Hashing_DF = hashing_vec.transform(Refined_DF)

In [54]:
#Looking at the refined tokens and corresoponding TF features columns
Hashing_DF.select(['Refined_Tokens','TF_features']).show(4,False)

+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|Refined_Tokens                                              |TF_features                                                                                                  |
+------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|[da, vinci, code, book, awesome.]                           |(262144,[93284,111793,189113,212976,235054],[1.0,1.0,1.0,1.0,1.0])                                           |
|[first, clive, cussler, ever, read,, even, books, like, rel]|(262144,[47372,82111,113624,120246,139559,174966,203802,208258,227467],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[liked, da, vinci, code, lot.]                              |(262144,[32675,93284,111793,227152,235054],[1.0,1.0,1.0,1.0,1.0])        

In [None]:
#The term TF represents term frequency and computes a ratio of the number of times a word appears in a document divided by the total number of terms in that document. It therefore attempts to measure “importance” – higher the value, the more frequently the term was used with respect to other terms in the document. 

In [55]:
#[da, vinci, code, book, awesome.]---(262144,[93284,111793,189113,212976,235054],[1.0,1.0,1.0,1.0,1.0])  
#262144 - Total number of tokens in the dataframe
#93284 - frequency of the word da
#111793 - frequency of the word vinci
#189113 - frequency of the word code
#212976 - frequency of the word book
#235054 - frequency of the word awesome
#[1.0,1.0,1.0,1.0,1.0])-- list indicating the presence of the words [da, vinci, code, book, awesome.] in the review with 1

In [56]:
#INVERSE DOCUMENT FREQUENCY
#It is calculated by taking the total number of documents, dividing it by the number of documents that contain a word, and calculating the logarithm
#Taking TF features column and creating new column TF-IDF features for storing the TF-IDF value created
TF_IDF_vec=IDF(inputCol='TF_features',outputCol='TF_IDF_features')

In [None]:
#The term IDF, computes a measure of relative importance of the term with respect to the same term used in all other documents in the corpus. Thus, if a term appears in all documents, it’s not helping in differentiating documents. Such a term will therefore be assigned a very low relative importance.

In [57]:
#Applying the IDF function to the dataframe
TF_IDF_DF = TF_IDF_vec.fit(Hashing_DF).transform(Hashing_DF)

In [58]:
#Looking at the refined tokens and corresoponding TF-IDF features columns
#Multiplying these TF and IDF results in the TF-IDF score of a word in a document. 
#The higher the score, the more relevant that word is in that particular document.
TF_IDF_DF.select(['Refined_Tokens','TF_IDF_features']).show(10,False)

+-------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Refined_Tokens                                               |TF_IDF_features                                                                                                                                                                                                                           |
+-------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[da, vinci, code, book, awesome.]                            |(262144,[93284,111793,189113,212976,2350

In [None]:
#The product of TF and IDF gives us the TL-IDF score or weight that ranks each term by its relative importance. 


In [59]:
TF_IDF_DF_Model = TF_IDF_DF.select(['TF_IDF_features','Label'])

In [60]:
TF_IDF_DF_Model.select(['Label','TF_IDF_features']).show(10,False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Label|TF_IDF_features                                                                                                                                                                                                                           |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0  |(262144,[93284,111793,189113,212976,235054],[1.469010739519602,1.2610218398134343,6.079790164272204,4.0401945311395675,1.2620319409094194])                                                                                               |
|1.0  |(262144,[47372,82111,

In [61]:
#To see the schema of the dataset
TF_IDF_DF_Model.printSchema()

root
 |-- TF_IDF_features: vector (nullable = true)
 |-- Label: float (nullable = true)



In [62]:
#Splitting the data of TFIDF model
TFIDF_Training_DF,TFIDF_Test_DF = TF_IDF_DF_Model.randomSplit([0.75,0.25])

In [63]:
#Checking the balance of training dataframe
TFIDF_Training_DF.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2937|
|  0.0| 2313|
+-----+-----+



In [64]:
#Checking the balance of testing dataframe
TFIDF_Test_DF.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  972|
|  0.0|  768|
+-----+-----+



In [65]:
#Importing Logistic Regression
from pyspark.ml.classification import LogisticRegression

In [66]:
#Logistic Regression model(Using CV)
CV_log_reg=LogisticRegression(featuresCol='CV_features',labelCol='Label').fit(CV_Training_DF)

In [67]:
#Get Training Summary(Using CV)

In [68]:
CV_training_summary = CV_log_reg.summary
print("Area Under ROC:" + str(CV_training_summary.areaUnderROC))
print("Weighted Accuracy:" + str(CV_training_summary.accuracy))
print("Weighted Recall:" + str(CV_training_summary.weightedRecall))
print("Weighted Precision:" + str(CV_training_summary.weightedPrecision))
print("Weighted F1 Measure:" + str(CV_training_summary.weightedFMeasure()))

Area Under ROC:0.9999984452165384
Weighted Accuracy:1.0
Weighted Recall:1.0
Weighted Precision:1.0
Weighted F1 Measure:1.0


In [69]:
#Evaluation of test data (Using CV)
CV_results=CV_log_reg.evaluate(CV_Test_DF).predictions

In [70]:
#Displaying the results of TFIDF
CV_results.show(10, False)

+-------------------------------------------------------------------------------------+-----+----------------------------------------+------------------------------------------+----------+
|CV_features                                                                          |Label|rawPrediction                           |probability                               |prediction|
+-------------------------------------------------------------------------------------+-----+----------------------------------------+------------------------------------------+----------+
|(2302,[0,1,4,5,12,305,340],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                            |1.0  |[-26.653836401563705,26.653836401563705]|[2.656965585654917E-12,0.999999999997343] |1.0       |
|(2302,[0,1,4,5,64,2029],[1.0,1.0,1.0,1.0,1.0,1.0])                                   |1.0  |[-16.511341767538717,16.511341767538717]|[6.748625867178597E-8,0.9999999325137413] |1.0       |
|(2302,[0,1,4,5,220,247,338,636,1706],[1.0,1.0,1.0,1.0,

In [71]:
CV_results.select('Label', 'prediction','probability').show(20,False)

+-----+----------+------------------------------------------+
|Label|prediction|probability                               |
+-----+----------+------------------------------------------+
|1.0  |1.0       |[2.656965585654917E-12,0.999999999997343] |
|1.0  |1.0       |[6.748625867178597E-8,0.9999999325137413] |
|1.0  |0.0       |[0.9320254537061838,0.06797454629381615]  |
|1.0  |1.0       |[9.79549026155353E-12,0.9999999999902045] |
|1.0  |1.0       |[1.1385305576935607E-9,0.9999999988614694]|
|1.0  |1.0       |[8.308494789964832E-7,0.999999169150521]  |
|0.0  |0.0       |[0.9999999999999858,1.426857540299771E-14]|
|1.0  |1.0       |[4.5842490699912783E-7,0.999999541575093] |
|1.0  |1.0       |[2.575831872851638E-19,1.0]               |
|1.0  |1.0       |[7.052233925854608E-10,0.9999999992947766]|
|1.0  |1.0       |[7.052233925854608E-10,0.9999999992947766]|
|1.0  |1.0       |[7.052233925854608E-10,0.9999999992947766]|
|1.0  |1.0       |[7.052233925854608E-10,0.9999999992947766]|
|1.0  |1

In [72]:
#confusion matrix for CV results
CV_true_positives = CV_results[(CV_results.Label == 1) & (CV_results.prediction == 1)].count()
CV_true_negatives = CV_results[(CV_results.Label == 0) & (CV_results.prediction == 0)].count()
CV_false_positives = CV_results[(CV_results.Label == 0) & (CV_results.prediction == 1)].count()
CV_false_negatives = CV_results[(CV_results.Label == 1) & (CV_results.prediction == 0)].count()

In [73]:
#Displaying Confurion matrix of CV
print("CV_true_postives ARE :", CV_true_positives , "CV_true_negatives ARE :", CV_true_negatives)
print("CV_false_postives ARE :" ,CV_false_positives , "CV_false_negatives ARE :" ,CV_false_negatives)

CV_true_postives ARE : 978 CV_true_negatives ARE : 743
CV_false_postives ARE : 26 CV_false_negatives ARE : 10


In [74]:
#CV Recall Value
CV_recall = float(CV_true_positives)/(CV_true_positives + CV_false_negatives)
print("CV Recall Value is :" ,CV_recall)

CV Recall Value is : 0.9898785425101214


In [75]:
#CV Precision Value
CV_precision = float(CV_true_positives) / (CV_true_positives + CV_false_positives)
print("CV Precision Value is :" ,CV_precision)

CV Precision Value is : 0.9741035856573705


In [76]:
#CV Accuracy Value
CV_accuracy=float((CV_true_positives + CV_true_negatives) /(CV_results.count()))
print("Cv Accuracy Value is :" ,CV_accuracy)

Cv Accuracy Value is : 0.9795105293113261


In [77]:
#Logistic Regression model(Using TF-IDF)
TFIDF_log_reg=LogisticRegression(featuresCol='TF_IDF_features',labelCol='Label').fit(TFIDF_Training_DF)

In [78]:
#Get Training Summary(Using TF-IDF)

In [79]:
TFIDF_training_summary = TFIDF_log_reg.summary
print("Area Under ROC:" + str(TFIDF_training_summary.areaUnderROC))
print("Weighted Accuracy:" + str(TFIDF_training_summary.accuracy))
print("Weighted Recall:" + str(TFIDF_training_summary.weightedRecall))
print("Weighted Precision:" + str(TFIDF_training_summary.weightedPrecision))
print("Weighted F1 Measure:" + str(TFIDF_training_summary.weightedFMeasure()))

Area Under ROC:0.9999981599465707
Weighted Accuracy:1.0
Weighted Recall:1.0
Weighted Precision:1.0
Weighted F1 Measure:1.0


In [80]:
#Evaluation of test data (Using TF-IDF)
TFIDF_results=TFIDF_log_reg.evaluate(TFIDF_Test_DF).predictions

In [81]:
#Displaying the results of TFIDF
TFIDF_results.show(10, False)

+----------------------------------------------------------------------------------------------------------+-----+----------------------------------------+------------------------------------------+----------+
|TF_IDF_features                                                                                           |Label|rawPrediction                           |probability                               |prediction|
+----------------------------------------------------------------------------------------------------------+-----+----------------------------------------+------------------------------------------+----------+
|(262144,[14,535,31179,197995],[3.203404648350779,1.8474968967991254,4.398031590258477,1.3535091525350544])|0.0  |[20.055594793030522,-20.055594793030522]|[0.9999999980503087,1.9496912797521345E-9]|0.0       |
|(262144,[14,535,31179,197995],[3.203404648350779,1.8474968967991254,4.398031590258477,1.3535091525350544])|0.0  |[20.055594793030522,-20.055594793030522]|[0.99

In [82]:
TFIDF_results.select('Label', 'prediction','probability').show(20,False)

+-----+----------+-------------------------------------------+
|Label|prediction|probability                                |
+-----+----------+-------------------------------------------+
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.9496912797521345E-9] |
|0.0  |0.0       |[0.9999999980503087,1.949691279752134

In [83]:
#For TF-IDF results BinaryClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [84]:
#confusion matrix for TFIDF results
TFIDF_true_positives = TFIDF_results[(TFIDF_results.Label == 1) & (TFIDF_results.prediction == 1)].count()
TFIDF_true_negatives = TFIDF_results[(TFIDF_results.Label == 0) & (TFIDF_results.prediction == 0)].count()
TFIDF_false_positives = TFIDF_results[(TFIDF_results.Label == 0) & (TFIDF_results.prediction == 1)].count()
TFIDF_false_negatives = TFIDF_results[(TFIDF_results.Label == 1) & (TFIDF_results.prediction == 0)].count()

In [85]:
#Displaying Confurion matrix of TFIDF
print("TFIDF_true_postives ARE :", TFIDF_true_positives , " and TFIDF_true_negatives ARE :", TFIDF_true_negatives)
print("TFIDF_false_postives ARE :" ,TFIDF_false_positives , " and TFIDF_false_negatives ARE :" ,TFIDF_false_negatives)

TFIDF_true_postives ARE : 960  and TFIDF_true_negatives ARE : 738
TFIDF_false_postives ARE : 30  and TFIDF_false_negatives ARE : 12


In [86]:
#TFIDF Recall Value
TFIDF_recall = float(TFIDF_true_positives)/(TFIDF_true_positives + TFIDF_false_negatives)
print("TFIDF Recall Value is :" ,TFIDF_recall)

TFIDF Recall Value is : 0.9876543209876543


In [87]:
#TFIDF Precision Value
TFIDF_precision = float(TFIDF_true_positives) / (TFIDF_true_positives + TFIDF_false_positives)
print("TFIDF Precision Value is :" ,TFIDF_precision)

TFIDF Precision Value is : 0.9696969696969697


In [88]:
#TFIDF Accuracy Value
TFIDF_accuracy=float((TFIDF_true_positives+TFIDF_true_negatives) /(TFIDF_results.count()))
print("TFIDF Accuracy Value is :" ,TFIDF_accuracy)

TFIDF Accuracy Value is : 0.9758620689655172


In [None]:
#Precision is about the number of actual positive cases out of all the positive
#cases predicted by the model
#CV Precision Value is : 0.9741035856573705(97%)
#TFIDF Precision Value is : 0.9696969696969697(96%)

In [None]:
#Recall:
#It talks about the quality of the machine learning model when it comes
#to predicting a positive class. So out of total positive classes, how many
#was the model able to predict correctly? This metric is widely used as
#evaluation criteria for classification models.
#CV Recall Value is : 0.9898785425101214(98%)
#TFIDF Recall Value is : 0.9876543209876543(98%)

In [None]:
#TFIDF Accuracy Value is : 0.9758620689655172(97%)
#Cv Accuracy Value is : 0.9795105293113261(97%)

In [None]:
#Even if the accuracy is wrong we can see the models are equally performing well by precision and recall
#Both method are equally accurate
#Where as using TFIDF has a better precision
