In [2]:
#create spark session
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('MovieClassification_nlp_ML').getOrCreate()

In [3]:
#read dataset
textRevs_df=spark.read.csv('C:/Users/Owner/Documents/Machine Learning/Movie_reviews.csv', inferSchema=True,header=True, sep=',')

In [4]:
#know the columns and their datatypes
textRevs_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [5]:
#count the number of rows
textRevs_df.count()

7087

In [6]:
#select random rows from the dataset to see how they're.. using rand function
from pyspark.sql.functions import rand 
textRevs_df.orderBy(rand()).show(10,False)  # Note Sentiment values are read in as string

+------------------------------------------------------------------------+---------+
|Review                                                                  |Sentiment|
+------------------------------------------------------------------------+---------+
|Because I would like to make friends who like the same things I like, an|1        |
|Mission Impossible 3 was excellent.                                     |1        |
|As I sit here, watching the MTV Movie Awards, I am reminded of how much |0        |
|I am going to start reading the Harry Potter series again because that i|1        |
|I love Harry Potter.                                                    |1        |
|da vinci code was an awesome movie...                                   |1        |
|I love Harry Potter.                                                    |1        |
|I love Harry Potter..                                                   |1        |
|Is it just me, or does Harry Potter suck?...                    

Because they are of string data type, they are left aligned

# Data Cleaning

In [7]:
#As sentiment column has values other than 0 and 1, we are going to filter and make a fresh dataset using filter 
textRevs_df = textRevs_df.filter(((textRevs_df.Sentiment =='1') | (textRevs_df.Sentiment =='0')))
textRevs_df.count()

6990

In [8]:
#Checking distribution of rows for each sentiment values of 0 and 1
textRevs_df.groupBy('Sentiment').count().show()

+---------+-----+
|Sentiment|count|
+---------+-----+
|        0| 3081|
|        1| 3909|
+---------+-----+



In [9]:
#check the columns and their datatypes using printSchema
textRevs_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Sentiment: string (nullable = true)



 Sentiment value read in is not a float, required by LR. So, we have to convert it. 
 Since we will end up with duplicate columns with the same data, we will drop the original
 sentiment column 

If model has more positive reviews than negative reviews, ml will lean towards positive.. so the closer the groups of divisions of 0 and 1 are, the more accurate the model will be

In [10]:
textRevs_df = textRevs_df.withColumn("Label", textRevs_df.Sentiment.cast('float')).drop('Sentiment')

Above thing can be done with stringindexer too

In [11]:
textRevs_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)



In [12]:
textRevs_df.orderBy(rand()).show(10,False)

+------------------------------------------------------------+-----+
|Review                                                      |Label|
+------------------------------------------------------------+-----+
|Brokeback Mountain was boring.                              |0.0  |
|mission impossible 2 rocks!!....                            |1.0  |
|Brokeback Mountain was so awesome.                          |1.0  |
|Is it just me, or does Harry Potter suck?...                |0.0  |
|Love luv lubb the Da Vinci Code!                            |1.0  |
|He's like,'YEAH I GOT ACNE AND I LOVE BROKEBACK MOUNTAIN '..|1.0  |
|Brokeback Mountain was boring.                              |0.0  |
|Brokeback Mountain was boring.                              |0.0  |
|mission impossible 2 rocks!!....                            |1.0  |
|mission impossible 2 rocks!!....                            |1.0  |
+------------------------------------------------------------+-----+
only showing top 10 rows



The data is not clean.. there are many punctuations which are unneccessary. When we tokenize its gonna convert it to a word. Sometimes lengths of sentences also differ.
We're gonna look into these kinds of things before we analyse

Again looking at data to make sure we haven't lost any during trnasformation

In [13]:
textRevs_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  1.0| 3909|
|  0.0| 3081|
+-----+-----+



In [14]:
# Add length to the dataframe
from pyspark.sql.functions import length

In [15]:
textRevs_df=textRevs_df.withColumn('length',length(textRevs_df['Review']))

In [16]:
textRevs_df.orderBy(rand()).show(5,True)

+--------------------+-----+------+
|              Review|Label|length|
+--------------------+-----+------+
|Always knows what...|  0.0|    61|
|So as felicia's m...|  1.0|    71|
|I LOVE THE DA VIN...|  1.0|    28|
|I think I hate Ha...|  0.0|    72|
|The Da Vinci Code...|  1.0|    30|
+--------------------+-----+------+
only showing top 5 rows



Stop word remover removes vocab but punctuations wont be removed.
agg is performed on length and finds avg of set of values in that column


To handle punctuations, create a function having a list of punctuations and check for all the reviews with the function and replace them.

In [17]:
textRevs_df.groupBy('Label').agg({'Length':'mean'}).show()

+-----+-----------------+
|Label|      avg(Length)|
+-----+-----------------+
|  1.0|47.61882834484523|
|  0.0|50.95845504706264|
+-----+-----------------+



we can observe that negative comments are 3 more than positive comments. This is nothing to be bothered about

In [18]:
from pyspark.sql.functions import regexp_replace, trim, col, lower

def removePunctuation(column):
    return trim(lower(regexp_replace(column, '([^\s\w_]|_)+', '')))

In [19]:
textRevs_df = textRevs_df.withColumn('review_nopunct', removePunctuation(col('Review')))

review_nopunct column has all the punctuations removed from the reviews.

In [20]:
textRevs_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- length: integer (nullable = true)
 |-- review_nopunct: string (nullable = true)



In [21]:
textRevs_df.show(5,False)

+------------------------------------------------------------------------+-----+------+----------------------------------------------------------------------+
|Review                                                                  |Label|length|review_nopunct                                                        |
+------------------------------------------------------------------------+-----+------+----------------------------------------------------------------------+
|The Da Vinci Code book is just awesome.                                 |1.0  |39    |the da vinci code book is just awesome                                |
|this was the first clive cussler i've ever read, but even books like Rel|1.0  |72    |this was the first clive cussler ive ever read but even books like rel|
|i liked the Da Vinci Code a lot.                                        |1.0  |32    |i liked the da vinci code a lot                                       |
|i liked the Da Vinci Code a lot.             

# Data Preparation for NLP

In [22]:
#Tokenizer splits the sentence into words and also all the letters are converted to lower cases.
from pyspark.ml.feature import Tokenizer
tokenization=Tokenizer(inputCol='review_nopunct',outputCol='tokens')
tokenized_df=tokenization.transform(textRevs_df)
tokenized_df.select('tokens').show(10, False)

+--------------------------------------------------------------------------------------+
|tokens                                                                                |
+--------------------------------------------------------------------------------------+
|[the, da, vinci, code, book, is, just, awesome]                                       |
|[this, was, the, first, clive, cussler, ive, ever, read, but, even, books, like, rel] |
|[i, liked, the, da, vinci, code, a, lot]                                              |
|[i, liked, the, da, vinci, code, a, lot]                                              |
|[i, liked, the, da, vinci, code, but, it, ultimatly, didnt, seem, to, hold, its, own] |
|[thats, not, even, an, exaggeration, , and, at, midnight, we, went, to, walmart, to]  |
|[i, loved, the, da, vinci, code, but, now, i, want, something, better, and, different]|
|[i, thought, da, vinci, code, was, great, same, with, kite, runner]                   |
|[the, da, vinci, cod

In [23]:
#Some very common words such as 'this', 'the', 'to' etc are known as stop words. 
#In order to decrease the computation overhead, its always a good idea to drop them
#hence we use stopwordsremover
from pyspark.ml.feature import StopWordsRemover
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')
refined_df=stopword_removal.transform(tokenized_df)
refined_df.select(['refined_tokens']).show(10,False)

+----------------------------------------------------------------+
|refined_tokens                                                  |
+----------------------------------------------------------------+
|[da, vinci, code, book, awesome]                                |
|[first, clive, cussler, ive, ever, read, even, books, like, rel]|
|[liked, da, vinci, code, lot]                                   |
|[liked, da, vinci, code, lot]                                   |
|[liked, da, vinci, code, ultimatly, didnt, seem, hold]          |
|[thats, even, exaggeration, , midnight, went, walmart]          |
|[loved, da, vinci, code, want, something, better, different]    |
|[thought, da, vinci, code, great, kite, runner]                 |
|[da, vinci, code, actually, good, movie]                        |
|[thought, da, vinci, code, pretty, good, book]                  |
+----------------------------------------------------------------+
only showing top 10 rows



Everything's converted to lowercase and then breaks into individual words by tokenizer.

In [24]:
refined_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- length: integer (nullable = true)
 |-- review_nopunct: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- refined_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [25]:
# get a count of tokens in each movie
from pyspark.sql.functions import size
refined_df = refined_df.select('*',size('refined_tokens').alias('token_count'))

size counts no. of items in a list.. as we can see the review tokens were in the list []

In [26]:
# display a count of tokens in each movie
refined_df.orderBy(rand()).show(4, False)
refined_df.printSchema()

+----------------------------------------------------------------------+-----+------+-------------------------------------------------------------------+-----------------------------------------------------------------------------------+-------------------------------------------------------+-----------+
|Review                                                                |Label|length|review_nopunct                                                     |tokens                                                                             |refined_tokens                                         |token_count|
+----------------------------------------------------------------------+-----+------+-------------------------------------------------------------------+-----------------------------------------------------------------------------------+-------------------------------------------------------+-----------+
|I love The Da Vinci Code...                                           |1.0  |27  

In [27]:
refined_df.select(['refined_tokens','token_count']).show(5,False)

+----------------------------------------------------------------+-----------+
|refined_tokens                                                  |token_count|
+----------------------------------------------------------------+-----------+
|[da, vinci, code, book, awesome]                                |5          |
|[first, clive, cussler, ive, ever, read, even, books, like, rel]|10         |
|[liked, da, vinci, code, lot]                                   |5          |
|[liked, da, vinci, code, lot]                                   |5          |
|[liked, da, vinci, code, ultimatly, didnt, seem, hold]          |8          |
+----------------------------------------------------------------+-----------+
only showing top 5 rows



review length and token count give us similar kind of info

# Count Vectorisation

This method takes each word in the BoW and counts how many times that word appears in each document. It is basically computing Term Frequency (TF) or the number of times each word occurs in each document.

In [28]:
from pyspark.ml.feature import CountVectorizer

In [29]:
count_vec = CountVectorizer(inputCol='refined_tokens', outputCol='cv_features')
cv_df = count_vec.fit(refined_df).transform(refined_df)

In [30]:
cv_df.select(['refined_tokens','cv_features']).show(10,False)

+----------------------------------------------------------------+----------------------------------------------------------------------------------------+
|refined_tokens                                                  |cv_features                                                                             |
+----------------------------------------------------------------+----------------------------------------------------------------------------------------+
|[da, vinci, code, book, awesome]                                |(1706,[0,1,2,8,174],[1.0,1.0,1.0,1.0,1.0])                                              |
|[first, clive, cussler, ive, ever, read, even, books, like, rel]|(1706,[13,40,41,185,187,208,214,825,871,1590],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[liked, da, vinci, code, lot]                                   |(1706,[0,1,2,44,189],[1.0,1.0,1.0,1.0,1.0])                                             |
|[liked, da, vinci, code, lot]                                  

In [31]:
count_vec.fit(refined_df).vocabulary

['da',
 'vinci',
 'code',
 'harry',
 'brokeback',
 'potter',
 'mountain',
 'love',
 'awesome',
 'mission',
 'impossible',
 '',
 'movie',
 'like',
 'hate',
 'sucked',
 'sucks',
 'much',
 'really',
 'movies',
 'know',
 'suck',
 '3',
 'want',
 'loved',
 'think',
 'one',
 'stupid',
 'depressing',
 'horrible',
 'reading',
 'fucking',
 'oh',
 'terrible',
 'right',
 'left',
 'ok',
 'beautiful',
 'went',
 'saw',
 'read',
 'first',
 '2',
 'dont',
 'liked',
 'absolutely',
 'way',
 'tom',
 'heard',
 'big',
 'time',
 'going',
 'film',
 'boring',
 'said',
 'great',
 'watch',
 'series',
 'people',
 'man',
 'got',
 'watching',
 'cant',
 'b',
 'things',
 'story',
 'last',
 'thats',
 'friday',
 'gay',
 'wait',
 'person',
 'theres',
 'cool',
 'anyone',
 'better',
 'excellent',
 'says',
 'always',
 'rocks',
 'knows',
 'anyway',
 'mom',
 'friends',
 'review',
 'worth',
 'care',
 'opinion',
 'never',
 'p',
 'either',
 'stand',
 'guy',
 'luv',
 'needs',
 'hat',
 'hes',
 'make',
 'snuck',
 'past',
 'soo',
 '

We can see that above LOC sets up a count_vec model to count and then vectorize the data from in the ‘refined_tokens’ column and create a new column named ‘cv_features’ to contain this vector. And applies this transformation model to the refined_df, which contained tokenized documents (after tokenization and stopword removal was performed on the original data). The resulting dataframe, cv_df, now includes a new vector column that shows the count of each word in each document. 

In the above output from cv_features column ; 
(1706,[0,1,2,174,180,181,184],[1.0,1.0,1.0,1.0,1.0,1.0,1.0]) :  Indicates the number of words in BoW or Vocabulary, indicates th eindex positions, indicates the number of times word appears in the document respectively.

In [32]:
model_cv_df = cv_df.select(['cv_features','Label'])

In Feature Engineering, we convert cv_features to vector using vectorassembler and the resulting vector 'cv_features_vec' is used as independant variable in the logistic regression.

In [33]:
from pyspark.ml.feature import VectorAssembler

In [34]:
df_assembler = VectorAssembler(inputCols=['cv_features'],outputCol='cv_features_vec')
model_cv_df = df_assembler.transform(model_cv_df)

In [35]:
model_cv_df.printSchema()

root
 |-- cv_features: vector (nullable = true)
 |-- Label: float (nullable = true)
 |-- cv_features_vec: vector (nullable = true)



In [36]:
#split the data into training and test datasets using 75/25 split
trainingcv_df,testcv_df = model_cv_df.randomSplit([0.75,0.25])

Using groupby in the training and test datasets, we check the distribution of data

In [37]:
trainingcv_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2985|
|  0.0| 2343|
+-----+-----+



In [38]:
testcv_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0|  924|
|  0.0|  738|
+-----+-----+



In [39]:
from pyspark.ml.classification import LogisticRegression
log_reg_cv = LogisticRegression(featuresCol = 'cv_features_vec', labelCol = 'Label').fit(trainingcv_df)

Using evaluation metrics, we train the model and evaluate how well it has learned.

In [40]:
training_summary_cv = log_reg_cv.summary
print("Area Under ROC:" + str(training_summary_cv.areaUnderROC))
print("Weighted Accuracy:" + str(training_summary_cv.accuracy))
print("Weighted Recall:" + str(training_summary_cv.weightedRecall))
print("Weighted Precision:" + str(training_summary_cv.weightedPrecision))
print("Weighted F1 Measure:" + str(training_summary_cv.weightedFMeasure()))

Area Under ROC:0.9999994995606858
Weighted Accuracy:0.9998123123123123
Weighted Recall:0.9998123123123124
Weighted Precision:0.9998123751682023
Weighted F1 Measure:0.9998123079982135


In [41]:
results_cv = log_reg_cv.evaluate(testcv_df).predictions

In [42]:
results_cv.show(10, False)

+----------------------------------+-----+----------------------------------+----------------------------------------+------------------------------------------+----------+
|cv_features                       |Label|cv_features_vec                   |rawPrediction                           |probability                               |prediction|
+----------------------------------+-----+----------------------------------+----------------------------------------+------------------------------------------+----------+
|(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|1.0  |(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|[-22.619734281860328,22.619734281860328]|[1.500977628180061E-10,0.9999999998499023]|1.0       |
|(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|1.0  |(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|[-22.619734281860328,22.619734281860328]|[1.500977628180061E-10,0.9999999998499023]|1.0       |
|(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|1.0  |(1706,[0,1,2,7],[1.0,1.0,1.0,1.0])|[-22.619734281860328,22.619734281860328]|[1.50097762818006

In [43]:
results_cv.select('label', 'prediction','probability').show(20,False)

+-----+----------+------------------------------------------+
|label|prediction|probability                               |
+-----+----------+------------------------------------------+
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1.0       |[1.500977628180061E-10,0.9999999998499023]|
|1.0  |1

In [44]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [45]:
#confusion matrix
true_postives_cv = results_cv[(results_cv.Label == 1) & (results_cv.prediction == 1)].count()
true_negatives_cv = results_cv[(results_cv.Label == 0) & (results_cv.prediction == 0)].count()
false_positives_cv = results_cv[(results_cv.Label == 0) & (results_cv.prediction == 1)].count()
false_negatives_cv = results_cv[(results_cv.Label == 1) & (results_cv.prediction == 0)].count()

In [46]:
recall_cv = float(true_postives_cv)/(true_postives_cv + false_negatives_cv)
print(recall_cv)

0.9902597402597403


In [47]:
print(true_postives_cv, true_negatives_cv)
print(false_positives_cv, false_negatives_cv)

915 715
23 9


In [48]:
precision_cv = float(true_postives_cv) / (true_postives_cv + false_positives_cv)
print(precision_cv)

0.9754797441364605


In [49]:
accuracy_cv = float((true_postives_cv + true_negatives_cv) /(results_cv.count()))
print(accuracy_cv)

0.9807460890493381


In [50]:
F1_score_cv = 2*((precision_cv*recall_cv)/(precision_cv + recall_cv))
F1_score_cv

0.9828141783029002

Precision score of 97% says 97% of the time it is predicting the model well. From the above metrics we can say model is predicting really well. But it also could be biased. 

# Feature Engineering

In [51]:
from pyspark.ml.feature import HashingTF,IDF

In [52]:
hashing_vec = HashingTF(inputCol='refined_tokens',outputCol='tf_features')

In [53]:
hashing_df = hashing_vec.transform(refined_df)

In [54]:
hashing_df.select(['refined_tokens','tf_features']).show(4,False)

+----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+
|refined_tokens                                                  |tf_features                                                                                                            |
+----------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------+
|[da, vinci, code, book, awesome]                                |(262144,[82495,93284,111793,189113,235054],[1.0,1.0,1.0,1.0,1.0])                                                      |
|[first, clive, cussler, ive, ever, read, even, books, like, rel]|(262144,[47372,53570,82111,120246,139559,174966,182843,203802,208258,227467],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[liked, da, vinci, code, lot]                                   

In [55]:
hashing_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- length: integer (nullable = true)
 |-- review_nopunct: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- refined_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- token_count: integer (nullable = false)
 |-- tf_features: vector (nullable = true)



In [56]:
tf_idf_vec = IDF(inputCol = 'tf_features', outputCol = 'tf_idf_features')

In [57]:
tf_idf_df = tf_idf_vec.fit(hashing_df).transform(hashing_df)

In [58]:
tf_idf_df.select(['tf_idf_features']).show(10, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|tf_idf_features                                                                                                                                                                                                                                                    |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(262144,[82495,93284,111793,189113,235054],[1.928750258373558,1.269640397597574,1.2610218398134343,5.045716396741666,1.2620319409094194])                                                                            

In [59]:
tf_idf_df.printSchema()

root
 |-- Review: string (nullable = true)
 |-- Label: float (nullable = true)
 |-- length: integer (nullable = true)
 |-- review_nopunct: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- refined_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- token_count: integer (nullable = false)
 |-- tf_features: vector (nullable = true)
 |-- tf_idf_features: vector (nullable = true)



In [60]:
from pyspark.ml.feature import VectorAssembler

In [61]:
model_text_df = tf_idf_df.select(['tf_idf_features','Label'])

In [62]:
df_assembler = VectorAssembler(inputCols = ['tf_idf_features'],outputCol = 'tf_idf_features_vec')
model_text_df = df_assembler.transform(model_text_df)

In [63]:
model_text_df.printSchema()

root
 |-- tf_idf_features: vector (nullable = true)
 |-- Label: float (nullable = true)
 |-- tf_idf_features_vec: vector (nullable = true)



In [None]:
#model_text_df = model_text_df.select('Label', 'tf_idf_features_vec')

# Select Data for building LogisticRegression model

# Apply Logistic Regression

In [64]:
from pyspark.ml.classification import LogisticRegression

In [65]:
#split the data 
training_df,test_df = model_text_df.randomSplit([0.75,0.25])

In [66]:
training_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 2907|
|  0.0| 2292|
+-----+-----+



In [67]:
test_df.groupBy('Label').count().show()

+-----+-----+
|Label|count|
+-----+-----+
|  1.0| 1002|
|  0.0|  789|
+-----+-----+



In [68]:
log_reg = LogisticRegression(featuresCol = 'tf_idf_features_vec', labelCol = 'Label').fit(training_df)

## Get Training Summary

In [69]:
#Running into P4 java error at this time.. I have the results of training summary in the last cell. 
#Please check that or re-run the below cell 
#this is not any other error, it can't reach java sometimes and sometimes it works on my system. 
#Kindly deal with the below cell.


In [70]:
training_summary = log_reg.summary
print("Area Under ROC:" + str(training_summary.areaUnderROC))
print("Weighted Accuracy:" + str(training_summary.accuracy))
print("Weighted Recall:" + str(training_summary.weightedRecall))
print("Weighted Precision:" + str(training_summary.weightedPrecision))
print("Weighted F1 Measure:" + str(training_summary.weightedFMeasure()))

Area Under ROC:0.9999984991394065
Weighted Accuracy:0.9998076553183305
Weighted Recall:0.9998076553183304
Weighted Precision:0.9998077392017504
Weighted F1 Measure:0.9998076597494032


AUC should ideally be near to 1 which is good.
All others being 1 indicated we have balanced data.. also because of tokenizer, its an overfitted model. It could be biased coz its over fitted

In [71]:
results = log_reg.evaluate(test_df).predictions

This gives prob and prediction values

In [72]:
results.show(10, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------+-------------------------------------------+----------+
|tf_idf_features                                                                                                                                              |Label|tf_idf_features_vec                                                                                                                                          |rawPrediction                           |probability                                |prediction|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-

In [73]:
results.select('label', 'prediction','probability').show(20,False)

+-----+----------+-------------------------------------------+
|label|prediction|probability                                |
+-----+----------+-------------------------------------------+
|1.0  |1.0       |[1.9684438091316214E-10,0.9999999998031557]|
|1.0  |1.0       |[7.094213513244375E-9,0.9999999929057863]  |
|0.0  |1.0       |[0.008651691540607888,0.991348308459392]   |
|0.0  |0.0       |[0.9999999959549424,4.045057498455958E-9]  |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.6041128625411527E-8] |
|0.0  |0.0       |[0.9999999839588714,1.604112862541152

label and prediction being same values implies they are predicting the same

This implies train data and test data are predicting the same and also overfitting

True measure is evaluation matrix

In [74]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [75]:
#confusion matrix
true_postives = results[(results.Label == 1) & (results.prediction == 1)].count()
true_negatives = results[(results.Label == 0) & (results.prediction == 0)].count()
false_positives = results[(results.Label == 0) & (results.prediction == 1)].count()
false_negatives = results[(results.Label == 1) & (results.prediction == 0)].count()

In [76]:
recall = float(true_postives)/(true_postives + false_negatives)
print(recall)

0.9880239520958084


In [77]:
print(true_postives, true_negatives)
print(false_positives, false_negatives)

990 766
23 12


Majority of them are tp and tn
remainig are fp and fn

Which implies reviews are being well differentiated


In [78]:
precision = float(true_postives) / (true_postives + false_positives)
print(precision)

0.9772951628825272


In [79]:
accuracy=float((true_postives+true_negatives) /(results.count()))
print(accuracy)

0.9804578447794529


In [80]:
F1_score = 2*((precision*recall)/(precision + recall))
F1_score

0.9826302729528535

High precision and High accuracy implies the model is trained well and hence predicting well

In [81]:
print("CountVectorizer")
print("recall_cv", recall_cv*100)
print("precision_cv", precision_cv*100)
print("true_postives_cv, true_negatives_cv: ", true_postives_cv, true_negatives_cv)
print("false_positives_cv, false_negatives_cv: ", false_positives_cv, false_negatives_cv)
print("accuracy_cv", accuracy_cv*100)
print('****'*20)
print("Area Under ROC:" + str(training_summary_cv.areaUnderROC))
print("Weighted Accuracy:" + str(training_summary_cv.accuracy))
print("Weighted Recall:" + str(training_summary_cv.weightedRecall))
print("Weighted Precision:" + str(training_summary_cv.weightedPrecision))
print("Weighted F1 Measure:" + str(training_summary_cv.weightedFMeasure()))
print('****'*30)
print("TF-IDF")
print("recall", recall*100)
print("true_postives, true_negatives: ", true_postives, true_negatives)
print("false_positives, false_negatives: ", false_positives, false_negatives)
print("precision", precision*100)
print("accuracy", accuracy*100)
print('****'*20)
print("Area Under ROC:" + str(training_summary.areaUnderROC))
print("Weighted Accuracy:" + str(training_summary.accuracy))
print("Weighted Recall:" + str(training_summary.weightedRecall))
print("Weighted Precision:" + str(training_summary.weightedPrecision))
print("Weighted F1 Measure:" + str(training_summary.weightedFMeasure()))

CountVectorizer
recall_cv 99.02597402597402
precision_cv 97.54797441364606
true_postives_cv, true_negatives_cv:  915 715
false_positives_cv, false_negatives_cv:  23 9
accuracy_cv 98.07460890493381
********************************************************************************
Area Under ROC:0.9999994995606858
Weighted Accuracy:0.9998123123123123
Weighted Recall:0.9998123123123124
Weighted Precision:0.9998123751682023
Weighted F1 Measure:0.9998123079982135
************************************************************************************************************************
TF-IDF
recall 98.80239520958084
true_postives, true_negatives:  990 766
false_positives, false_negatives:  23 12
precision 97.72951628825271
accuracy 98.04578447794529
********************************************************************************
Area Under ROC:0.9999984991394065
Weighted Accuracy:0.9998076553183305
Weighted Recall:0.9998076553183304
Weighted Precision:0.9998077392017504
Weighted F1 Measure:0.999

In general, By using TF-IDF, evaluation metrics have better results than the results from count vectorisation.

From the results we can see that the metrics are almost similar either with tf-idf or countvectoriser.