#Bulding a spam detection filter

###Initializing a SparkSession

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('NLP').getOrCreate()

###Loading the dataset

In [0]:
df = spark.read.csv('dbfs:/FileStore/shared_uploads/purvajainpj123@gmail.com/13_SMSSpamCollection.csv', inferSchema=True, sep='\t')
df = df.withColumnRenamed('_c0', 'class').withColumnRenamed('_c1', 'text')
df.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



###Formatting the data

####Creating length column (feature engineering)

In [0]:
from pyspark.sql.functions import length
df = df.withColumn('length', length(df['text']))
df.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



####Visualizing length col

In [0]:
df.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham| 71.4545266210897|
| spam|138.6706827309237|
+-----+-----------------+



###Formatting the text columning

In [0]:
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler

###Label encoding
ham_spam_to_numeric = StringIndexer(inputCol='class', outputCol='label')

###Using tokenizer (splitting the document list)
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')

###Removing irrelevant words
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='token_stop')

###Counting the words
count_vec = CountVectorizer(inputCol='token_stop', outputCol='count_vec')

###Getting TF-IDF
idf = IDF(inputCol='count_vec', outputCol='tf-idf')

###Getting transformed df
transformed_df = VectorAssembler(inputCols=['length', 'tf-idf'], outputCol='features')


###Building a pipeline

In [0]:
from pyspark.ml import Pipeline
df_pipe = Pipeline(stages=[ham_spam_to_numeric,
                           tokenizer,
                           stop_remove,
                           count_vec,
                           idf,
                           transformed_df])
final_df = df_pipe.fit(df).transform(df).select('label', 'features')
final_df.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[0,8,12,32...|
|  0.0|(13424,[0,1,25,30...|
|  1.0|(13424,[0,3,14,20...|
|  0.0|(13424,[0,1,71,81...|
|  0.0|(13424,[0,37,135,...|
|  1.0|(13424,[0,11,61,1...|
|  0.0|(13424,[0,11,54,1...|
|  0.0|(13424,[0,128,186...|
|  1.0|(13424,[0,2,48,12...|
|  1.0|(13424,[0,1,2,14,...|
|  0.0|(13424,[0,19,44,1...|
|  1.0|(13424,[0,9,17,38...|
|  1.0|(13424,[0,14,31,4...|
|  0.0|(13424,[0,40,96,2...|
|  0.0|(13424,[0,556,179...|
|  1.0|(13424,[0,31,110,...|
|  0.0|(13424,[0,83,215,...|
|  0.0|(13424,[0,1,3,50,...|
|  0.0|(13424,[0,1,75,10...|
|  1.0|(13424,[0,5,31,34...|
+-----+--------------------+
only showing top 20 rows



###Machine Learning

####Splitting the dataset

In [0]:
train_data, test_data = final_df.randomSplit([0.7, 0.3])

####Building the ML model

In [0]:
from pyspark.ml.classification import NaiveBayes
classifier = NaiveBayes(featuresCol='features', labelCol='label', predictionCol='prediction')
fittied_classifer = classifier.fit(train_data)

####Predicting the test data

In [0]:
preds = fittied_classifer.transform(test_data)
preds.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,2,5,5...|[-830.33233742900...|[1.0,1.0578401186...|       0.0|
|  0.0|(13424,[0,1,2,10,...|[-538.01890104709...|[1.0,1.0598599280...|       0.0|
|  0.0|(13424,[0,1,2,12,...|[-867.34902713517...|[1.0,4.3261595213...|       0.0|
|  0.0|(13424,[0,1,2,15,...|[-1371.4088783393...|[1.0,5.0170633556...|       0.0|
|  0.0|(13424,[0,1,2,17,...|[-821.60250392482...|[1.0,2.2181450481...|       0.0|
|  0.0|(13424,[0,1,2,21,...|[-963.44567991420...|[1.0,2.1040008876...|       0.0|
|  0.0|(13424,[0,1,2,22,...|[-748.04862096011...|[1.0,2.9704117247...|       0.0|
|  0.0|(13424,[0,1,2,28,...|[-1528.9432108873...|[0.98006837223901...|       0.0|
|  0.0|(13424,[0,1,2,31,...|[-596.17272302429...|[1.0,2.1567576661...|       0.0|
|  0.0|(13424,[0

####Evaluating the model

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label')
area_under_curve = evaluator.evaluate(preds)
accuracy = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='label')
accuracy = accuracy.evaluate(preds)
print(area_under_curve)
print(accuracy)

0.9470336491436406
0.9209601873536299
