# Spam SMS Prediction

In [75]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [76]:
# Initialize and create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('spam_sms').getOrCreate()

In [77]:
# Using Spark to read spam SMS data set.
data = spark.read.csv('SMS_Spam_Collection/SMSSpamCollection', inferSchema=True, sep='\t')

In [78]:
# Printing the first few rows of the dataframe
data.show(4)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
+----+--------------------+
only showing top 4 rows



In [79]:
#Giving the header names
data = data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')

In [80]:
data.show(4)

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
+-----+--------------------+
only showing top 4 rows



## Cleaning and preparing the data

*_Creating a new length feature_*

In [81]:
from pyspark.sql.functions import length

In [82]:
data = data.withColumn('length',length(data['text']))

In [83]:
data.show(4)

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
+-----+--------------------+------+
only showing top 4 rows



In [84]:
#Grouping spam and ham sms
data.groupBy('class').mean().show()

+-----+-----------------+
|class|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



From the above observation, we can concluse that if a sms is a spam, then it will be having more length as compared to ham sms.

## Feature Transformations

In [85]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer

tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
remover = StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')

In [86]:
#Converting the categorical data type class (ham-spam) to numerical type
ham_spam_to_num = StringIndexer(inputCol='class', outputCol='label')

In [87]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [88]:
assembler = VectorAssembler(inputCols=['tf_idf','length'], outputCol='features')

## Building the model

#### Using Naive Bayes, Logistic Regression and Random Tree Classifier

In [89]:
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier

In [90]:
nb = NaiveBayes(labelCol='label', featuresCol='features')
lor = LogisticRegression(labelCol='label', featuresCol='features')
rfc = RandomForestClassifier(labelCol='label', featuresCol='features')

###### Building the Pipeline

In [91]:
from pyspark.ml import Pipeline

In [92]:
pipeline_nb = Pipeline(stages=[tokenizer,remover,count_vec,idf,ham_spam_to_num,assembler,nb])
pipeline_lor = Pipeline(stages=[tokenizer,remover,count_vec,idf,ham_spam_to_num,assembler,lor])
pipeline_rfc = Pipeline(stages=[tokenizer,remover,count_vec,idf,ham_spam_to_num,assembler,rfc])

In [93]:
cleaner_nb = pipeline_nb.fit(data)
cleaner_lor = pipeline_lor.fit(data)
cleaner_rfc = pipeline_rfc.fit(data)

In [94]:
clean_data_nb = cleaner_nb.transform(data)
clean_data_lor = cleaner_lor.transform(data)
clean_data_rfc = cleaner_rfc.transform(data)

In [95]:
clean_data_nb.head()

Row(class='ham', text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', length=111, token_text=['go', 'until', 'jurong', 'point,', 'crazy..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'there', 'got', 'amore', 'wat...'], stop_tokens=['go', 'jurong', 'point,', 'crazy..', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'got', 'amore', 'wat...'], c_vec=SparseVector(13423, {7: 1.0, 11: 1.0, 31: 1.0, 61: 1.0, 72: 1.0, 344: 1.0, 625: 1.0, 731: 1.0, 1409: 1.0, 1598: 1.0, 4485: 1.0, 6440: 1.0, 8092: 1.0, 8838: 1.0, 11344: 1.0, 12979: 1.0}), tf_idf=SparseVector(13423, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4.2072, 72: 4.322, 344: 5.4072, 625: 5.918, 731: 6.1411, 1409: 6.6801, 1598: 6.8343, 4485: 7.5274, 6440: 7.9329, 8092: 7.9329, 8838: 7.9329, 11344: 7.9329, 12979: 7.9329}), label=0.0, features=SparseVector(13424, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4

In [96]:
clean_data_lor.head()

Row(class='ham', text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', length=111, token_text=['go', 'until', 'jurong', 'point,', 'crazy..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'there', 'got', 'amore', 'wat...'], stop_tokens=['go', 'jurong', 'point,', 'crazy..', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'got', 'amore', 'wat...'], c_vec=SparseVector(13423, {7: 1.0, 11: 1.0, 31: 1.0, 61: 1.0, 72: 1.0, 344: 1.0, 625: 1.0, 731: 1.0, 1409: 1.0, 1598: 1.0, 4485: 1.0, 6440: 1.0, 8092: 1.0, 8838: 1.0, 11344: 1.0, 12979: 1.0}), tf_idf=SparseVector(13423, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4.2072, 72: 4.322, 344: 5.4072, 625: 5.918, 731: 6.1411, 1409: 6.6801, 1598: 6.8343, 4485: 7.5274, 6440: 7.9329, 8092: 7.9329, 8838: 7.9329, 11344: 7.9329, 12979: 7.9329}), label=0.0, features=SparseVector(13424, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4

In [97]:
clean_data_rfc.head()

Row(class='ham', text='Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', length=111, token_text=['go', 'until', 'jurong', 'point,', 'crazy..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'there', 'got', 'amore', 'wat...'], stop_tokens=['go', 'jurong', 'point,', 'crazy..', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet...', 'cine', 'got', 'amore', 'wat...'], c_vec=SparseVector(13423, {7: 1.0, 11: 1.0, 31: 1.0, 61: 1.0, 72: 1.0, 344: 1.0, 625: 1.0, 731: 1.0, 1409: 1.0, 1598: 1.0, 4485: 1.0, 6440: 1.0, 8092: 1.0, 8838: 1.0, 11344: 1.0, 12979: 1.0}), tf_idf=SparseVector(13423, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4.2072, 72: 4.322, 344: 5.4072, 625: 5.918, 731: 6.1411, 1409: 6.6801, 1598: 6.8343, 4485: 7.5274, 6440: 7.9329, 8092: 7.9329, 8838: 7.9329, 11344: 7.9329, 12979: 7.9329}), label=0.0, features=SparseVector(13424, {7: 3.1126, 11: 3.2055, 31: 3.822, 61: 4

##### Training and Evaluation!

In [98]:
clean_data_nb = clean_data_nb.select('label','features')
clean_data_lor = clean_data_lor.select('label','features')
clean_data_rfc = clean_data_rfc.select('label','features')

In [99]:
clean_data_nb.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
+-----+--------------------+
only showing top 3 rows



In [100]:
clean_data_lor.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
+-----+--------------------+
only showing top 3 rows



In [101]:
clean_data_rfc.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
+-----+--------------------+
only showing top 3 rows



In [102]:
train_nb,test_nb = clean_data_nb.randomSplit([0.7,0.3], seed=12345)
train_lor,test_lor = clean_data_2.randomSplit([0.7,0.3], seed=12345)
train_rfc,test_rfc = clean_data_2.randomSplit([0.7,0.3], seed=12345)

In [103]:
spam_predictor_nb = nb.fit(train_nb)
spam_predictor_lor = lor.fit(train_lor)
spam_predictor_rfc = rfc.fit(train_rfc)

In [104]:
test_results_nb = spam_predictor_nb.transform(test_nb)
test_results_lor = spam_predictor_lor.transform(test_lor)
test_results_rfc = spam_predictor_rfc.transform(test_rfc)

In [105]:
test_results_nb.show(4)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,4,50,...|[-819.92914430065...|[1.0,9.2949203991...|       0.0|
|  0.0|(13424,[0,1,5,15,...|[-997.90523073055...|[1.0,1.6286640272...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-874.03499580029...|[1.0,1.8162872740...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1148.1854854661...|[1.0,3.3656532679...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 4 rows



In [106]:
test_results_lor.show(4)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,4,50,...|[39.9998544815758...|[1.0,4.2489725140...|       0.0|
|  0.0|(13424,[0,1,5,15,...|[48.0074481259798...|[1.0,1.4145887133...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[29.8269701451849...|[0.99999999999988...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[44.5525676760709...|[1.0,4.4778104307...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 4 rows



In [107]:
test_results_rfc.show(4)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,4,50,...|[16.5628813339442...|[0.82814406669721...|       0.0|
|  0.0|(13424,[0,1,5,15,...|[16.7247635623985...|[0.83623817811992...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[16.3189019594733...|[0.81594509797366...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[16.4845134432068...|[0.82422567216034...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 4 rows



In [108]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [109]:
acc_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction')

In [112]:
acc_nb = acc_eval.evaluate(test_results_nb)
print("Accuracy of Naive Bayes Model at predictiong spam was: {}".format(acc_nb))
print('-'*90)

acc_lor = acc_eval.evaluate(test_results_lor)
print("Accuracy of Logistic Regression Model at predictiong spam was: {}".format(acc_lor))
print('-'*90)

acc_rfc = acc_eval.evaluate(test_results_rfc)
print("Accuracy of Random Forest Classifier Model at predictiong spam was: {}".format(acc_rfc))
print('-'*90)

Accuracy of Naive Bayes Model at predictiong spam was: 0.9246423704291925
------------------------------------------------------------------------------------------
Accuracy of Logistic Regression Model at predictiong spam was: 0.9636361033474314
------------------------------------------------------------------------------------------
Accuracy of Random Forest Classifier Model at predictiong spam was: 0.8206673337533509
------------------------------------------------------------------------------------------


In [None]:
#Closing the spark session
spark.stop()