## Importing the relevant libraries

In [50]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF, StopWordsRemover, VectorAssembler, StringIndexer, RegexTokenizer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, DecisionTreeClassifier, LogisticRegression, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC, FMClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.pipeline import Pipeline
from pyspark.sql.functions import length, col, count_distinct
import gc
import pandas as pd

## Initiating a Spark session

In [2]:
spark = SparkSession.builder.appName('sms_spam_detection').getOrCreate()
spark

## Loading the SMS spam dataset

In [3]:
df = spark.read.csv('smsspamcollection/SMSSpamCollection',inferSchema=True,sep='\t')
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



## Feature Engineering

In [4]:
df.count()

5574

In [5]:
df.show(5)

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
+----+--------------------+
only showing top 5 rows



In [6]:
df = df.withColumnRenamed('_c0','target').withColumnRenamed('_c1','text')
df.show(5)

+------+--------------------+
|target|                text|
+------+--------------------+
|   ham|Go until jurong p...|
|   ham|Ok lar... Joking ...|
|  spam|Free entry in 2 a...|
|   ham|U dun say so earl...|
|   ham|Nah I don't think...|
+------+--------------------+
only showing top 5 rows



In [7]:
df = df.withColumn('length',length(col('text')))
df.show(5)

+------+--------------------+------+
|target|                text|length|
+------+--------------------+------+
|   ham|Go until jurong p...|   111|
|   ham|Ok lar... Joking ...|    29|
|  spam|Free entry in 2 a...|   155|
|   ham|U dun say so earl...|    49|
|   ham|Nah I don't think...|    61|
+------+--------------------+------+
only showing top 5 rows



In [8]:
df.groupBy('target').count().show()

+------+-----+
|target|count|
+------+-----+
|   ham| 4827|
|  spam|  747|
+------+-----+



In [9]:
df.groupBy('target').mean().show()

+------+-----------------+
|target|      avg(length)|
+------+-----------------+
|   ham|71.45431945307645|
|  spam|138.6706827309237|
+------+-----------------+



In [10]:
tokenizer = RegexTokenizer(inputCol='text',outputCol='tokens',pattern='\\W')
stopwords_remover = StopWordsRemover(inputCol='tokens',outputCol='filtered_tokens')
count_vect = CountVectorizer(inputCol='filtered_tokens',outputCol='vectorized_tokens')
idf = IDF(inputCol='vectorized_tokens',outputCol='tf_idf')
assembler = VectorAssembler(inputCols=['length','tf_idf'],outputCol='features')
indexer = StringIndexer(inputCol='target',outputCol='label')

In [11]:
nb = NaiveBayes()

In [12]:
pipeline = Pipeline(stages=[indexer,tokenizer,stopwords_remover,count_vect,idf,assembler])
pipeline

Pipeline_eeda5428d848

In [13]:
data_prep_pipe = pipeline.fit(df)
data_prep_pipe

PipelineModel_f35dd0773d98

In [14]:
transformed_df = data_prep_pipe.transform(df)
transformed_df.show(5)

+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|target|                text|length|label|              tokens|     filtered_tokens|   vectorized_tokens|              tf_idf|            features|
+------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|   ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(8623,[11,16,37,6...|(8623,[11,16,37,6...|(8624,[0,12,17,38...|
|   ham|Ok lar... Joking ...|    29|  0.0|[ok, lar, joking,...|[ok, lar, joking,...|(8623,[0,9,247,37...|(8623,[0,9,247,37...|(8624,[0,1,10,248...|
|  spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(8623,[2,10,23,24...|(8623,[2,10,23,24...|(8624,[0,3,11,24,...|
|   ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(8623,[0,56,81,85...|(8623,[

In [15]:
transformed_df.columns

['target',
 'text',
 'length',
 'label',
 'tokens',
 'filtered_tokens',
 'vectorized_tokens',
 'tf_idf',
 'features']

In [16]:
transformed_df = transformed_df.select('label','features')
transformed_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(8624,[0,12,17,38...|
|  0.0|(8624,[0,1,10,248...|
|  1.0|(8624,[0,3,11,24,...|
|  0.0|(8624,[0,1,57,82,...|
|  0.0|(8624,[0,53,138,3...|
+-----+--------------------+
only showing top 5 rows



## Splitting transformed data into train and test sets

In [17]:
train_df, test_df = transformed_df.randomSplit([0.7,0.3])

In [18]:
train_df.count(), test_df.count()

(3882, 1692)

In [19]:
train_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|    (8624,[0],[7.0])|
|  0.0|    (8624,[0],[7.0])|
|  0.0|   (8624,[0],[27.0])|
|  0.0|(8624,[0,1],[8.0,...|
|  0.0|(8624,[0,1],[12.0...|
+-----+--------------------+
only showing top 5 rows



In [20]:
test_df.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|    (8624,[0],[3.0])|
|  0.0|   (8624,[0],[10.0])|
|  0.0|   (8624,[0],[24.0])|
|  0.0|   (8624,[0],[27.0])|
|  0.0|(8624,[0,1,2,4,19...|
+-----+--------------------+
only showing top 5 rows



In [21]:
nb_model = NaiveBayes().fit(train_df)
nb_model

NaiveBayesModel: uid=NaiveBayes_dcaeaadd5e39, modelType=multinomial, numClasses=2, numFeatures=8624

In [22]:
test_results = nb_model.transform(test_df)
test_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|    (8624,[0],[3.0])|[-1.6471524253324...|[0.89472073661644...|       0.0|
|  0.0|   (8624,[0],[10.0])|[-5.1453959403931...|[0.94522751789885...|       0.0|
|  0.0|   (8624,[0],[24.0])|[-12.141882970514...|[0.98614179778159...|       0.0|
|  0.0|   (8624,[0],[27.0])|[-13.641130191254...|[0.98973297891914...|       0.0|
|  0.0|(8624,[0,1,2,4,19...|[-611.13050409227...|[1.0,3.9870607150...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [23]:
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
f1_eval = MulticlassClassificationEvaluator(metricName='f1')
precision_eval = MulticlassClassificationEvaluator(metricName='weightedPrecision')
recall_eval = MulticlassClassificationEvaluator(metricName='weightedRecall')
roc_auc_eval = BinaryClassificationEvaluator(metricName='areaUnderROC')

In [24]:
acc = acc_eval.evaluate(test_results)
f1 = f1_eval.evaluate(test_results)
precision = precision_eval.evaluate(test_results)
recall = recall_eval.evaluate(test_results)
roc_auc = roc_auc_eval.evaluate(test_results)

In [25]:
print("Accuracy of Naive Bayes:",acc)
print("Precision of Naive Bayes:",precision)
print("Recall of Naive Bayes:",recall)
print("ROC AUC Score of Naive Bayes:",roc_auc)
print("F1 Score of Naive Bayes:",f1)

Accuracy of Naive Bayes: 0.9598108747044918
Precision of Naive Bayes: 0.9658926063018967
Recall of Naive Bayes: 0.9598108747044917
ROC AUC Score of Naive Bayes: 0.1712152062018643
F1 Score of Naive Bayes: 0.9615055324909287


## Model Training & Evaluation

In [55]:
model_names = []
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
roc_auc_scores = []

In [56]:
def train_and_evaluate_model(model):
    clf = model.fit(train_df)
    test_results = clf.transform(test_df)
    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    f1_eval = MulticlassClassificationEvaluator(metricName='f1')
    precision_eval = MulticlassClassificationEvaluator(metricName='weightedPrecision')
    recall_eval = MulticlassClassificationEvaluator(metricName='weightedRecall')
    roc_auc_eval = BinaryClassificationEvaluator(metricName='areaUnderROC')
    acc = acc_eval.evaluate(test_results)
    f1 = f1_eval.evaluate(test_results)
    precision = precision_eval.evaluate(test_results)
    recall = recall_eval.evaluate(test_results)
    roc_auc = roc_auc_eval.evaluate(test_results)
    model_name = str(model).split('(')[0]
    print(f"Accuracy of {model_name}:",acc)
    print(f"Precision of {model_name}:",precision)
    print(f"Recall of {model_name}:",recall)
    print(f"ROC AUC Score of {model_name}:",roc_auc)
    print(f"F1 Score of {model_name}:",f1)
    model_names.append(model)
    accuracy_scores.append(acc)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    roc_auc_scores.append(roc_auc)
    del acc, f1, precision, recall, roc_auc, acc_eval, precision_eval, recall_eval, f1_eval, roc_auc_eval
    gc.collect()

In [57]:
train_and_evaluate_model(LogisticRegression())

Accuracy of LogisticRegression_72f09ac2241f: 0.9810874704491725
Precision of LogisticRegression_72f09ac2241f: 0.9809270524852111
Recall of LogisticRegression_72f09ac2241f: 0.9810874704491725
ROC AUC Score of LogisticRegression_72f09ac2241f: 0.9814095835493843
F1 Score of LogisticRegression_72f09ac2241f: 0.9806874782247222


In [58]:
train_and_evaluate_model(NaiveBayes())

Accuracy of NaiveBayes_ef524dd0597d: 0.9598108747044918
Precision of NaiveBayes_ef524dd0597d: 0.9658926063018967
Recall of NaiveBayes_ef524dd0597d: 0.9598108747044917
ROC AUC Score of NaiveBayes_ef524dd0597d: 0.1712152062018643
F1 Score of NaiveBayes_ef524dd0597d: 0.9615055324909287


In [59]:
train_and_evaluate_model(LinearSVC())

Accuracy of LinearSVC_998131734ff5: 0.9816784869976359
Precision of LinearSVC_998131734ff5: 0.9814915869906106
Recall of LinearSVC_998131734ff5: 0.9816784869976359
ROC AUC Score of LinearSVC_998131734ff5: 0.9900914344972407
F1 Score of LinearSVC_998131734ff5: 0.9815478123979356


In [60]:
train_and_evaluate_model(DecisionTreeClassifier())

Accuracy of DecisionTreeClassifier_0597e20c012b: 0.9373522458628841
Precision of DecisionTreeClassifier_0597e20c012b: 0.9348901266289947
Recall of DecisionTreeClassifier_0597e20c012b: 0.9373522458628841
ROC AUC Score of DecisionTreeClassifier_0597e20c012b: 0.5570706815221378
F1 Score of DecisionTreeClassifier_0597e20c012b: 0.9315693223367296


In [61]:
train_and_evaluate_model(RandomForestClassifier())

Accuracy of RandomForestClassifier_78ee4567735d: 0.8764775413711584
Precision of RandomForestClassifier_78ee4567735d: 0.8917805608456266
Recall of RandomForestClassifier_78ee4567735d: 0.8764775413711584
ROC AUC Score of RandomForestClassifier_78ee4567735d: 0.9450918771261999
F1 Score of RandomForestClassifier_78ee4567735d: 0.821614847168587


In [62]:
train_and_evaluate_model(GBTClassifier())

Accuracy of GBTClassifier_d2f310e9795a: 0.9515366430260047
Precision of GBTClassifier_d2f310e9795a: 0.949930045557557
Recall of GBTClassifier_d2f310e9795a: 0.9515366430260047
ROC AUC Score of GBTClassifier_d2f310e9795a: 0.960438455604315
F1 Score of GBTClassifier_d2f310e9795a: 0.9487344497466098


In [63]:
train_and_evaluate_model(FMClassifier())

Accuracy of FMClassifier_afe3053b6f5c: 0.966903073286052
Precision of FMClassifier_afe3053b6f5c: 0.9661061897287664
Recall of FMClassifier_afe3053b6f5c: 0.9669030732860521
ROC AUC Score of FMClassifier_afe3053b6f5c: 0.8940472727732621
F1 Score of FMClassifier_afe3053b6f5c: 0.9662030868932642


## Baseline Models Performance Comparison

In [64]:
model_perfs = pd.DataFrame({'Model': model_names, 
                            'Accuracy': accuracy_scores, 
                            'Precision': precision_scores,
                            'Recall': recall_scores,
                            'F1': f1_scores,
                            'ROC AUC': roc_auc_scores}).sort_values(by='F1',ascending=False).reset_index(drop=True)
model_perfs

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC
0,LinearSVC_998131734ff5,0.981678,0.981492,0.981678,0.981548,0.990091
1,LogisticRegression_72f09ac2241f,0.981087,0.980927,0.981087,0.980687,0.98141
2,FMClassifier_afe3053b6f5c,0.966903,0.966106,0.966903,0.966203,0.894047
3,NaiveBayes_ef524dd0597d,0.959811,0.965893,0.959811,0.961506,0.171215
4,GBTClassifier_d2f310e9795a,0.951537,0.94993,0.951537,0.948734,0.960438
5,DecisionTreeClassifier_0597e20c012b,0.937352,0.93489,0.937352,0.931569,0.557071
6,RandomForestClassifier_78ee4567735d,0.876478,0.891781,0.876478,0.821615,0.945092


In [65]:
best_model = model_perfs.iloc[0]['Model']
best_model

LinearSVC_998131734ff5

The Linear SVC model has emerged as the best performing spam detection classifier achieving an incredible accuracy of more than 98% on the test set.

In [75]:
best_model_pipeline = Pipeline(stages=[tokenizer,stopwords_remover,count_vect,idf,assembler,best_model])
best_model_pipeline

Pipeline_07d605f8a16b

In [71]:
type(best_model)

pyspark.ml.classification.LinearSVC

In [72]:
type(best_model_pipeline)

pyspark.ml.pipeline.Pipeline

In [77]:
df.orderBy(rand()).first()['text']

'Happy or sad , one thing about past is- "Its no more" GOOD MORNING :-):-).'

## Saving the best model pipeline for deployment

In [79]:
best_model_pipeline.save('sms_spam_detector/')