# Import Libraries

In [1]:
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, NaiveBayes
)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import (
    StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, VectorAssembler
)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql.functions import split

# Create Spark Session

In [2]:
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName("TP_Part_One")
    .getOrCreate()
)

# Load Dataset from the file

In [3]:
# Read the raw file as plain text: the result has a single string column
# named `value`, with label and message still joined by a tab.
data = spark.read.text(
    "smsspamcollection/SMSSpamCollection",
)

In [4]:
# Peek at the raw lines (first 20 rows, long cells truncated).
data.show(n=20, truncate=True)

+--------------------+
|               value|
+--------------------+
|ham\tGo until jur...|
|ham\tOk lar... Jo...|
|spam\tFree entry ...|
|ham\tU dun say so...|
|ham\tNah I don't ...|
|spam\tFreeMsg Hey...|
|ham\tEven my brot...|
|ham\tAs per your ...|
|spam\tWINNER!! As...|
|spam\tHad your mo...|
|ham\tI'm gonna be...|
|spam\tSIX chances...|
|spam\tURGENT! You...|
|ham\tI've been se...|
|ham\tI HAVE A DAT...|
|spam\tXXXMobileMo...|
|ham\tOh k...i'm w...|
|ham\tEh u remembe...|
|ham\tFine if that...|
|spam\tEngland v M...|
+--------------------+
only showing top 20 rows



# Split value in two columns

In [5]:
# Split each raw line on the tab: field 0 is the label, field 1 the message.
# Selecting only the two derived columns leaves the original `value` behind.
data = data.select(
    split(data["value"], "\t").getItem(0).alias("label"),
    split(data["value"], "\t").getItem(1).alias("message"),
)

In [6]:
# Check that every row now has a separate label and message.
data.show(n=20, truncate=True)

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



# Convert labels into numeric values

In [7]:
# Encode the string label as a numeric `labelIndex` column for the classifiers
# (in the recorded output, ham maps to 0.0 and spam to 1.0).
indexer = StringIndexer(
    inputCol="label",
    outputCol="labelIndex",
)
data = indexer.fit(dataset=data).transform(dataset=data)

In [8]:
# Confirm the numeric label column was added next to the original label.
data.show(n=20, truncate=True)

+-----+--------------------+----------+
|label|             message|labelIndex|
+-----+--------------------+----------+
|  ham|Go until jurong p...|       0.0|
|  ham|Ok lar... Joking ...|       0.0|
| spam|Free entry in 2 a...|       1.0|
|  ham|U dun say so earl...|       0.0|
|  ham|Nah I don't think...|       0.0|
| spam|FreeMsg Hey there...|       1.0|
|  ham|Even my brother i...|       0.0|
|  ham|As per your reque...|       0.0|
| spam|WINNER!! As a val...|       1.0|
| spam|Had your mobile 1...|       1.0|
|  ham|I'm gonna be home...|       0.0|
| spam|SIX chances to wi...|       1.0|
| spam|URGENT! You have ...|       1.0|
|  ham|I've been searchi...|       0.0|
|  ham|I HAVE A DATE ON ...|       0.0|
| spam|XXXMobileMovieClu...|       1.0|
|  ham|Oh k...i'm watchi...|       0.0|
|  ham|Eh u remember how...|       0.0|
|  ham|Fine if thats th...|       0.0|
| spam|England v Macedon...|       1.0|
+-----+--------------------+----------+
only showing top 20 rows



# Tokenize text messages

In [9]:
# Break each message into a list of lower-cased tokens. Punctuation stays
# attached to the tokens (e.g. `lar...`, `winner!!` in the recorded output).
tokenizer = Tokenizer(
    inputCol="message",
    outputCol="words",
)
data = tokenizer.transform(dataset=data)

In [10]:
# Inspect the token lists produced for each message.
data.show(n=20, truncate=True)

+-----+--------------------+----------+--------------------+
|label|             message|labelIndex|               words|
+-----+--------------------+----------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|
| spam|Free entry in 2 a...|       1.0|[free, entry, in,...|
|  ham|U dun say so earl...|       0.0|[u, dun, say, so,...|
|  ham|Nah I don't think...|       0.0|[nah, i, don't, t...|
| spam|FreeMsg Hey there...|       1.0|[freemsg, hey, th...|
|  ham|Even my brother i...|       0.0|[even, my, brothe...|
|  ham|As per your reque...|       0.0|[as, per, your, r...|
| spam|WINNER!! As a val...|       1.0|[winner!!, as, a,...|
| spam|Had your mobile 1...|       1.0|[had, your, mobil...|
|  ham|I'm gonna be home...|       0.0|[i'm, gonna, be, ...|
| spam|SIX chances to wi...|       1.0|[six, chances, to...|
| spam|URGENT! You have ...|       1.0|[urgent!, you, ha...|
|  ham|I've been searchi

# Remove stop words

In [11]:
# Drop stop words from the token lists; no custom list is passed, so the
# remover's default stop-word list is used.
stop_words_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered_words",
)
data = stop_words_remover.transform(dataset=data)

In [12]:
# Compare `words` with `filtered_words` to see what was removed.
data.show(n=20, truncate=True)

+-----+--------------------+----------+--------------------+--------------------+
|label|             message|labelIndex|               words|      filtered_words|
+-----+--------------------+----------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|
| spam|Free entry in 2 a...|       1.0|[free, entry, in,...|[free, entry, 2, ...|
|  ham|U dun say so earl...|       0.0|[u, dun, say, so,...|[u, dun, say, ear...|
|  ham|Nah I don't think...|       0.0|[nah, i, don't, t...|[nah, think, goes...|
| spam|FreeMsg Hey there...|       1.0|[freemsg, hey, th...|[freemsg, hey, da...|
|  ham|Even my brother i...|       0.0|[even, my, brothe...|[even, brother, l...|
|  ham|As per your reque...|       0.0|[as, per, your, r...|[per, request, 'm...|
| spam|WINNER!! As a val...|       1.0|[winner!!, as, a,...|[winner!!, valued...|
| spam|Had your 

# Create Term frequency vectors

In [13]:
# Learn a vocabulary from the filtered tokens and turn each message into a
# sparse term-count vector (`rawFeatures`).
# NOTE(review): the vocabulary is fitted on the full dataset, i.e. before the
# train/test split further down, so test messages contribute to it.
cv = CountVectorizer(
    inputCol="filtered_words",
    outputCol="rawFeatures",
)
cv_model = cv.fit(dataset=data)
data = cv_model.transform(dataset=data)

In [14]:
# Inspect the sparse term-count vectors in `rawFeatures`.
data.show(n=20, truncate=True)

+-----+--------------------+----------+--------------------+--------------------+--------------------+
|label|             message|labelIndex|               words|      filtered_words|         rawFeatures|
+-----+--------------------+----------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|(13464,[7,11,31,6...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13464,[0,24,300,...|
| spam|Free entry in 2 a...|       1.0|[free, entry, in,...|[free, entry, 2, ...|(13464,[2,13,19,3...|
|  ham|U dun say so earl...|       0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13464,[0,69,80,1...|
|  ham|Nah I don't think...|       0.0|[nah, i, don't, t...|[nah, think, goes...|(13464,[36,134,31...|
| spam|FreeMsg Hey there...|       1.0|[freemsg, hey, th...|[freemsg, hey, da...|(13464,[10,67,140...|
|  ham|Even my brother i...|       0.0|[even, my, brothe...|[even, brothe

# Scale the term frequencies

In [15]:
# Re-weight the raw term counts by inverse document frequency.
# NOTE(review): the IDF weights are fitted on the full dataset, i.e. before
# the train/test split further down, so test messages influence them.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="scaledFeatures",
)
idf_model = idf.fit(dataset=data)
data = idf_model.transform(dataset=data)

In [16]:
# Inspect the IDF-scaled vectors next to the raw counts.
data.show(n=20, truncate=True)

+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|label|             message|labelIndex|               words|      filtered_words|         rawFeatures|      scaledFeatures|
+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|(13464,[7,11,31,6...|(13464,[7,11,31,6...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13464,[0,24,300,...|(13464,[0,24,300,...|
| spam|Free entry in 2 a...|       1.0|[free, entry, in,...|[free, entry, 2, ...|(13464,[2,13,19,3...|(13464,[2,13,19,3...|
|  ham|U dun say so earl...|       0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13464,[0,69,80,1...|(13464,[0,69,80,1...|
|  ham|Nah I don't think...|       0.0|[nah, i, don't, t...|[nah, think, goes...|(13464,[36,134,31...|(13464,[36,134,31...|
| spam|F

# Combine features

In [17]:
# Assemble the model input vector. Only one input column is listed, so
# `features` is built from `scaledFeatures` alone.
assembler = VectorAssembler(
    inputCols=["scaledFeatures"],
    outputCol="features",
)
data = assembler.transform(dataset=data)

In [21]:
# Final look at the fully prepared frame, including the `features` column.
data.show(n=20, truncate=True)

+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             message|labelIndex|               words|      filtered_words|         rawFeatures|      scaledFeatures|            features|
+-----+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|       0.0|[go, until, juron...|[go, jurong, poin...|(13464,[7,11,31,6...|(13464,[7,11,31,6...|(13464,[7,11,31,6...|
|  ham|Ok lar... Joking ...|       0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13464,[0,24,300,...|(13464,[0,24,300,...|(13464,[0,24,300,...|
| spam|Free entry in 2 a...|       1.0|[free, entry, in,...|[free, entry, 2, ...|(13464,[2,13,19,3...|(13464,[2,13,19,3...|(13464,[2,13,19,3...|
|  ham|U dun say so earl...|       0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13464,[0,69,80,1...|(13464,[0,69,80,1...|(13464,

# Data Split

In [25]:
# 80/20 random split with a fixed seed; each part keeps only the numeric
# label and the feature vector that the classifiers consume.
training_data, test_data = (
    part.select("labelIndex", "features")
    for part in data.randomSplit([0.8, 0.2], seed=42)
)

In [26]:
# Report how many rows landed in each split (one line per split).
print(
    f"Training data count: {training_data.count()}",
    f"Test data count: {test_data.count()}",
    sep="\n",
)

Training data count: 4503
Test data count: 1071


# Logistic Regression

In [30]:
# Logistic regression; only the column names are set, everything else is default.
lr = LogisticRegression(
    labelCol="labelIndex",
    featuresCol="features",
)

In [31]:
# Fit the logistic regression on the training split.
lr_model = lr.fit(dataset=training_data)

In [32]:
# Score the held-out test split with the fitted logistic regression.
lr_predictions = lr_model.transform(dataset=test_data)

# Decision Tree Classifier

In [37]:
# Decision tree; only the column names are set, everything else is default.
dt = DecisionTreeClassifier(
    labelCol="labelIndex",
    featuresCol="features",
)

In [38]:
# Fit the decision tree on the training split.
dt_model = dt.fit(dataset=training_data)

In [39]:
# Score the held-out test split with the fitted decision tree.
dt_predictions = dt_model.transform(dataset=test_data)

# Random Forest Classifier

In [40]:
# Random forest with default settings; this same estimator instance is later
# handed to the cross-validation cell for tuning.
rf = RandomForestClassifier(
    labelCol="labelIndex",
    featuresCol="features",
)

In [41]:
# Fit the (untuned) random forest on the training split.
rf_model = rf.fit(dataset=training_data)

In [42]:
# Score the held-out test split with the fitted random forest.
rf_predictions = rf_model.transform(dataset=test_data)

# Naive Bayes

In [43]:
# Naive Bayes; only the column names are set, everything else is default.
nb = NaiveBayes(
    labelCol="labelIndex",
    featuresCol="features",
)

In [44]:
# Fit Naive Bayes on the training split.
nb_model = nb.fit(dataset=training_data)

In [45]:
# Score the held-out test split with the fitted Naive Bayes model.
nb_predictions = nb_model.transform(dataset=test_data)

# Models Evaluation

In [35]:
# One shared evaluator: compares `prediction` against `labelIndex` and
# reports plain accuracy. Reused for every model below.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)

In [36]:
# Test-set accuracy of the logistic regression.
lr_accuracy = evaluator.evaluate(dataset=lr_predictions)
print(f"Logistic Regression Accuracy: {lr_accuracy}")

Logistic Regression Accuracy: 0.9803921568627451


In [46]:
# Test-set accuracy of the decision tree.
dt_accuracy = evaluator.evaluate(dataset=dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")

Decision Tree Accuracy: 0.9197012138188608


In [47]:
# Test-set accuracy of the untuned random forest.
rf_accuracy = evaluator.evaluate(dataset=rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy}")

Random Forest Accuracy: 0.8627450980392157


In [48]:
# Test-set accuracy of Naive Bayes.
nb_accuracy = evaluator.evaluate(dataset=nb_predictions)
print(f"Naive Bayes Accuracy: {nb_accuracy}")

Naive Bayes Accuracy: 0.9140989729225023


In [60]:
# Hyper-parameter search for the random forest: 2 x 2 x 2 = 8 candidate
# settings, each scored by 5-fold cross-validation on the training split.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 50])
    .addGrid(rf.maxDepth, [5, 10])
    .addGrid(rf.maxBins, [32, 64])
    .build()
)

# Same accuracy metric as the earlier model comparison.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="labelIndex",
    predictionCol="prediction",
)

crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

cvModel = crossval.fit(dataset=training_data)

# Keep the winning forest for reporting, then measure the cross-validated
# model on the held-out test split.
bestModel = cvModel.bestModel
accuracy = evaluator.evaluate(cvModel.transform(test_data))

# `getNumTrees` is read without call parentheses; the recorded output below
# shows it printing the tree count directly.
print(f"Best Number of Trees: {bestModel.getNumTrees}")
print(f"Best Max Depth: {bestModel.getMaxDepth()}")
print(f"Best Max Bins: {bestModel.getMaxBins()}")
print(f"Random Forest Accuracy: {accuracy}")

Best Number of Trees: 50
Best Max Depth: 10
Best Max Bins: 32
Random Forest Accuracy: 0.8786181139122315


# Conclusion

In [61]:
# Summary table of the test-set accuracies measured in the evaluation cells.
# The values come from the computed variables instead of hand-copied
# literals, so the table cannot drift from the results after a re-run.
# The dict is named `results` (not `data`) so the Spark DataFrame `data`
# built earlier in the notebook is not overwritten.
results = {
    'Classification Method': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Naive Bayes'],
    'Accuracy': [lr_accuracy, dt_accuracy, rf_accuracy, nb_accuracy],
}

df = pd.DataFrame(results)

print(df)

  Classification Method  Accuracy
0   Logistic Regression  0.980392
1         Decision Tree  0.919701
2         Random Forest  0.862745
3           Naive Bayes  0.914099


We can see that "Logistic Regression" achieves the best accuracy (about 0.98), so it is the most suitable method for this task. "Random Forest" has the worst accuracy (about 0.86 with default settings, and still only about 0.88 after cross-validated tuning). "Decision Tree" and "Naive Bayes" also give good results, with accuracies above 0.90, but they would need more tuning to get better results.