In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

from pyspark.ml.feature import CountVectorizer, CountVectorizerModel

from pyspark.ml.feature import OneHotEncoder, StringIndexer

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassificationModel, RandomForestClassifier


from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.ml.feature import NGram

from pyspark.ml import Pipeline, PipelineModel


In [3]:
# with spark session
location = "/user/edureka_672184/m8_datasets/SMSSpamCollection"
raw_df = spark.read.option("delimiter", "\t").csv(location).toDF("spam", "message")

raw_df.show(2)

+----+--------------------+
|spam|             message|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
+----+--------------------+
only showing top 2 rows



In [4]:
# with spark context
raw = sparkContext.textFile(location) \
.map(lambda line: line.split("\t")) \
.map(lambda row: Row(spam=row[0].strip(), message=row[1].strip()))

raw_df = spark.createDataFrame(raw)
raw_df.show(2)

+--------------------+----+
|             message|spam|
+--------------------+----+
|Go until jurong p...| ham|
|Ok lar... Joking ...| ham|
+--------------------+----+
only showing top 2 rows



### 1. Extract words from the SMS message

In [5]:
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")
transformed = tokenizer.transform(raw_df)
transformed.show(3)

+--------------------+----+--------------------+
|             message|spam|               words|
+--------------------+----+--------------------+
|Go until jurong p...| ham|[go, until, juron...|
|Ok lar... Joking ...| ham|[ok, lar..., joki...|
|Free entry in 2 a...|spam|[free, entry, in,...|
+--------------------+----+--------------------+
only showing top 3 rows



### 2. Remove Stop words

In [6]:
remover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(3)

+--------------------+----+--------------------+--------------------+
|             message|spam|               words|            filtered|
+--------------------+----+--------------------+--------------------+
|Go until jurong p...| ham|[go, until, juron...|[go, jurong, poin...|
|Ok lar... Joking ...| ham|[ok, lar..., joki...|[ok, lar..., joki...|
|Free entry in 2 a...|spam|[free, entry, in,...|[free, entry, 2, ...|
+--------------------+----+--------------------+--------------------+
only showing top 3 rows



In [7]:
StopWordsRemover().getStopWords()[115:120]

[u'so', u'than', u'too', u'very', u's']

### 3. Modify the stop words to include your custom words such as ‘-‘

In [8]:
# remove dash
stopwords = StopWordsRemover().getStopWords() + ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cleaned = remover.transform(transformed)
cleaned.show(3)

+--------------------+----+--------------------+--------------------+
|             message|spam|               words|            filtered|
+--------------------+----+--------------------+--------------------+
|Go until jurong p...| ham|[go, until, juron...|[go, jurong, poin...|
|Ok lar... Joking ...| ham|[ok, lar..., joki...|[ok, lar..., joki...|
|Free entry in 2 a...|spam|[free, entry, in,...|[free, entry, 2, ...|
+--------------------+----+--------------------+--------------------+
only showing top 3 rows



### 4. Create the features from SMS message using CountVectorizer


In [9]:
# generate features
count_vect_model = CountVectorizer().setInputCol("filtered").setOutputCol("features").fit(cleaned)
featured = count_vect_model.transform(cleaned)

# convert to binary label
indexer = StringIndexer().setInputCol("spam").setOutputCol("label").fit(featured)
indexed = indexer.transform(featured)
indexed.show(3)

+--------------------+----+--------------------+--------------------+--------------------+-----+
|             message|spam|               words|            filtered|            features|label|
+--------------------+----+--------------------+--------------------+--------------------+-----+
|Go until jurong p...| ham|[go, until, juron...|[go, jurong, poin...|(13498,[8,12,33,6...|  0.0|
|Ok lar... Joking ...| ham|[ok, lar..., joki...|[ok, lar..., joki...|(13498,[0,26,308,...|  0.0|
|Free entry in 2 a...|spam|[free, entry, in,...|[free, entry, 2, ...|(13498,[2,14,20,3...|  1.0|
+--------------------+----+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



### 5. Split the data into train and test - decide on a strategy


In [10]:
train, test = indexed.randomSplit([0.7, 0.3], seed = 12345)
train.show(3)

+--------------------+----+--------------------+--------------------+--------------------+-----+
|             message|spam|               words|            filtered|            features|label|
+--------------------+----+--------------------+--------------------+--------------------+-----+
|"AH POOR BABY!HOP...| ham|["ah, poor, baby!...|["ah, poor, baby!...|(13498,[0,2,8,69,...|  0.0|
|"ALRITE HUNNY!WOT...| ham|["alrite, hunny!w...|["alrite, hunny!w...|(13498,[0,2,68,21...|  0.0|
|"Are you comingdo...| ham|["are, you, comin...|["are, comingdown...|(13498,[3577,8197...|  0.0|
+--------------------+----+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



### 6. Use logistic regression and check the accuracy


In [11]:
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
lrModel = lr.fit(train)
predictions = lrModel.transform(test)
predictions.select("features", "label", "prediction").show(5)
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print("Accuracy ", accuracy)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(13498,[0,2,47,71...|  0.0|       0.0|
|(13498,[9,15,45,5...|  0.0|       0.0|
|(13498,[2,9,19,32...|  0.0|       0.0|
|(13498,[3,7,287,1...|  0.0|       0.0|
|(13498,[200,1770,...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows

('Accuracy ', 0.5)


### 7. Try to use a Random Forest classifier and see if it increases the accuracy.


In [12]:
# Random Forest
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features").setNumTrees(10)
model = rf.fit(train)
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

('Accuracy', 0.515695067264574)


### 8. Introduce bi-gram and tri-gram and note the change in accuracy

In [17]:
ngram = NGram().setN(2).setInputCol("filtered").setOutputCol("ngrams")
ngrams_df = ngram.transform(indexed)

count_vect_model = CountVectorizer().setInputCol("ngrams").setOutputCol("features2").fit(ngrams_df)
featured = count_vect_model.transform(ngrams_df)

train, test = featured.randomSplit([0.7, 0.3], seed = 12345)
train.select("ngrams", "features", "features2").show(3)

+--------------------+--------------------+--------------------+
|              ngrams|            features|           features2|
+--------------------+--------------------+--------------------+
|["ah poor, poor b...|(13498,[0,2,8,69,...|(37504,[173,2643,...|
|["alrite hunny!wo...|(13498,[0,2,68,21...|(37504,[81,6995,7...|
|["are comingdown,...|(13498,[3577,8197...|(37504,[8893,2915...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [18]:
# Random Forest
rf = RandomForestClassifier().setLabelCol("label").setFeaturesCol("features2").setNumTrees(10)
model = rf.fit(train)
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

('Accuracy', 0.5044843049327354)


### 9. Decide on a strategy and generate a data pipeline.

In [29]:
tokenizer = Tokenizer().setInputCol("message").setOutputCol("words")
stopwords = StopWordsRemover().getStopWords()+ ["-"]
remover = StopWordsRemover().setStopWords(stopwords).setInputCol("words").setOutputCol("filtered")
cvmodel = CountVectorizer().setInputCol("filtered").setOutputCol("features")
indexer = StringIndexer().setInputCol("spam").setOutputCol("label")
lr = LogisticRegression().setMaxIter(10).setRegParam(0.3).setElasticNetParam(0.8)
pipeline = Pipeline().setStages([tokenizer, remover, cvmodel, indexer, lr])
model = pipeline.fit(raw_df)
model.write().overwrite().save("use_cases/spam_model_ca1")

In [30]:
# load pipeline
pipeline = pipeline = PipelineModel.load("use_cases/spam_model_ca1")

In [35]:
lr_predictions = pipeline.transform(raw_df)
evaluator = BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("prediction").setMetricName("areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print ("Accuracy", accuracy)

('Accuracy', 0.5044843049327354)
