# In this example, we will create an SVM model to predict whether a banknote is authentic or forged based on four image characteristics. Note that the data file does not have headers.

## Data file: banknote_authentication.csv

### Resources:

### Spark SVM documentation: https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-support-vector-machine

In [29]:
from pyspark.sql import SparkSession

In [30]:
# Obtain the Spark session for this notebook under the application name 'svm'
# (getOrCreate reuses an already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('svm')
    .getOrCreate()
)

### As the data has no headers (column names), we will first create a schema and then import the data into a dataframe with the predefined schema.

In [31]:
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType

In [32]:
# Explicit schema for the header-less CSV: four double-valued image features
# followed by the integer class label. Every column is declared nullable.
banknote_schema = StructType(
    [StructField(feature_name, DoubleType(), True)
     for feature_name in ('variance', 'skewness', 'kurtosis', 'entropy')]
    + [StructField('label', IntegerType(), True)]
)

In [33]:
# The file has no header row, so column names and types are taken from
# banknote_schema rather than read or inferred from the file.
data = (
    spark.read
    .schema(banknote_schema)
    .csv('banknote_authentication.csv', header=False)
)

In [34]:
# Check that the loaded columns carry the names and types declared in banknote_schema
data.printSchema()

root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- kurtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- label: integer (nullable = true)



In [35]:
# Preview the first rows of the raw data
data.show()

+--------+--------+--------+--------+-----+
|variance|skewness|kurtosis| entropy|label|
+--------+--------+--------+--------+-----+
|  3.6216|  8.6661| -2.8073|-0.44699|    0|
|  4.5459|  8.1674| -2.4586| -1.4621|    0|
|   3.866| -2.6383|  1.9242| 0.10645|    0|
|  3.4566|  9.5228| -4.0112| -3.5944|    0|
| 0.32924| -4.4552|  4.5718| -0.9888|    0|
|  4.3684|  9.6718| -3.9606| -3.1625|    0|
|  3.5912|  3.0129| 0.72888| 0.56421|    0|
|  2.0922|   -6.81|  8.4636|-0.60216|    0|
|  3.2032|  5.7588|-0.75345|-0.61251|    0|
|  1.5356|  9.1772| -2.2718|-0.73535|    0|
|  1.2247|  8.7779| -2.2135|-0.80647|    0|
|  3.9899| -2.7066|  2.3946| 0.86291|    0|
|  1.8993|  7.6625| 0.15394| -3.1108|    0|
| -1.5768|  10.843|  2.5462| -2.9362|    0|
|   3.404|  8.7261| -2.9915|-0.57242|    0|
|  4.6765| -3.3895|  3.4896|  1.4771|    0|
|  2.6719|  3.0646| 0.37158| 0.58619|    0|
| 0.80355|  2.8473|  4.3439|  0.6017|    0|
|  1.4479| -4.8794|  8.3428| -2.1086|    0|
|  5.2423| 11.0272|  -4.353| -4.

### Prepare the data

In [36]:
from pyspark.ml.feature import VectorAssembler

In [37]:
# List the column names to choose which ones feed the feature vector
data.columns

['variance', 'skewness', 'kurtosis', 'entropy', 'label']

In [38]:
# Combine the four numeric measurement columns into one vector column named
# 'features'; 'label' is intentionally left out of the inputs.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['variance', 'skewness', 'kurtosis', 'entropy'],
)

In [39]:
# Append the assembled 'features' vector column alongside the original columns
output = assembler.transform(data)

In [40]:
# Confirm the 'features' vector column was added
output.printSchema()

root
 |-- variance: double (nullable = true)
 |-- skewness: double (nullable = true)
 |-- kurtosis: double (nullable = true)
 |-- entropy: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)



In [41]:
# Keep only the two columns the classifier consumes: the feature vector and the label
final_data = output.select(['features', 'label'])

In [42]:
# Preview the model-ready frame (show() prints the top 20 rows by default)
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[3.6216,8.6661,-2...|    0|
|[4.5459,8.1674,-2...|    0|
|[3.866,-2.6383,1....|    0|
|[3.4566,9.5228,-4...|    0|
|[0.32924,-4.4552,...|    0|
|[4.3684,9.6718,-3...|    0|
|[3.5912,3.0129,0....|    0|
|[2.0922,-6.81,8.4...|    0|
|[3.2032,5.7588,-0...|    0|
|[1.5356,9.1772,-2...|    0|
|[1.2247,8.7779,-2...|    0|
|[3.9899,-2.7066,2...|    0|
|[1.8993,7.6625,0....|    0|
|[-1.5768,10.843,2...|    0|
|[3.404,8.7261,-2....|    0|
|[4.6765,-3.3895,3...|    0|
|[2.6719,3.0646,0....|    0|
|[0.80355,2.8473,4...|    0|
|[1.4479,-4.8794,8...|    0|
|[5.2423,11.0272,-...|    0|
+--------------------+-----+
only showing top 20 rows



### Train a linear SVM classifier

In [43]:
# 70/30 train/test split. A fixed seed keeps the split -- and therefore every
# metric computed from it -- repeatable across reruns on the same input data;
# without it each run evaluates on a different random test set.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [44]:
from pyspark.ml.classification import LinearSVC

In [45]:
# Linear SVM estimator reading the 'features' vector and 'label' column,
# limited to 10 optimizer iterations with regularization parameter 0.1.
lsvc = LinearSVC(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.1,
)

In [46]:
# Fit the linear SVM on the training split only
lsvc_model = lsvc.fit(train_data)

In [47]:
# Score the held-out test split; the result gains 'rawPrediction' and 'prediction' columns
lsvc_preds = lsvc_model.transform(test_data)

### Evaluate the model

In [48]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [49]:
# Binary evaluator comparing the model's 'rawPrediction' scores with 'label';
# the metric is area under ROC (the evaluator's default, spelled out here).
my_binary_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [50]:
# Print the area-under-ROC score of the test-set predictions
print('Area under ROC')
print(my_binary_eval.evaluate(lsvc_preds))

Area under ROC
0.9991790974763112


In [51]:
# Inspect scored test rows: features, true label, rawPrediction and predicted class
lsvc_preds.show()

+--------------------+-----+--------------------+----------+
|            features|label|       rawPrediction|prediction|
+--------------------+-----+--------------------+----------+
|[-7.0421,9.2,0.25...|    1|[-1.6745789277135...|       1.0|
|[-6.7526,8.8172,-...|    1|[-1.7121261863860...|       1.0|
|[-6.5235,9.6014,-...|    1|[-1.4322687381681...|       1.0|
|[-6.4247,9.5311,0...|    1|[-1.3577114091827...|       1.0|
|[-6.3679,8.0102,0...|    1|[-1.6135445011842...|       1.0|
|[-6.2003,8.6806,0...|    1|[-1.5077136087375...|       1.0|
|[-6.0598,9.2952,-...|    1|[-1.3534450644603...|       1.0|
|[-5.8818,7.6584,0...|    1|[-1.4656655618146...|       1.0|
|[-5.637,8.1261,0....|    1|[-1.3086341649750...|       1.0|
|[-5.4901,9.1048,-...|    1|[-1.1669586533815...|       1.0|
|[-5.3012,7.3915,0...|    1|[-1.2296693605402...|       1.0|
|[-5.2049,7.259,0....|    1|[-1.2083670603777...|       1.0|
|[-5.0676,-5.1877,...|    1|[-1.3518739276978...|       1.0|
|[-5.0477,-5.8023,...|  

In [52]:
# test accuracy (the same evaluator also supports weightedPrecision / weightedRecall via metricName)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [53]:
# Evaluator that compares the 'prediction' column with 'label' and reports accuracy
acc_eval = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [54]:
# Accuracy of the SVM's predictions on the held-out test split
lsvc_acc = acc_eval.evaluate(lsvc_preds)

In [55]:
# Display the test accuracy
lsvc_acc

0.9686746987951808