In [48]:
import pyspark
import boto3


In [45]:
# Problem statement and dataset can be found here https://www.kaggle.com/c/titanic

In [4]:
'''
Prerequisites:
Basics understand of python https://www.kaggle.com/learn/python

Pandas https://www.kaggle.com/learn/pandas

Machine learning https://www.kaggle.com/learn/machine-learning

Apache Spark

https://docs.databricks.com/spark/latest/gentle-introduction/gentle-intro.html

https://docs.databricks.com/spark/latest/gentle-introduction/gentle-intro.html#gentle-introduction-to-apache-spark

https://docs.databricks.com/spark/latest/gentle-introduction/for-data-scientists.html

'''

'\nPrerequisites:\nBasics understand of python https://www.kaggle.com/learn/python\n\nPandas https://www.kaggle.com/learn/pandas\n\nMachine learning https://www.kaggle.com/learn/machine-learning\n\nApache Spark\n\nhttps://docs.databricks.com/spark/latest/gentle-introduction/gentle-intro.html\n\nhttps://docs.databricks.com/spark/latest/gentle-introduction/gentle-intro.html#gentle-introduction-to-apache-spark\n\nhttps://docs.databricks.com/spark/latest/gentle-introduction/for-data-scientists.html\n\n'

In [5]:
# Importing the necessary libraries

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer

In [7]:
# Starting Point: SparkSession

In [8]:
# The entry point into all functionality in Spark is the SparkSession class. To create a basic SparkSession, just use SparkSession.builder

In [44]:
spark = SparkSession.builder \
    .appName("titanic_dataset") \
    .getOrCreate()

In [39]:
spark

In [46]:
import requests

# Define the URL and local file path
url = "https://testbucket1730.s3.eu-north-1.amazonaws.com/train+(1).csv"
local_path = "/tmp/dataset.csv"

# Download the file
response = requests.get(url)
with open(local_path, 'wb') as f:
    f.write(response.content)

In [49]:
titanic_df= spark.read.csv(local_path, header=True, inferSchema=True)
titanic_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [50]:

passengers_count = titanic_df.count()

In [51]:
passengers_count

891

In [52]:
# Summary

In [53]:
titanic_df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [54]:
titanic_df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [56]:
# Let's select few features

In [57]:

titanic_df.select("Survived","Pclass","Embarked").show()

+--------+------+--------+
|Survived|Pclass|Embarked|
+--------+------+--------+
|       0|     3|       S|
|       1|     1|       C|
|       1|     3|       S|
|       1|     1|       S|
|       0|     3|       S|
|       0|     3|       Q|
|       0|     1|       S|
|       0|     3|       S|
|       1|     3|       S|
|       1|     2|       C|
|       1|     3|       S|
|       1|     1|       S|
|       0|     3|       S|
|       0|     3|       S|
|       0|     3|       S|
|       1|     2|       S|
|       0|     3|       Q|
|       1|     2|       S|
|       0|     3|       S|
|       1|     3|       C|
+--------+------+--------+
only showing top 20 rows



In [58]:
# Exploratory Data Analysis (EDA)

In [59]:
titanic_df.groupBy("Survived").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  342|
|       0|  549|
+--------+-----+



In [60]:
# Out of 891 passengers in dataset, only around 342 survived.

In [61]:
titanic_df.groupBy("Sex","Survived").count().show()

+------+--------+-----+
|   Sex|Survived|count|
+------+--------+-----+
|  male|       0|  468|
|female|       1|  233|
|female|       0|   81|
|  male|       1|  109|
+------+--------+-----+



In [62]:
# This looks interesting. The number of men on the ship is lot more than the number of women. Still the number of women saved is almost twice the number of males saved.

In [63]:

titanic_df.groupBy("Pclass","Survived").count().show()

+------+--------+-----+
|Pclass|Survived|count|
+------+--------+-----+
|     1|       0|   80|
|     3|       1|  119|
|     1|       1|  136|
|     2|       1|   87|
|     2|       0|   97|
|     3|       0|  372|
+------+--------+-----+



In [64]:
# Checking Null values

In [65]:
# This function use to print feature with null values and null count
def null_value_count(df):
  null_columns_counts = []
  numRows = df.count()
  for k in df.columns:
    nullRows = df.where(col(k).isNull()).count()
    if(nullRows > 0):
      temp = k,nullRows
      null_columns_counts.append(temp)
  return(null_columns_counts)

In [66]:

# Calling function
null_columns_count_list = null_value_count(titanic_df)

In [67]:
null_columns_count_list

[('Age', 177), ('Cabin', 687), ('Embarked', 2)]

In [68]:
spark.createDataFrame(null_columns_count_list, ['Column_With_Null_Value', 'Null_Values_Count']).show()

+----------------------+-----------------+
|Column_With_Null_Value|Null_Values_Count|
+----------------------+-----------------+
|                   Age|              177|
|                 Cabin|              687|
|              Embarked|                2|
+----------------------+-----------------+



In [69]:
mean_age = titanic_df.select(mean('Age')).collect()[0][0]
print(mean_age)

29.69911764705882


In [74]:
titanic_df.head(5)

[Row(PassengerId=1, Survived=0, Pclass=3, Name='Braund, Mr. Owen Harris', Sex='male', Age=22.0, SibSp=1, Parch=0, Ticket='A/5 21171', Fare=7.25, Cabin=None, Embarked='S', Initial='Mr'),
 Row(PassengerId=2, Survived=1, Pclass=1, Name='Cumings, Mrs. John Bradley (Florence Briggs Thayer)', Sex='female', Age=38.0, SibSp=1, Parch=0, Ticket='PC 17599', Fare=71.2833, Cabin='C85', Embarked='C', Initial='Mrs'),
 Row(PassengerId=3, Survived=1, Pclass=3, Name='Heikkinen, Miss. Laina', Sex='female', Age=26.0, SibSp=0, Parch=0, Ticket='STON/O2. 3101282', Fare=7.925, Cabin=None, Embarked='S', Initial='Miss'),
 Row(PassengerId=4, Survived=1, Pclass=1, Name='Futrelle, Mrs. Jacques Heath (Lily May Peel)', Sex='female', Age=35.0, SibSp=1, Parch=0, Ticket='113803', Fare=53.1, Cabin='C123', Embarked='S', Initial='Mrs'),
 Row(PassengerId=5, Survived=0, Pclass=3, Name='Allen, Mr. William Henry', Sex='male', Age=35.0, SibSp=0, Parch=0, Ticket='373450', Fare=8.05, Cabin=None, Embarked='S', Initial='Mr')]

In [76]:
titanic_df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|Initial|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+-------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|     Mr|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|    Mrs|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|   Miss|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|    Mrs|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|     Mr|
+-----------+---

In [77]:
# Drop columns which are not required

In [78]:
titanic_df = titanic_df.drop("PassengerId","Name","Ticket","Cabin","Embarked","Sex","Initial")

In [79]:
titanic_df.show(5)

+--------+------+----+-----+-----+-------+
|Survived|Pclass| Age|SibSp|Parch|   Fare|
+--------+------+----+-----+-----+-------+
|       0|     3|22.0|    1|    0|   7.25|
|       1|     1|38.0|    1|    0|71.2833|
|       1|     3|26.0|    0|    0|  7.925|
|       1|     1|35.0|    1|    0|   53.1|
|       0|     3|35.0|    0|    0|   8.05|
+--------+------+----+-----+-----+-------+
only showing top 5 rows



In [90]:
from pyspark.ml.feature import Imputer

imputer = Imputer(inputCols=["Pclass", "Age", "SibSp", "Parch", "Fare"],
                  outputCols=["Pclass", "Age", "SibSp", "Parch", "Fare"]).setStrategy("mean")
titanic_df = imputer.fit(titanic_df).transform(titanic_df)



In [91]:
'''

Modelling

Here is the list of few Classification Algorithms from Spark ML

LogisticRegression

DecisionTreeClassifier

RandomForestClassifier

Gradient-boosted tree classifier

NaiveBayes

Support Vector Machine

LogisticRegression

'''

'\n\nModelling\n\nHere is the list of few Classification Algorithms from Spark ML\n\nLogisticRegression\n\nDecisionTreeClassifier\n\nRandomForestClassifier\n\nGradient-boosted tree classifier\n\nNaiveBayes\n\nSupport Vector Machine\n\nLogisticRegression\n\n'

In [92]:
# Let's put all features into a vector

In [93]:
feature = VectorAssembler(inputCols=titanic_df.columns[1:],outputCol="features")
feature_vector= feature.transform(titanic_df)

In [94]:
feature_vector

DataFrame[Survived: int, Pclass: int, Age: double, SibSp: int, Parch: int, Fare: double, features: vector]

In [95]:
(trainingData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 11)

In [104]:
# Logistic Regression

In [97]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="Survived", featuresCol="features")
#Training algo
lrModel = lr.fit(trainingData)


In [98]:
lr_prediction = lrModel.transform(testData)
lr_prediction.select("prediction", "Survived", "features").show()
evaluator = MulticlassClassificationEvaluator(labelCol="Survived", predictionCol="prediction", metricName="accuracy")

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       1.0|       0|(5,[0,1],[1.0,29....|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       1.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|[1.0,62.0,0.0,0.0...|
|       0.0|       0|[1.0,65.0,0.0,0.0...|
|       0.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [99]:
# Evaluate how well is LogisticRegression doing

In [100]:
lr_accuracy = evaluator.evaluate(lr_prediction)
print("Accuracy of LogisticRegression is = %g"% (lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))


Accuracy of LogisticRegression is = 0.68617
Test Error of LogisticRegression = 0.31383 


In [103]:
# Decision Tree Algorithm

In [101]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
dt_model = dt.fit(trainingData)
dt_prediction = dt_model.transform(testData)
dt_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,29....|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|[1.0,62.0,0.0,0.0...|
|       0.0|       0|[1.0,65.0,0.0,0.0...|
|       0.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [102]:
dt_accuracy = evaluator.evaluate(dt_prediction)
print("Accuracy of DecisionTreeClassifier is = %g"% (dt_accuracy))
print("Test Error of DecisionTreeClassifier = %g " % (1.0 - dt_accuracy))

Accuracy of DecisionTreeClassifier is = 0.702128
Test Error of DecisionTreeClassifier = 0.297872 


In [105]:
# RandomForestClassifier

In [106]:
from pyspark.ml.classification import RandomForestClassifier
rf = DecisionTreeClassifier(labelCol="Survived", featuresCol="features")
rf_model = rf.fit(trainingData)
rf_prediction = rf_model.transform(testData)
rf_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,29....|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|[1.0,62.0,0.0,0.0...|
|       0.0|       0|[1.0,65.0,0.0,0.0...|
|       0.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [107]:
rf_accuracy = evaluator.evaluate(rf_prediction)
print("Accuracy of RandomForestClassifier is = %g"% (rf_accuracy))
print("Test Error of RandomForestClassifier  = %g " % (1.0 - rf_accuracy))

Accuracy of RandomForestClassifier is = 0.702128
Test Error of RandomForestClassifier  = 0.297872 


In [108]:
# Gradient-boosted tree classifier

In [109]:
from pyspark.ml.classification import GBTClassifier
gbt = GBTClassifier(labelCol="Survived", featuresCol="features",maxIter=10)
gbt_model = gbt.fit(trainingData)
gbt_prediction = gbt_model.transform(testData)
gbt_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,29....|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|[1.0,62.0,0.0,0.0...|
|       0.0|       0|[1.0,65.0,0.0,0.0...|
|       0.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [110]:
gbt_accuracy = evaluator.evaluate(gbt_prediction)
print("Accuracy of Gradient-boosted tree classifie is = %g"% (gbt_accuracy))
print("Test Error of Gradient-boosted tree classifie %g"% (1.0 - gbt_accuracy))

Accuracy of Gradient-boosted tree classifie is = 0.702128
Test Error of Gradient-boosted tree classifie 0.297872


In [111]:
# NaiveBayes

In [112]:

from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="Survived", featuresCol="features")
nb_model = nb.fit(trainingData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       0.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       0.0|       0|(5,[0,1],[1.0,29....|
|       0.0|       0|[1.0,29.699117647...|
|       0.0|       0|[1.0,29.699117647...|
|       0.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       0.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       0.0|       0|[1.0,62.0,0.0,0.0...|
|       0.0|       0|[1.0,65.0,0.0,0.0...|
|       0.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [113]:
nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

Accuracy of NaiveBayes is  = 0.696809
Test Error of NaiveBayes  = 0.303191 


In [114]:
# Support Vector Machine

In [115]:
from pyspark.ml.classification import LinearSVC
svm = LinearSVC(labelCol="Survived", featuresCol="features")
svm_model = svm.fit(trainingData)
svm_prediction = svm_model.transform(testData)
svm_prediction.select("prediction", "Survived", "features").show()

+----------+--------+--------------------+
|prediction|Survived|            features|
+----------+--------+--------------------+
|       1.0|       0|[1.0,22.0,0.0,0.0...|
|       1.0|       0|[1.0,24.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,0.0,0.0...|
|       1.0|       0|[1.0,29.0,1.0,0.0...|
|       1.0|       0|(5,[0,1],[1.0,29....|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,29.699117647...|
|       1.0|       0|[1.0,31.0,1.0,0.0...|
|       1.0|       0|[1.0,37.0,1.0,0.0...|
|       1.0|       0|[1.0,38.0,0.0,1.0...|
|       1.0|       0|[1.0,45.0,1.0,0.0...|
|       1.0|       0|[1.0,47.0,0.0,0.0...|
|       1.0|       0|[1.0,58.0,0.0,2.0...|
|       1.0|       0|[1.0,62.0,0.0,0.0...|
|       1.0|       0|[1.0,65.0,0.0,0.0...|
|       1.0|       0|[1.0,71.0,0.0,0.0...|
|       1.0|       0|[2.0,19.0,1.0,1.0...|
+----------

In [116]:
# Evaluate how well is Support Vector Machine doing

In [117]:
svm_accuracy = evaluator.evaluate(svm_prediction)
print("Accuracy of Support Vector Machine is = %g"% (svm_accuracy))
print("Test Error of Support Vector Machine = %g " % (1.0 - svm_accuracy))

Accuracy of Support Vector Machine is = 0.680851
Test Error of Support Vector Machine = 0.319149 


In [118]:
# Accuracy of Support Vector Machine is = 0.836257 Test Error of Support Vector Machine = 0.163743

In [119]:
'''

How to increase accuracy of a model ?

Add new features or drop existing features and train model
Tune ML algorith (https://spark.apache.org/docs/latest/ml-tuning.html)

'''

'\n\nHow to increase accuracy of a model ?\n\nAdd new features or drop existing features and train model\nTune ML algorith (https://spark.apache.org/docs/latest/ml-tuning.html)\n\n'

In [122]:
# Reference
# https://spark.apache.org/docs/latest/ml-classification-regression.html