## Predicting if a person would buy life insurnace based on his age using logistic regression

In [37]:
# Initialize pyspark
import findspark
findspark.init()
import pyspark

In [38]:
# Initialize and create a spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Insurance').getOrCreate()

In [39]:
# Import statements to setup ML
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [40]:
# Using Spark to read the customer insurance data set
data = spark.read.csv('insurance_data.csv', header=True, inferSchema=True)

In [41]:
# Printing the first row of the dataframe
data.show(4)

+---+----------------+
|age|bought_insurance|
+---+----------------+
| 22|               0|
| 25|               0|
| 47|               1|
| 52|               0|
+---+----------------+
only showing top 4 rows



In [42]:
# Printing the schema of the dataframe
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- bought_insurance: integer (nullable = true)



In [43]:
data.count()

27

In [44]:
#Assembling all the features to a single vector column "features"
assembler = VectorAssembler(inputCols=['age'], outputCol='features')

In [45]:
output = assembler.transform(data)

In [46]:
output.show(4)

+---+----------------+--------+
|age|bought_insurance|features|
+---+----------------+--------+
| 22|               0|  [22.0]|
| 25|               0|  [25.0]|
| 47|               1|  [47.0]|
| 52|               0|  [52.0]|
+---+----------------+--------+
only showing top 4 rows



__Splitting the resultant data into training data and testing data, Training data is to train the model, Testing data is to test the builted model__

In [47]:
train_data, test_data = output.randomSplit([0.7,0.3])

In [48]:
train_data.count()

19

In [49]:
test_data.count()

8

In [50]:
#Creating a logistic regression model object
lor = LogisticRegression(labelCol='bought_insurance', featuresCol='features')

In [51]:
# Creating a logistic regression model and fitting the training data to it
insuranceModel = lor.fit(train_data)

In [52]:
#Getting Results on Test Set
results = insuranceModel.transform(test_data)

In [53]:
results.printSchema()

root
 |-- age: integer (nullable = true)
 |-- bought_insurance: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [54]:
results.count()

8

In [55]:
results.show(4)

+---+----------------+--------+--------------------+--------------------+----------+
|age|bought_insurance|features|       rawPrediction|         probability|prediction|
+---+----------------+--------+--------------------+--------------------+----------+
| 18|               0|  [18.0]|[2.65583952216099...|[0.93436999541627...|       0.0|
| 19|               0|  [19.0]|[2.51487038710603...|[0.92517773660748...|       0.0|
| 25|               0|  [25.0]|[1.66905557677624...|[0.84144986440030...|       0.0|
| 47|               1|  [47.0]|[-1.4322653944329...|[0.19274595492869...|       1.0|
+---+----------------+--------+--------------------+--------------------+----------+
only showing top 4 rows



### MODEL EVALUATION

__1) Converting the data to rdd and evaluating using MulticlassMetrics to print the confusion matrix__

In [56]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [57]:
clean_result = results.withColumn('bought_insurance',results['bought_insurance'].cast('double'))

In [58]:
clean_result.select('prediction','bought_insurance').show(3)

+----------+----------------+
|prediction|bought_insurance|
+----------+----------------+
|       0.0|             0.0|
|       0.0|             0.0|
|       0.0|             0.0|
+----------+----------------+
only showing top 3 rows



In [59]:
predictionAndLabel = clean_result.select('prediction','bought_insurance').rdd

In [60]:
metrics = MulticlassMetrics(predictionAndLabel)

In [61]:
#Printing the confusion matrix
print(metrics.confusionMatrix())

DenseMatrix([[3., 1.],
             [0., 4.]])


In [62]:
#Printing the Accuracy
print(metrics.accuracy)

0.875


In [63]:
metrics.recall()

0.875

In [64]:
metrics.precision()

0.875

__2) Evaluating using BinaryClassificationEvaluator__

In [65]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [66]:
bin_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='bought_insurance')

In [67]:
#Calculating Area Under ROC
AOC = bin_eval.evaluate(results)

In [68]:
#Printing Area Under ROC
print(AOC)

0.9375


In [None]:
#Closing spark session
spark.stop()