# Predicting if a person would buy life insurnace based on his age using logistic regression

### Initialize and create a spark session

In [1]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Insurance").getOrCreate()

2019-12-29 19:30:33 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@769756fc


### Initialize Logger

In [2]:
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML for Logistic Regression

In [3]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler,OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler, OneHotEncoder}
import org.apache.spark.ml.linalg.Vectors


### Using Spark to read the customer insurance data set

In [4]:
val data = spark.read.options(Map(("header","true"),("inferSchema","true"))).csv("insurance_data.csv")

data: org.apache.spark.sql.DataFrame = [age: int, bought_insurance: int]


### Printing the first row of the dataframe

In [5]:
data.head(1)

res1: Array[org.apache.spark.sql.Row] = Array([22,0])


### Printing the schema of the dataframe

In [6]:
data.printSchema

root
 |-- age: integer (nullable = true)
 |-- bought_insurance: integer (nullable = true)



### Show

In [7]:
data.show(5)

+---+----------------+
|age|bought_insurance|
+---+----------------+
| 22|               0|
| 25|               0|
| 47|               1|
| 52|               0|
| 46|               1|
+---+----------------+
only showing top 5 rows



### Count

In [8]:
data.count()

res4: Long = 27


### Count by dropping duplicates

In [9]:
data.na.drop().count()

res5: Long = 27


### Assembling all the features to a single vector column "features"

In [10]:
val assembler = new VectorAssembler().setInputCols(Array("age")).setOutputCol("features")

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_1a7c77932116


In [11]:
val output = assembler.transform(data)

output: org.apache.spark.sql.DataFrame = [age: int, bought_insurance: int ... 1 more field]


In [12]:
output.show(5)

+---+----------------+--------+
|age|bought_insurance|features|
+---+----------------+--------+
| 22|               0|  [22.0]|
| 25|               0|  [25.0]|
| 47|               1|  [47.0]|
| 52|               0|  [52.0]|
| 46|               1|  [46.0]|
+---+----------------+--------+
only showing top 5 rows



### Splitting the resultant data into training data and testing data,

<code>
<b>Training data is to train the model</b>
<b>Testing data is to test the builted model</b>
</code>

In [13]:
val Array(train_data,test_data) = output.randomSplit(Array(0.7,0.3))

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [age: int, bought_insurance: int ... 1 more field]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [age: int, bought_insurance: int ... 1 more field]


In [14]:
output.describe().show()

+-------+------------------+------------------+
|summary|               age|  bought_insurance|
+-------+------------------+------------------+
|  count|                27|                27|
|   mean|39.666666666666664|0.5185185185185185|
| stddev|15.745573248474537|0.5091750772173156|
|    min|                18|                 0|
|    max|                62|                 1|
+-------+------------------+------------------+



In [15]:
train_data.describe().show()

+-------+------------------+-------------------+
|summary|               age|   bought_insurance|
+-------+------------------+-------------------+
|  count|                17|                 17|
|   mean|41.470588235294116|0.47058823529411764|
| stddev|  16.5004456267809| 0.5144957554275266|
|    min|                18|                  0|
|    max|                62|                  1|
+-------+------------------+-------------------+



In [16]:
test_data.describe().show()

+-------+------------------+------------------+
|summary|               age|  bought_insurance|
+-------+------------------+------------------+
|  count|                10|                10|
|   mean|              36.6|               0.6|
| stddev|14.683323874382122|0.5163977794943222|
|    min|                18|                 0|
|    max|                55|                 1|
+-------+------------------+------------------+



### Creating a logistic regression model object

In [17]:
val lor = new LogisticRegression().setLabelCol("bought_insurance").setFeaturesCol("features")

lor: org.apache.spark.ml.classification.LogisticRegression = logreg_5634df45c2fe


### Creating a logistic regression model and fitting the training data to it

In [18]:
val insuranceModel = lor.fit(train_data)

2019-12-29 19:38:02 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-29 19:38:02 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


insuranceModel: org.apache.spark.ml.classification.LogisticRegressionModel = logreg_5634df45c2fe


### Getting Results on Test Set

In [19]:
val results = insuranceModel.transform(test_data)

results: org.apache.spark.sql.DataFrame = [age: int, bought_insurance: int ... 4 more fields]


In [20]:
results.printSchema

root
 |-- age: integer (nullable = true)
 |-- bought_insurance: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
results.show(5)

+---+----------------+--------+--------------------+--------------------+----------+
|age|bought_insurance|features|       rawPrediction|         probability|prediction|
+---+----------------+--------+--------------------+--------------------+----------+
| 18|               0|  [18.0]|[3.80152092585476...|[0.97815125699042...|       0.0|
| 21|               0|  [21.0]|[3.36961622725230...|[0.96674135411404...|       0.0|
| 23|               0|  [23.0]|[3.08167976151734...|[0.95613069585358...|       0.0|
| 25|               1|  [25.0]|[2.79374329578237...|[0.94233678567471...|       0.0|
| 29|               0|  [29.0]|[2.21787036431243...|[0.90184283664434...|       0.0|
+---+----------------+--------+--------------------+--------------------+----------+
only showing top 5 rows



## MODEL EVALUATION

### 1) Converting the data to rdd and evaluating using MulticlassMetrics to print the confusion matrix

In [22]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

import org.apache.spark.mllib.evaluation.MulticlassMetrics


In [23]:
val clean_result = results.withColumn("bought_insurance",results("bought_insurance").cast("double"))

clean_result: org.apache.spark.sql.DataFrame = [age: int, bought_insurance: double ... 4 more fields]


In [24]:
clean_result.select("bought_insurance","prediction").show(5)

+----------------+----------+
|bought_insurance|prediction|
+----------------+----------+
|             0.0|       0.0|
|             0.0|       0.0|
|             0.0|       0.0|
|             1.0|       0.0|
|             0.0|       0.0|
+----------------+----------+
only showing top 5 rows



In [25]:
val predictionAndLabel = clean_result.select("bought_insurance","prediction").as[(Double,Double)].rdd

predictionAndLabel: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[95] at rdd at <console>:43


In [26]:
val metrics = new MulticlassMetrics(predictionAndLabel)

metrics: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@63b3e208


#### Printing the confusion matrix

In [27]:
println(metrics.confusionMatrix)

4.0  1.0  
0.0  5.0  


#### Printing the Accuracy

In [28]:
println(metrics.accuracy)

0.9


#### Recall

In [29]:
println(metrics.recall)

0.9


#### precision

In [30]:
println(metrics.precision)

0.9


### 2) Evaluating using BinaryClassificationEvaluator

In [31]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator


In [32]:
val bin_eval = new BinaryClassificationEvaluator().setRawPredictionCol("rawPrediction").setLabelCol("bought_insurance")

bin_eval: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_53d0fad1e505


#### Calculating Area Under ROC

In [33]:
val AOC =bin_eval.evaluate(results)

AOC: Double = 0.9583333333333334


#### Printing Area Under ROC

In [34]:
println(AOC)

0.9583333333333334


### 3) Evaluating using MulticlassClassificationEvaluator

In [35]:
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


In [36]:
val multi_eval = new MulticlassClassificationEvaluator().setPredictionCol("prediction").setLabelCol("bought_insurance")

multi_eval: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_9c617adee1f3


#### Calculating Area Under ROC

In [37]:
val AOC_2 = multi_eval.evaluate(results)

AOC_2: Double = 0.901010101010101


#### Printing Area Under ROC

In [38]:
println(AOC_2)

0.901010101010101


### Stopping the created spark session

In [39]:
spark.stop()

## Thank You!