In [1]:
from pyspark.sql import SparkSession

In [2]:
spark= SparkSession.builder.appName('nlp').getOrCreate()

In [3]:
#Importing all the necessary libraries
import numpy as np

from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline

#ML Logisitic Regression
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [4]:
#Read the dataset
insurance= spark.read.csv('insurance.csv', header=True, inferSchema=True)

In [5]:
insurance.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [6]:
# To count the dataset by sex
insurance.groupBy('sex').count().show()

+------+-----+
|   sex|count|
+------+-----+
|female|  662|
|  male|  676|
+------+-----+



In [7]:
# To count the dataset by region
insurance.groupBy('region').count().show()

+---------+-----+
|   region|count|
+---------+-----+
|northwest|  325|
|southeast|  364|
|northeast|  324|
|southwest|  325|
+---------+-----+



In [8]:
# To count the dataset by smoker type
insurance.groupBy('smoker').count().show()

+------+-----+
|smoker|count|
+------+-----+
|    no| 1064|
|   yes|  274|
+------+-----+



In [9]:
# To count the dataset by children
insurance.groupBy('children').count().show()

+--------+-----+
|children|count|
+--------+-----+
|       1|  324|
|       3|  157|
|       5|   18|
|       4|   25|
|       2|  240|
|       0|  574|
+--------+-----+



With the help of string indexing, we have changed the index string values into integer values and also performed feature engineering on the dataset and cleaned the dataset for modelling


In [11]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(insurance) for column in list(set(insurance.columns)-set(['Provider_Id, Provider_Zip_Code, Total_Discharges, Average_Covered_Charges, Average_Total_Payments, Average_Medicare_Payments'])) ]
pipeline = Pipeline(stages=indexers)
data = pipeline.fit(insurance).transform(insurance)
data.show(10)

+---+------+------+--------+------+---------+-----------+---------+--------------+------------+-------------+---------+---------+------------+
|age|   sex|   bmi|children|smoker|   region|    charges|bmi_index|children_index|region_index|charges_index|age_index|sex_index|smoker_index|
+---+------+------+--------+------+---------+-----------+---------+--------------+------------+-------------+---------+---------+------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|    412.0|           0.0|         2.0|        340.0|      1.0|      1.0|         1.0|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|    283.0|           1.0|         0.0|        358.0|      0.0|      0.0|         0.0|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|     32.0|           3.0|         0.0|        891.0|     17.0|      0.0|         0.0|
| 33|  male|22.705|       0|    no|northwest|21984.47061|    130.0|           0.0|         1.0|        500.0|     30.0|      0.0|         0.0|

In [12]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Vector Assembler is used
assembler = VectorAssembler(
    inputCols=["age","children","bmi","region_index", "smoker_index", "sex_index"], outputCol="features")
output = assembler.transform(data)
output.select('features', 'charges').show(10)

+--------------------+-----------+
|            features|    charges|
+--------------------+-----------+
|[19.0,0.0,27.9,2....|  16884.924|
|[18.0,1.0,33.77,0...|  1725.5523|
|[28.0,3.0,33.0,0....|   4449.462|
|[33.0,0.0,22.705,...|21984.47061|
|[32.0,0.0,28.88,1...|  3866.8552|
|[31.0,0.0,25.74,0...|  3756.6216|
|[46.0,1.0,33.44,0...|  8240.5896|
|[37.0,3.0,27.74,1...|  7281.5056|
|[37.0,2.0,29.83,3...|  6406.4107|
|[60.0,0.0,25.84,1...|28923.13692|
+--------------------+-----------+
only showing top 10 rows



The dataset is splitted  into train data and test data for modelling

In [13]:
splits = output.randomSplit([0.7, 0.3])
train_data = splits[0]
test_data = splits[1]

# Linear Regression model training


In [14]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
linr = LinearRegression(maxIter=6, regParam=0.0, labelCol='charges', solver="normal")
model = linr.fit(train_data)

In [16]:
# Finding the Model summary
trainingSummary = model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 5863.932211
r2: 0.757061


In [21]:
# Predicting the test dataset using the trained model
predictions = model.transform(test_data)
predictions.show(10)

+---+------+------+--------+------+---------+-----------+---------+--------------+------------+-------------+---------+---------+------------+--------------------+------------------+
|age|   sex|   bmi|children|smoker|   region|    charges|bmi_index|children_index|region_index|charges_index|age_index|sex_index|smoker_index|            features|        prediction|
+---+------+------+--------+------+---------+-----------+---------+--------------+------------+-------------+---------+---------+------------+--------------------+------------------+
| 18|female| 24.09|       1|    no|southeast|  2201.0971|    385.0|           1.0|         0.0|        502.0|      0.0|      1.0|         0.0|[18.0,1.0,24.09,0...|114.03357212963783|
| 18|female|26.315|       0|    no|northeast| 2198.18985|     44.0|           0.0|         3.0|        499.0|      0.0|      1.0|         0.0|[18.0,0.0,26.315,...|1051.9607290210643|
| 18|female| 27.28|       3|   yes|southeast| 18223.4512|    408.0|           3.0|   

In [22]:
evaluator = RegressionEvaluator(labelCol="charges")
rmse = evaluator.evaluate(predictions,{evaluator.metricName:"rmse" })
np.sqrt(rmse), rmse

(80.47773494414807, 6476.66582174055)

In [23]:
print("R Squared (R2) on test data = %g" % evaluator.evaluate(predictions,{evaluator.metricName:"r2" }))

R Squared (R2) on test data = 0.733913


Through the analysis with Linear Regression model on the dataset, we obtained the RMSE score of 6476.66 and R2 value of 0.733 on the test dataset. 
Hence, we conclude is capable of making predictions of the charges based on the details of the patients and provides best fit for the trained model. We also learnt through our analysis that we get the best fit for the trained model with the help of linear regression.
