In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the notebook's Spark session (or reuse an already-running one),
# registered under the application name 'Ins'.
spark = (
    SparkSession.builder
    .appName('Ins')
    .getOrCreate()
)

In [3]:
import numpy as np

from pyspark.ml.feature import StringIndexer, OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline

In [4]:
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

import time

In [5]:
# Load the insurance data set: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    'insurance.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Preview the first rows of the raw data (show()'s defaults, spelled out).
df.show(n=20, truncate=True)

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [7]:

# One StringIndexer stage per string-valued column; each stage writes its
# numeric index into the paired "<column>_ind" output column.
index = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in (
        ("sex", "sex_ind"),
        ("smoker", "smoker_ind"),
        ("region", "region_ind"),
    )
]

In [8]:
# Chain the indexer stages into a single pipeline, fit it on the raw frame,
# and apply it to that same frame to append the "*_ind" columns.
pipe = Pipeline(stages=index)
NewDF = (
    pipe
    .fit(df)
    .transform(df)
)

In [9]:
# Pack the model inputs into one vector column named "comp_feat".
# Only age, bmi and the indexed sex/smoker columns are listed here;
# "children" and "region_ind" are not part of the feature vector.
assemble = VectorAssembler(
    inputCols=["age", "bmi", "sex_ind", "smoker_ind"],
    outputCol="comp_feat",
)

In [10]:
# Append the assembled "comp_feat" vector column to the indexed frame.
compiled = assemble.transform(dataset=NewDF)

In [12]:
# Keep only what the regression needs: the feature vector and the label.
FinalDF = compiled.select(
    "comp_feat",
    "charges",
)

In [13]:
# Split roughly 60% / 40% into training and held-out test sets.
# A fixed seed keeps the split -- and therefore every metric computed from
# it -- repeatable across kernel restarts; without it each run samples
# different rows and reports different numbers.
Train_df, Test_df = FinalDF.randomSplit([0.6, 0.4], seed=42)

In [14]:
from pyspark.ml.regression import LinearRegression

# Linear-regression estimator: reads features from "comp_feat" and the
# target from "charges"; every other parameter is left at its default.
LR = LinearRegression(
    featuresCol="comp_feat",
    labelCol="charges",
)

In [15]:
# Fit the linear model on the training split.
regressor = LR.fit(dataset=Train_df)

In [16]:
# Evaluate the fitted model on the held-out split. Despite its name,
# `predict` holds the object returned by evaluate(), not a predictions frame.
predict = regressor.evaluate(dataset=Test_df)

In [17]:
from pyspark.ml.evaluation import RegressionEvaluator

In [18]:
# Evaluator comparing the "charges" label against the model's output column.
# LR is built without overriding predictionCol, so the fitted model writes to
# Spark ML's default "prediction" column; the previous value "predict" named
# a column that does not exist and would make evaluate() fail.
evaluated_result = RegressionEvaluator(
    labelCol="charges",
    predictionCol="prediction",
)

In [19]:
# Report fit diagnostics from the model's summary object: iteration count,
# objective history, per-row residuals, RMSE and R^2.
# NOTE(review): these figures come from regressor.summary, not from the
# held-out evaluation stored in `predict` -- confirm that is intended.
trainingSummary = regressor.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
# Label corrected from "RSME" to "RMSE" (root mean squared error).
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 1
objectiveHistory: [0.0]
+-------------------+
|          residuals|
+-------------------+
|  4250.747787531364|
|  -8640.17567188698|
| 2170.2782029572304|
| 2186.2998199830204|
| 12251.313584748903|
| 1332.0454885837062|
| 1145.9312372979944|
| 1016.5902143291742|
|  683.2101411233648|
|-55.279134109405504|
| -7179.847690680614|
|-8077.9023022339425|
|  4736.975695874223|
| -767.6165037251315|
|-1014.9006522880916|
|  607.4703553357062|
|-1703.8401245291334|
|-1786.2681740501214|
|-1416.8317568999496|
|-2963.9634693843636|
+-------------------+
only showing top 20 rows

RSME: 5914.919464
r2: 0.756624
