In [101]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()

### Vector Assembler

In [103]:
df = spark.read.csv(r"C:\Users\ranju\Downloads\iris.csv", inferSchema=True , header=True)
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [104]:
df.printSchema()

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [105]:
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [106]:
assembler = VectorAssembler(inputCols= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width'] ,\
                           outputCol='features')


In [107]:
output_df = assembler.transform(df)

In [108]:
output_df.show()

+------------+-----------+------------+-----------+-------+-----------------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|
+------------+-----------+------------+-----------+-------+-----------------+
|         5.1|        3.5|         1.4|        0.2| setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2| setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2| setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2| setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2| setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4| setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3| setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2| setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2| setosa|[4.4,2.9,1.4,0.2]|
|         4.9|        3.1|         1.5|        0.1| setosa|[4.9,

### String indexer

In [109]:
from pyspark.ml.feature import StringIndexer


In [110]:
df.select(col('species')).distinct().show()

+----------+
|   species|
+----------+
| virginica|
|versicolor|
|    setosa|
+----------+



In [111]:
indexer = StringIndexer(inputCol='species', outputCol='species_index')

In [112]:
indx = indexer.fit(df).transform(df)
indx.show(67)

+------------+-----------+------------+-----------+----------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|   species|species_index|
+------------+-----------+------------+-----------+----------+-------------+
|         5.1|        3.5|         1.4|        0.2|    setosa|          0.0|
|         4.9|        3.0|         1.4|        0.2|    setosa|          0.0|
|         4.7|        3.2|         1.3|        0.2|    setosa|          0.0|
|         4.6|        3.1|         1.5|        0.2|    setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2|    setosa|          0.0|
|         5.4|        3.9|         1.7|        0.4|    setosa|          0.0|
|         4.6|        3.4|         1.4|        0.3|    setosa|          0.0|
|         5.0|        3.4|         1.5|        0.2|    setosa|          0.0|
|         4.4|        2.9|         1.4|        0.2|    setosa|          0.0|
|         4.9|        3.1|         1.5|        0.1|    setosa|          0.0|

### One-Hot Encoding

In [113]:
from pyspark.ml.feature import OneHotEncoder

In [114]:
indx.show()

+------------+-----------+------------+-----------+-------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species_index|
+------------+-----------+------------+-----------+-------+-------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          0.0|
|         4.9|        3.0|         1.4|        0.2| setosa|          0.0|
|         4.7|        3.2|         1.3|        0.2| setosa|          0.0|
|         4.6|        3.1|         1.5|        0.2| setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2| setosa|          0.0|
|         5.4|        3.9|         1.7|        0.4| setosa|          0.0|
|         4.6|        3.4|         1.4|        0.3| setosa|          0.0|
|         5.0|        3.4|         1.5|        0.2| setosa|          0.0|
|         4.4|        2.9|         1.4|        0.2| setosa|          0.0|
|         4.9|        3.1|         1.5|        0.1| setosa|          0.0|
|         5.4|        3.7|         1.5

In [115]:
indx.distinct().show()

+------------+-----------+------------+-----------+----------+-------------+
|sepal_length|sepal_width|petal_length|petal_width|   species|species_index|
+------------+-----------+------------+-----------+----------+-------------+
|         5.4|        3.7|         1.5|        0.2|    setosa|          0.0|
|         6.3|        2.5|         4.9|        1.5|versicolor|          1.0|
|         5.1|        3.5|         1.4|        0.3|    setosa|          0.0|
|         6.7|        3.1|         4.4|        1.4|versicolor|          1.0|
|         5.1|        3.5|         1.4|        0.2|    setosa|          0.0|
|         5.7|        4.4|         1.5|        0.4|    setosa|          0.0|
|         5.4|        3.9|         1.3|        0.4|    setosa|          0.0|
|         5.0|        3.6|         1.4|        0.2|    setosa|          0.0|
|         5.2|        3.5|         1.5|        0.2|    setosa|          0.0|
|         5.5|        2.6|         4.4|        1.2|versicolor|          1.0|

In [116]:
encoder = OneHotEncoder(inputCol='species_index',outputCol='species_index_onehot')
models = encoder.fit(indx)
encoded = models.transform(indx)

In [117]:
encoded.show()

+------------+-----------+------------+-----------+-------+-------------+--------------------+
|sepal_length|sepal_width|petal_length|petal_width|species|species_index|species_index_onehot|
+------------+-----------+------------+-----------+-------+-------------+--------------------+
|         5.1|        3.5|         1.4|        0.2| setosa|          0.0|       (2,[0],[1.0])|
|         4.9|        3.0|         1.4|        0.2| setosa|          0.0|       (2,[0],[1.0])|
|         4.7|        3.2|         1.3|        0.2| setosa|          0.0|       (2,[0],[1.0])|
|         4.6|        3.1|         1.5|        0.2| setosa|          0.0|       (2,[0],[1.0])|
|         5.0|        3.6|         1.4|        0.2| setosa|          0.0|       (2,[0],[1.0])|
|         5.4|        3.9|         1.7|        0.4| setosa|          0.0|       (2,[0],[1.0])|
|         4.6|        3.4|         1.4|        0.3| setosa|          0.0|       (2,[0],[1.0])|
|         5.0|        3.4|         1.5|        0.2

### LINEAR Regression

In [118]:
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [119]:
df = spark.read.csv(r"C:\Users\ranju\Downloads\insurance.csv", inferSchema=True, header=True)
df .show()

+---+------+------+--------+------+---------+-----------+
|age|gender|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [120]:
df.count()

1338

In [121]:
df.columns
len(df.columns)

7

In [122]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [123]:
df.describe().show()

+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|summary|               age|gender|               bmi|         children|smoker|   region|           charges|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|  count|              1338|  1338|              1338|             1338|  1338|     1338|              1338|
|   mean| 39.20702541106129|  NULL|30.663396860986538|  1.0949177877429|  NULL|     NULL|13270.422265141257|
| stddev|14.049960379216147|  NULL| 6.098186911679012|1.205492739781914|  NULL|     NULL|12110.011236693992|
|    min|                18|female|             15.96|                0|    no|northeast|         1121.8739|
|    max|                64|  male|             53.13|                5|   yes|southwest|       63770.42801|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+



In [124]:
df.head(5)

[Row(age=19, gender='female', bmi=27.9, children=0, smoker='yes', region='southwest', charges=16884.924),
 Row(age=18, gender='male', bmi=33.77, children=1, smoker='no', region='southeast', charges=1725.5523),
 Row(age=28, gender='male', bmi=33.0, children=3, smoker='no', region='southeast', charges=4449.462),
 Row(age=33, gender='male', bmi=22.705, children=0, smoker='no', region='northwest', charges=21984.47061),
 Row(age=32, gender='male', bmi=28.88, children=0, smoker='no', region='northwest', charges=3866.8552)]

In [125]:
df.corr("age","charges")

0.299008193330648

In [126]:
df.corr("bmi","charges")

0.19834096883362903

In [127]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [128]:
# convert the string into numeric using StringIndexer

indexer =  StringIndexer(inputCol='gender', outputCol='gender_ind')
indexed = indexer.fit(df).transform(df)


In [129]:
indexed.show(10) #convert into categorical

+---+------+------+--------+------+---------+-----------+----------+
|age|gender|   bmi|children|smoker|   region|    charges|gender_ind|
+---+------+------+--------+------+---------+-----------+----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|       1.0|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|       0.0|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|       0.0|
| 33|  male|22.705|       0|    no|northwest|21984.47061|       0.0|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|       0.0|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|       1.0|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|       1.0|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|       1.0|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|       0.0|
| 60|female| 25.84|       0|    no|northwest|28923.13692|       1.0|
+---+------+------+--------+------+---------+-----------+----------+
only showing top 10 rows



In [130]:
# convert the string into numeric using StringIndexer

indexer =  StringIndexer(inputCol='smoker', outputCol='smoker_ind')
indexed = indexer.fit(indexed).transform(indexed)


In [131]:

indexer =  StringIndexer(inputCol='region', outputCol='region_ind')
indexed = indexer.fit(indexed).transform(indexed)


In [132]:
indexed.show(truncate=False)

+---+------+------+--------+------+---------+-----------+----------+----------+----------+
|age|gender|bmi   |children|smoker|region   |charges    |gender_ind|smoker_ind|region_ind|
+---+------+------+--------+------+---------+-----------+----------+----------+----------+
|19 |female|27.9  |0       |yes   |southwest|16884.924  |1.0       |1.0       |2.0       |
|18 |male  |33.77 |1       |no    |southeast|1725.5523  |0.0       |0.0       |0.0       |
|28 |male  |33.0  |3       |no    |southeast|4449.462   |0.0       |0.0       |0.0       |
|33 |male  |22.705|0       |no    |northwest|21984.47061|0.0       |0.0       |1.0       |
|32 |male  |28.88 |0       |no    |northwest|3866.8552  |0.0       |0.0       |1.0       |
|31 |female|25.74 |0       |no    |southeast|3756.6216  |1.0       |0.0       |0.0       |
|46 |female|33.44 |1       |no    |southeast|8240.5896  |1.0       |0.0       |0.0       |
|37 |female|27.74 |3       |no    |northwest|7281.5056  |1.0       |0.0       |1.0       |

In [133]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler


In [134]:
indexed.columns

['age',
 'gender',
 'bmi',
 'children',
 'smoker',
 'region',
 'charges',
 'gender_ind',
 'smoker_ind',
 'region_ind']

In [135]:
assembler = VectorAssembler(inputCols=['age','bmi','children','gender_ind',
 'smoker_ind','region_ind'], outputCol='features')

In [136]:
output = assembler.transform(indexed)

In [137]:
output.show(truncate=False)

+---+------+------+--------+------+---------+-----------+----------+----------+----------+-----------------------------+
|age|gender|bmi   |children|smoker|region   |charges    |gender_ind|smoker_ind|region_ind|features                     |
+---+------+------+--------+------+---------+-----------+----------+----------+----------+-----------------------------+
|19 |female|27.9  |0       |yes   |southwest|16884.924  |1.0       |1.0       |2.0       |[19.0,27.9,0.0,1.0,1.0,2.0]  |
|18 |male  |33.77 |1       |no    |southeast|1725.5523  |0.0       |0.0       |0.0       |[18.0,33.77,1.0,0.0,0.0,0.0] |
|28 |male  |33.0  |3       |no    |southeast|4449.462   |0.0       |0.0       |0.0       |[28.0,33.0,3.0,0.0,0.0,0.0]  |
|33 |male  |22.705|0       |no    |northwest|21984.47061|0.0       |0.0       |1.0       |[33.0,22.705,0.0,0.0,0.0,1.0]|
|32 |male  |28.88 |0       |no    |northwest|3866.8552  |0.0       |0.0       |1.0       |[32.0,28.88,0.0,0.0,0.0,1.0] |
|31 |female|25.74 |0       |no  

In [138]:
final = output.select('features','Charges')
final.show()

+--------------------+-----------+
|            features|    Charges|
+--------------------+-----------+
|[19.0,27.9,0.0,1....|  16884.924|
|[18.0,33.77,1.0,0...|  1725.5523|
|[28.0,33.0,3.0,0....|   4449.462|
|[33.0,22.705,0.0,...|21984.47061|
|[32.0,28.88,0.0,0...|  3866.8552|
|[31.0,25.74,0.0,1...|  3756.6216|
|[46.0,33.44,1.0,1...|  8240.5896|
|[37.0,27.74,3.0,1...|  7281.5056|
|[37.0,29.83,2.0,0...|  6406.4107|
|[60.0,25.84,0.0,1...|28923.13692|
|[25.0,26.22,0.0,0...|  2721.3208|
|[62.0,26.29,0.0,1...| 27808.7251|
|[23.0,34.4,0.0,0....|   1826.843|
|[56.0,39.82,0.0,1...| 11090.7178|
|[27.0,42.13,0.0,0...| 39611.7577|
|[19.0,24.6,1.0,0....|   1837.237|
|[52.0,30.78,1.0,1...| 10797.3362|
|[23.0,23.845,0.0,...| 2395.17155|
|[56.0,40.3,0.0,0....|  10602.385|
|[30.0,35.3,0.0,0....|  36837.467|
+--------------------+-----------+
only showing top 20 rows



In [139]:
train ,test = final.randomSplit([0.7,0.3])

In [140]:
train.show()

+--------------------+-----------+
|            features|    Charges|
+--------------------+-----------+
|(6,[0,1],[18.0,33...|  1136.3994|
|(6,[0,1],[18.0,34...|  1137.4697|
|(6,[0,1],[18.0,37...|  1141.4451|
|(6,[0,1],[18.0,41...|  1146.7966|
|(6,[0,1],[18.0,53...|  1163.4627|
|(6,[0,1],[21.0,23...|  1515.3449|
|(6,[0,1],[21.0,31...|16586.49771|
|(6,[0,1],[21.0,36...|  1534.3045|
|(6,[0,1],[22.0,26...|  1664.9996|
|(6,[0,1],[22.0,33...|  1674.6323|
|(6,[0,1],[23.0,26...|  1815.8759|
|(6,[0,1],[23.0,41...|  1837.2819|
|(6,[0,1],[24.0,32...|  1981.5819|
|(6,[0,1],[25.0,25...|  2137.6536|
|(6,[0,1],[26.0,35...|  2322.6218|
|(6,[0,1],[27.0,32...|  2497.0383|
|(6,[0,1],[27.0,33...|  2498.4144|
|(6,[0,1],[29.0,27...|  2867.1196|
|(6,[0,1],[36.0,29...|   4399.731|
|(6,[0,1],[37.0,36...|19214.70553|
+--------------------+-----------+
only showing top 20 rows



In [141]:
test.show()

+--------------------+-----------+
|            features|    Charges|
+--------------------+-----------+
|(6,[0,1],[18.0,23...|  1121.8739|
|(6,[0,1],[18.0,30...|  1131.5066|
|(6,[0,1],[18.0,33...|  1135.9407|
|(6,[0,1],[18.0,34...|   1137.011|
|(6,[0,1],[18.0,43...|  1149.3959|
|(6,[0,1],[20.0,33...|  1391.5287|
|(6,[0,1],[21.0,35...|  1532.4697|
|(6,[0,1],[23.0,32...|  1824.2854|
|(6,[0,1],[24.0,35...|  1986.9334|
|(6,[0,1],[27.0,23...|   2483.736|
|(6,[0,1],[28.0,38...|  2689.4954|
|(6,[0,1],[33.0,30...|  3704.3545|
|(6,[0,1],[34.0,34...|  3935.1799|
|(6,[0,1],[41.0,33...|  5699.8375|
|(6,[0,1],[48.0,40...|  7804.1605|
|(6,[0,1],[52.0,34...|   9140.951|
|(6,[0,1],[53.0,29...|  9487.6442|
|(6,[0,1],[53.0,31...|27346.04207|
|(6,[0,1],[58.0,49...| 11381.3254|
|(6,[0,1],[59.0,26...|  11743.299|
+--------------------+-----------+
only showing top 20 rows



In [142]:
from pyspark.ml.regression import LinearRegression

In [143]:
lr = LinearRegression(featuresCol='features',labelCol='Charges')

In [144]:
train_model = lr.fit(train)


In [145]:
rslt = train_model.evaluate(train)  #evaluate

print(rslt.r2)

0.7333159902146604


In [146]:
print(rslt.meanSquaredError)

38907295.69582776


In [147]:
print(rslt.meanAbsoluteError)

4294.992350328548


In [156]:
unlabled = test.select('features')   #prediction on test dataset
unlabled.show()

+--------------------+
|            features|
+--------------------+
|(6,[0,1],[18.0,23...|
|(6,[0,1],[18.0,30...|
|(6,[0,1],[18.0,33...|
|(6,[0,1],[18.0,34...|
|(6,[0,1],[18.0,43...|
|(6,[0,1],[20.0,33...|
|(6,[0,1],[21.0,35...|
|(6,[0,1],[23.0,32...|
|(6,[0,1],[24.0,35...|
|(6,[0,1],[27.0,23...|
|(6,[0,1],[28.0,38...|
|(6,[0,1],[33.0,30...|
|(6,[0,1],[34.0,34...|
|(6,[0,1],[41.0,33...|
|(6,[0,1],[48.0,40...|
|(6,[0,1],[52.0,34...|
|(6,[0,1],[53.0,29...|
|(6,[0,1],[53.0,31...|
|(6,[0,1],[58.0,49...|
|(6,[0,1],[59.0,26...|
+--------------------+
only showing top 20 rows



In [157]:
predictions = train_model.transform(unlabled)
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|(6,[0,1],[18.0,23...| -487.954954873323|
|(6,[0,1],[18.0,30...|1878.0788109682617|
|(6,[0,1],[18.0,33...|2967.2054650858154|
|(6,[0,1],[18.0,34...|3230.0981057348818|
|(6,[0,1],[18.0,43...| 6272.141518959774|
|(6,[0,1],[20.0,33...|3485.4174926675023|
|(6,[0,1],[21.0,35...|  4495.64533688425|
|(6,[0,1],[23.0,32...|3999.8428933909727|
|(6,[0,1],[24.0,35...| 5385.631652820668|
|(6,[0,1],[27.0,23...|1806.4430777229827|
|(6,[0,1],[28.0,38...| 7173.177538409947|
|(6,[0,1],[33.0,30...| 5802.225109352223|
|(6,[0,1],[34.0,34...| 7413.350417909687|
|(6,[0,1],[41.0,33...|  9001.75596531783|
|(6,[0,1],[48.0,40...|13068.863553131441|
|(6,[0,1],[52.0,34...|12039.702574623594|
|(6,[0,1],[53.0,29...| 10721.45274452005|
|(6,[0,1],[53.0,31...|11359.906300382061|
|(6,[0,1],[58.0,49...|18701.967104264782|
|(6,[0,1],[59.0,26...|11224.518264668854|
+--------------------+------------