In [39]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col,count,when

In [40]:
sparkObj=SparkSession.builder.appName('Linear Regression').getOrCreate()

In [41]:
dataframe=sparkObj.read.csv('../diabetes.csv',header=True)
dataframe.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [42]:
typecaseDF=dataframe.select(
    *(col(c).cast('float') for c in dataframe.columns)
)
typecaseDF.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [43]:
temp_df=typecaseDF.drop('Outcome')
assembler = VectorAssembler(inputCols=temp_df.columns,outputCol='Status')
data=assembler.transform(typecaseDF)

In [44]:
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|              Status|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
|        5.0|  116.0|   

In [45]:
from pyspark.ml.feature import StandardScaler

In [46]:
standardScalar=StandardScaler().setInputCol('Status').setOutputCol('Scaled Status')
data=standardScalar.fit(data).transform(data)
data.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)
 |-- Status: vector (nullable = true)
 |-- Scaled Status: vector (nullable = true)



In [47]:
assembled_data = data.select("Scaled status","Outcome")
assembled_data.show()

+--------------------+-------+
|       Scaled status|Outcome|
+--------------------+-------+
|[1.78063837321943...|    1.0|
|[0.29677306220323...|    0.0|
|[2.37418449762590...|    1.0|
|[0.29677306220323...|    0.0|
|[0.0,4.2849165233...|    1.0|
|[1.48386531101619...|    0.0|
|[0.89031918660971...|    1.0|
|[2.96773062203238...|    0.0|
|[0.59354612440647...|    1.0|
|[2.37418449762590...|    1.0|
|[1.18709224881295...|    0.0|
|[2.96773062203238...|    1.0|
|[2.96773062203238...|    0.0|
|[0.29677306220323...|    1.0|
|[1.48386531101619...|    1.0|
|[2.07741143542266...|    1.0|
|[0.0,3.6906580274...|    1.0|
|[2.07741143542266...|    1.0|
|[0.29677306220323...|    0.0|
|[0.29677306220323...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [48]:
train,test=assembled_data.randomSplit([0.7,0.3])

In [49]:
lin_reg=LinearRegression(featuresCol='Scaled status',labelCol='Outcome',maxIter=40)
model=lin_reg.fit(train)

22/12/30 18:42:23 WARN Instrumentation: [acaa69df] regParam is zero, which might cause numerical instability and overfitting.


In [50]:
predict_test=model.transform(test)
predict_test.show()

+--------------------+-------+--------------------+
|       Scaled status|Outcome|          prediction|
+--------------------+-------+--------------------+
|(8,[0,1,6,7],[0.5...|    0.0|-0.27682773460324106|
|(8,[0,1,6,7],[0.5...|    0.0|-0.19488803533254073|
|(8,[0,1,6,7],[0.8...|    0.0|-0.21235248841212073|
|(8,[1,5,6,7],[2.2...|    0.0|-0.01251871137070...|
|(8,[1,5,6,7],[3.6...|    0.0|  0.5221458359015433|
|(8,[1,5,6,7],[3.7...|    1.0| 0.35617677787767443|
|(8,[1,5,6,7],[4.0...|    1.0|  0.5841729420390375|
|(8,[1,5,6,7],[4.5...|    1.0|  0.7374954108068862|
|[0.0,1.7827754878...|    0.0|-0.07728844364633691|
|[0.0,2.4395875096...|    0.0|0.018055646842821127|
|[0.0,2.6272480873...|    0.0| 0.10772234284331939|
|[0.0,2.8461854279...|    0.0|  0.1719339802119063|
|[0.0,2.9087389538...|    0.0| 0.07532798178006272|
|[0.0,2.9712924797...|    0.0|  0.2561544376701571|
|[0.0,2.9712924797...|    0.0| 0.11972629733517293|
|[0.0,3.1276762944...|    0.0|  0.1260336795065976|
|[0.0,3.1589

In [51]:
predict_test.select("Outcome","prediction").show(10)

+-------+--------------------+
|Outcome|          prediction|
+-------+--------------------+
|    0.0|-0.27682773460324106|
|    0.0|-0.19488803533254073|
|    0.0|-0.21235248841212073|
|    0.0|-0.01251871137070...|
|    0.0|  0.5221458359015433|
|    1.0| 0.35617677787767443|
|    1.0|  0.5841729420390375|
|    1.0|  0.7374954108068862|
|    0.0|-0.07728844364633691|
|    0.0|0.018055646842821127|
+-------+--------------------+
only showing top 10 rows



In [52]:
pred_evalutor=RegressionEvaluator(predictionCol='prediction',labelCol='Outcome',metricName='r2')
pred_evalutor.evaluate(predict_test)

0.3247512411886444

In [53]:
sparkObj.stop()