In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark=SparkSession.builder.appName('Tutorial Objective').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/30 15:33:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/30 15:33:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [24]:
dataframe=spark.read.csv('diabetes.csv',header=True)
dataframe.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|  31|                   0.248| 26|      1|


In [25]:
from pyspark.sql.functions import col,count,when,isnull

In [29]:
typcast_df=dataframe.select(
    *(col(c).cast('float') for c in dataframe.columns)
)

In [30]:
typcast_df.printSchema()

root
 |-- Pregnancies: float (nullable = true)
 |-- Glucose: float (nullable = true)
 |-- BloodPressure: float (nullable = true)
 |-- SkinThickness: float (nullable = true)
 |-- Insulin: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- DiabetesPedigreeFunction: float (nullable = true)
 |-- Age: float (nullable = true)
 |-- Outcome: float (nullable = true)



In [31]:
typcast_df.select(
    [
        count(when(col(c).isNull(),c)) for c in typcast_df.columns
    ]
).show()

+-----------------------------------------------------------+---------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------------------+---------------------------------------------------+-------------------------------------------+-------------------------------------------------------------------------------------+-------------------------------------------+---------------------------------------------------+
|count(CASE WHEN (Pregnancies IS NULL) THEN Pregnancies END)|count(CASE WHEN (Glucose IS NULL) THEN Glucose END)|count(CASE WHEN (BloodPressure IS NULL) THEN BloodPressure END)|count(CASE WHEN (SkinThickness IS NULL) THEN SkinThickness END)|count(CASE WHEN (Insulin IS NULL) THEN Insulin END)|count(CASE WHEN (BMI IS NULL) THEN BMI END)|count(CASE WHEN (DiabetesPedigreeFunction IS NULL) THEN DiabetesPedigreeFunction END)|count(CASE WHEN (Age IS NULL) THEN Age END)|count(CASE WH

In [34]:
new_df=dataframe.drop('Outcome')
assembler=VectorAssembler(inputCols=new_df.columns,outputCol="Diabetes Status")
outcomes=assembler.transform(typcast_df)

In [35]:
outcomes.show()
data=outcomes.select('')

+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction| Age|Outcome|     Diabetes Status|
+-----------+-------+-------------+-------------+-------+----+------------------------+----+-------+--------------------+
|        6.0|  148.0|         72.0|         35.0|    0.0|33.6|                   0.627|50.0|    1.0|[6.0,148.0,72.0,3...|
|        1.0|   85.0|         66.0|         29.0|    0.0|26.6|                   0.351|31.0|    0.0|[1.0,85.0,66.0,29...|
|        8.0|  183.0|         64.0|          0.0|    0.0|23.3|                   0.672|32.0|    1.0|[8.0,183.0,64.0,0...|
|        1.0|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167|21.0|    0.0|[1.0,89.0,66.0,23...|
|        0.0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288|33.0|    1.0|[0.0,137.0,40.0,3...|
|        5.0|  116.0|   

In [36]:
data=outcomes.select('Diabetes Status','Outcome')
data.show()

+--------------------+-------+
|     Diabetes Status|Outcome|
+--------------------+-------+
|[6.0,148.0,72.0,3...|    1.0|
|[1.0,85.0,66.0,29...|    0.0|
|[8.0,183.0,64.0,0...|    1.0|
|[1.0,89.0,66.0,23...|    0.0|
|[0.0,137.0,40.0,3...|    1.0|
|[5.0,116.0,74.0,0...|    0.0|
|[3.0,78.0,50.0,32...|    1.0|
|[10.0,115.0,0.0,0...|    0.0|
|[2.0,197.0,70.0,4...|    1.0|
|[8.0,125.0,96.0,0...|    1.0|
|[4.0,110.0,92.0,0...|    0.0|
|[10.0,168.0,74.0,...|    1.0|
|[10.0,139.0,80.0,...|    0.0|
|[1.0,189.0,60.0,2...|    1.0|
|[5.0,166.0,72.0,1...|    1.0|
|[7.0,100.0,0.0,0....|    1.0|
|[0.0,118.0,84.0,4...|    1.0|
|[7.0,107.0,74.0,0...|    1.0|
|[1.0,103.0,30.0,3...|    0.0|
|[1.0,115.0,70.0,3...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [42]:
train_df,test_df=data.randomSplit([0.7,0.3])
test_df.show()

+--------------------+-------+
|     Diabetes Status|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[2.0...|    0.0|
|(8,[0,1,6,7],[6.0...|    0.0|
|(8,[1,5,6,7],[99....|    0.0|
|(8,[1,5,6,7],[117...|    0.0|
|(8,[1,5,6,7],[131...|    1.0|
|(8,[1,6,7],[94.0,...|    0.0|
|[0.0,78.0,88.0,29...|    0.0|
|[0.0,86.0,68.0,32...|    0.0|
|[0.0,95.0,64.0,39...|    0.0|
|[0.0,95.0,80.0,45...|    0.0|
|[0.0,95.0,85.0,25...|    1.0|
|[0.0,100.0,88.0,6...|    0.0|
|[0.0,101.0,62.0,0...|    0.0|
|[0.0,101.0,64.0,1...|    0.0|
|[0.0,101.0,65.0,2...|    0.0|
|[0.0,104.0,64.0,2...|    0.0|
|[0.0,104.0,76.0,0...|    0.0|
|[0.0,105.0,84.0,0...|    1.0|
|[0.0,106.0,70.0,3...|    0.0|
|[0.0,109.0,88.0,3...|    1.0|
+--------------------+-------+
only showing top 20 rows



In [43]:
linreg=LinearRegression(featuresCol='Diabetes Status',labelCol='Outcome')
linear_model=linreg.fit(train_df)

22/12/30 16:00:42 WARN Instrumentation: [543173b8] regParam is zero, which might cause numerical instability and overfitting.


In [44]:
print(f'{linear_model.coefficients}  {linear_model.intercept}')


[0.024868528046727647,0.005998192460555586,-0.002570783342514894,-0.0009172685330821441,-0.00015405663443399193,0.01533065518347958,0.1446635818744563,0.0031260796684757573]  -0.9081425549965205


In [45]:
trainSummary=linear_model.summary
print(f'RMSE: {trainSummary.rootMeanSquaredError}, r2 Score: {trainSummary.r2}')

RMSE: 0.39388397146855014, r2 Score: 0.3372073544622778


In [46]:
tested_result=linear_model.transform(test_df)
tested_result.select('prediction','Diabetes Status','Outcome').show()

+--------------------+--------------------+-------+
|          prediction|     Diabetes Status|Outcome|
+--------------------+--------------------+-------+
|-0.24493193170268035|(8,[0,1,6,7],[2.0...|    0.0|
|0.033482041503750115|(8,[0,1,6,7],[6.0...|    0.0|
| 0.17431851583002822|(8,[1,5,6,7],[99....|    0.0|
|     0.5841960574234|(8,[1,5,6,7],[117...|    0.0|
|  0.6602422129974856|(8,[1,5,6,7],[131...|    1.0|
|-0.22912659327352602|(8,[1,6,7],[94.0,...|    0.0|
|-0.00514266492053...|[0.0,78.0,88.0,29...|    0.0|
|  0.0649555051251216|[0.0,86.0,68.0,32...|    0.0|
| 0.25067399640965116|[0.0,95.0,64.0,39...|    0.0|
| 0.08915873649136663|[0.0,95.0,80.0,45...|    0.0|
| 0.09881573561757806|[0.0,95.0,85.0,25...|    1.0|
|  0.3470149033150227|[0.0,100.0,88.0,6...|    0.0|
| 7.86613520084245E-4|[0.0,101.0,62.0,0...|    0.0|
|-0.05840216035399...|[0.0,101.0,64.0,1...|    0.0|
|-0.01491640719757...|[0.0,101.0,65.0,2...|    0.0|
| 0.07594088152354328|[0.0,104.0,64.0,2...|    0.0|
|-0.02902766

In [47]:
from pyspark.ml.evaluation import RegressionEvaluator
pred_evalutor=RegressionEvaluator(predictionCol='prediction',labelCol='Outcome',metricName='r2')
pred_evalutor.evaluate(tested_result)

0.18900805159447887

In [48]:
randomForest=RandomForestRegressor(featuresCol='Diabetes Status',labelCol='Outcome')

In [50]:
ranfor_model=randomForest.fit(train_df)
datafromranfor=ranfor_model.transform(test_df)
datafromranfor.show()



+--------------------+-------+--------------------+
|     Diabetes Status|Outcome|          prediction|
+--------------------+-------+--------------------+
|(8,[0,1,6,7],[2.0...|    0.0|0.010514132553606236|
|(8,[0,1,6,7],[6.0...|    0.0| 0.12291251548552058|
|(8,[1,5,6,7],[99....|    0.0|0.060514132553606235|
|(8,[1,5,6,7],[117...|    0.0|  0.6753664100160112|
|(8,[1,5,6,7],[131...|    1.0|  0.6566529702978386|
|(8,[1,6,7],[94.0,...|    0.0|0.010514132553606236|
|[0.0,78.0,88.0,29...|    0.0| 0.08269852182195352|
|[0.0,86.0,68.0,32...|    0.0| 0.09786809202762711|
|[0.0,95.0,64.0,39...|    0.0|  0.0939760102907178|
|[0.0,95.0,80.0,45...|    0.0| 0.13525445566399075|
|[0.0,95.0,85.0,25...|    1.0| 0.07629908440985789|
|[0.0,100.0,88.0,6...|    0.0|  0.2333441712969023|
|[0.0,101.0,62.0,0...|    0.0| 0.01780369484316853|
|[0.0,101.0,64.0,1...|    0.0| 0.01780369484316853|
|[0.0,101.0,65.0,2...|    0.0| 0.01780369484316853|
|[0.0,104.0,64.0,2...|    0.0| 0.03542777536968809|
|[0.0,104.0,

In [52]:
pred_evalutor=RegressionEvaluator(predictionCol='prediction',labelCol='Outcome',metricName='r2')
pred_evalutor.evaluate(datafromranfor)

0.12732356809767154