In [0]:
from sklearn.datasets import load_diabetes

In [0]:
from pyspark.ml.regression import RandomForestRegressor

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

In [0]:
import pandas as pd

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
data = load_diabetes()

In [0]:
data

Out[8]: {'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
          0.01990842, -0.01764613],
        [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
         -0.06832974, -0.09220405],
        [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
          0.00286377, -0.02593034],
        ...,
        [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
         -0.04687948,  0.01549073],
        [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
          0.04452837, -0.02593034],
        [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
         -0.00421986,  0.00306441]]),
 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
         69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
         68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
         87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
        259.,  53., 190., 142.,  75., 142., 155., 2

In [0]:
df = pd.DataFrame(data['data'],columns=data['feature_names'])
df['target'] = data['target']

In [0]:
df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


In [0]:
spark.createDataFrame(df)

Out[11]: DataFrame[age: double, sex: double, bmi: double, bp: double, s1: double, s2: double, s3: double, s4: double, s5: double, s6: double, target: double]

In [0]:
train,test = spark.createDataFrame(df).randomSplit([0.8,0.2],seed=0)

In [0]:
feature_names = data['feature_names']

In [0]:
v = VectorAssembler(inputCols = feature_names, outputCol = 'features')

In [0]:
rf = RandomForestRegressor(featuresCol = 'features',labelCol='target')

In [0]:
pipe = Pipeline(stages = [v,rf])

In [0]:
model = pipe.fit(train)

In [0]:
df2 = model.transform(test)

In [0]:
display(df2)

age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target,features,prediction
-0.0382074010379866,-0.044641636506989,-0.0105172024313319,-0.0366564467985606,-0.0373437341334407,-0.0194764882100115,-0.0286742944356786,-0.0025922619981828,-0.0181182673078967,-0.0176461251598052,97.0,"Map(vectorType -> dense, length -> 10, values -> List(-0.0382074010379866, -0.044641636506989, -0.0105172024313319, -0.0366564467985606, -0.0373437341334407, -0.0194764882100115, -0.0286742944356786, -0.00259226199818282, -0.0181182673078967, -0.0176461251598052))",118.84087384480152
-0.0273097856849279,-0.044641636506989,-0.0180618869484982,-0.0400993174922969,-0.0029449126784124,-0.0113346282034837,0.0375951860378887,-0.0394933828740919,-0.0089440189577978,-0.0549250873933176,168.0,"Map(vectorType -> dense, length -> 10, values -> List(-0.0273097856849279, -0.044641636506989, -0.0180618869484982, -0.0400993174922969, -0.00294491267841247, -0.0113346282034837, 0.0375951860378887, -0.0394933828740919, -0.0089440189577978, -0.0549250873933176))",89.84864858007236
-0.001882016527791,0.0506801187398187,0.0142724752679289,-0.0745280244296595,0.002558898754392,0.0062016856567301,-0.0139477432193303,-0.0025922619981828,0.0191990330785671,0.0030644094143683,90.0,"Map(vectorType -> dense, length -> 10, values -> List(-0.00188201652779104, 0.0506801187398187, 0.0142724752679289, -0.0745280244296595, 0.00255889875439205, 0.00620168565673016, -0.0139477432193303, -0.00259226199818282, 0.0191990330785671, 0.00306440941436832))",192.0817040058396
-0.001882016527791,0.0506801187398187,0.0713965151836166,0.0976155102571536,0.0878679759628621,0.0754074957122168,-0.0213110188275045,0.0712099797536354,0.0714240327805764,0.0237749439885419,252.0,"Map(vectorType -> dense, length -> 10, values -> List(-0.00188201652779104, 0.0506801187398187, 0.0713965151836166, 0.0976155102571536, 0.0878679759628621, 0.0754074957122168, -0.0213110188275045, 0.0712099797536354, 0.0714240327805764, 0.0237749439885419))",240.4021103843801
0.005383060374248,-0.044641636506989,-0.0363846922044735,0.0218723549949558,0.0039348516125931,0.0155961395104161,0.0081420836051921,-0.0025922619981828,-0.0319914449413559,-0.0466408735636482,135.0,"Map(vectorType -> dense, length -> 10, values -> List(0.00538306037424807, -0.044641636506989, -0.0363846922044735, 0.0218723549949558, 0.00393485161259318, 0.0155961395104161, 0.0081420836051921, -0.00259226199818282, -0.0319914449413559, -0.0466408735636482))",102.28244762134132
0.0199132141783263,-0.044641636506989,-0.0234509473179027,-0.0710851537359232,0.0204462859110067,-0.0100820343563255,0.118591217727804,-0.076394503750001,-0.0425721049227942,0.0734802269665584,92.0,"Map(vectorType -> dense, length -> 10, values -> List(0.0199132141783263, -0.044641636506989, -0.0234509473179027, -0.0710851537359232, 0.0204462859110067, -0.0100820343563255, 0.118591217727804, -0.076394503750001, -0.0425721049227942, 0.0734802269665584))",88.60396030359682
0.0271782910803654,0.0506801187398187,-0.0353068801305926,0.0322009670761646,-0.0112006298276192,0.0015044587298871,-0.0102661054152432,-0.0025922619981828,-0.0149564750249113,-0.0507829804784829,53.0,"Map(vectorType -> dense, length -> 10, values -> List(0.0271782910803654, 0.0506801187398187, -0.0353068801305926, 0.0322009670761646, -0.0112006298276192, 0.00150445872988718, -0.0102661054152432, -0.00259226199818282, -0.0149564750249113, -0.0507829804784829))",102.78819839834208
0.030810829531385,-0.044641636506989,-0.0503962491649252,-0.0022277398611979,-0.0442234984244464,-0.0899348921126563,0.118591217727804,-0.076394503750001,-0.0181182673078967,0.0030644094143683,87.0,"Map(vectorType -> dense, length -> 10, values -> List(0.030810829531385, -0.044641636506989, -0.0503962491649252, -0.00222773986119799, -0.0442234984244464, -0.0899348921126563, 0.118591217727804, -0.076394503750001, -0.0181182673078967, 0.00306440941436832))",106.37009667796627
0.0598711371395414,0.0506801187398187,0.0164280994156907,0.0287580963824284,-0.0414715927080441,-0.029184090525487,-0.0286742944356786,-0.0025922619981828,-0.0023966814934142,-0.0217882320746399,225.0,"Map(vectorType -> dense, length -> 10, values -> List(0.0598711371395414, 0.0506801187398187, 0.0164280994156907, 0.0287580963824284, -0.0414715927080441, -0.029184090525487, -0.0286742944356786, -0.00259226199818282, -0.00239668149341427, -0.0217882320746399))",175.39099937909702
0.0707687524926,0.0506801187398187,0.0121168511201671,0.0563010619323185,0.034205814493018,0.0494161733836856,-0.0397192078479398,0.0343088588777263,0.027367707542609,-0.0010776975004663,144.0,"Map(vectorType -> dense, length -> 10, values -> List(0.0707687524926, 0.0506801187398187, 0.0121168511201671, 0.0563010619323185, 0.034205814493018, 0.0494161733836856, -0.0397192078479398, 0.0343088588777263, 0.027367707542609, -0.00107769750046639))",214.499560588545


In [0]:
evaluator = RegressionEvaluator(labelCol="target",predictionCol="prediction",metricName="rmse")

In [0]:
evaluator.evaluate(df2)

Out[23]: 60.80546032453944