# Regression

In [1]:
# Bootstrap Spark for this notebook.
# findspark.init() is called before any pyspark import so that the pyspark
# package can be located; keep this ordering.
import findspark
findspark.init()
from pyspark import SparkContext
# Reuse an already-running SparkContext if there is one, otherwise start one.
sc = SparkContext.getOrCreate()

import pyspark
from pyspark.sql import SparkSession
# Same get-or-create pattern for the SparkSession used by the DataFrame API below.
spark = SparkSession.builder.getOrCreate() 
# Last expression: display the session object as the cell output.
spark

In [2]:
# Data source (UCI Machine Learning Repository, Combined Cycle Power Plant dataset):
# http://archive.ics.uci.edu/ml/machine-learning-databases/00294/

## Linear Regression

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the power-plant CSV: the first row is the header and column types are inferred.
pp_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("power_plant.csv")
)


In [5]:
# Peek at the first four rows, one field per line.
pp_df.show(n=4, vertical=True)

-RECORD 0------
 AT  | 14.96   
 V   | 41.76   
 AP  | 1024.07 
 RH  | 73.17   
 PE  | 463.26  
-RECORD 1------
 AT  | 25.18   
 V   | 62.96   
 AP  | 1020.04 
 RH  | 59.08   
 PE  | 444.37  
-RECORD 2------
 AT  | 5.11    
 V   | 39.4    
 AP  | 1012.16 
 RH  | 92.14   
 PE  | 488.56  
-RECORD 3------
 AT  | 20.86   
 V   | 57.32   
 AP  | 1010.24 
 RH  | 76.64   
 PE  | 446.48  
only showing top 4 rows



In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
# Combine every column except PE (used as the label later) into a single
# vector column named "features".
vectorAssembler = VectorAssembler(
    inputCols=["AT", "V", "AP", "RH"],
    outputCol="features",
)

In [8]:
# Append the assembled "features" column to the raw frame.
vpp_df = vectorAssembler.transform(dataset=pp_df)

In [9]:
# Show one row to confirm the new "features" column is present.
vpp_df.show(n=1, vertical=True)

-RECORD 0------------------------
 AT       | 14.96                
 V        | 41.76                
 AP       | 1024.07              
 RH       | 73.17                
 PE       | 463.26               
 features | [14.96,41.76,1024... 
only showing top 1 row



In [13]:
# 60/40 train/test split; the fixed seed keeps the split repeatable across runs.
train_df, test_df = vpp_df.randomSplit(weights=[0.6, 0.4], seed=1)

In [14]:
# Linear regression estimator: predict PE from the assembled feature vector.
lr = LinearRegression(
    labelCol="PE",
    featuresCol="features",
)

In [16]:
# Fit the linear model on the training portion only.
lr_model = lr.fit(dataset=train_df)

In [17]:
# Fitted weights, one per assembled input column in the order AT, V, AP, RH.
lr_model.coefficients

DenseVector([-1.9845, -0.2295, 0.0677, -0.1609])

In [18]:
# Fitted intercept term of the linear model.
lr_model.intercept

449.08470685482604

In [19]:
# RMSE taken from the model's training summary, i.e. measured on train_df;
# it is not an estimate of error on the held-out test_df.
lr_model.summary.rootMeanSquaredError

4.4559089777188055

#### Saving the model

In [20]:
# Persist the fitted linear model to disk.
# A plain save() fails when the target path already exists, which breaks this
# cell on every run after the first; overwrite() keeps the notebook re-runnable.
lr_model.write().overwrite().save("lr1.model")

## Decision Tree Regression

In [21]:
from pyspark.ml.regression import DecisionTreeRegressor

In [28]:
from pyspark.ml.evaluation import RegressionEvaluator

In [25]:
# IPython help lookup: shows the DecisionTreeRegressor docstring.
# Interactive aid only; nothing below depends on this cell.
DecisionTreeRegressor?

In [26]:
# Decision-tree regressor on the same features/label as the linear model;
# the seed is fixed so repeated runs are comparable.
dtr = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="PE",
    seed=1,
)

In [27]:
# Fit the decision tree on the training portion only.
dtr_model = dtr.fit(dataset=train_df)

In [30]:
# IPython help lookup: shows the RegressionEvaluator docstring.
# Interactive aid only; nothing below depends on this cell.
RegressionEvaluator?

In [31]:
# RMSE evaluator for the decision-tree predictions.
# `predict` is not a RegressionEvaluator parameter and raises TypeError.
# The true values live in the "PE" column (labelCol), and the model's
# transform() writes its output to the default "prediction" column.
dtr_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="PE",
    predictionCol="prediction")


In [32]:
# Score the held-out rows with the fitted decision tree.
dtr_predict = dtr_model.transform(dataset=test_df)