In [0]:
# Import the Pipeline Module
from pyspark.ml import Pipeline
# Module that contains the Linear Regression and Random Forest algorithms
from pyspark.ml.regression import LinearRegression , RandomForestRegressor
# Module with the Evaluator
from pyspark.ml.evaluation import RegressionEvaluator 
# Feature module: VectorAssembler merges several columns into one feature vector (it is not a one-hot encoder)
from pyspark.ml.feature import VectorAssembler

In [0]:
# Read the cars CSV (first row is the header, column types are inferred)
# and preview the first rows.
cars_temp = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("/learn/Cars_Machine")
)
cars_temp.show()

In [0]:
# Shorter preview: only the first five rows of the raw data
cars_temp.show(n=5)

In [0]:
# Keep only the four columns used below and preview ten rows
cars = cars_temp.select(["Consumo", "Cilindros", "Cilindradas", "HP"])
cars.show(n=10)

In [0]:
# Build the assembler that packs the predictor columns into a single
# vector column named "characteristics" (this is feature assembly, not
# one-hot encoding).
vec_characteristics = VectorAssembler(
    outputCol="characteristics",
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
)

In [0]:
# Apply the assembler: the result carries the extra "characteristics" column
Cars = vec_characteristics.transform(dataset=cars)

In [0]:
# Preview seven rows of the assembled data
Cars.show(n=7)

In [0]:
# Split the assembled data: 75% for training, 25% for testing.
# The weights are positional, so the first DataFrame returned is the 75% part
# and must be bound to the training name (the names were previously swapped,
# which trained every model on the small 25% part).
# A fixed seed is passed so the split can be reproduced on a re-run.
cars_train, cars_test = Cars.randomSplit([0.75, 0.25], seed=42)

In [0]:
# Show the size of each split: test row count, a blank line, train row count
print(cars_test.count(), cars_train.count(), sep="\n\n")

In [0]:
# Linear regression estimator: predicts HP from the "characteristics" vector
lin_reg = LinearRegression(
    labelCol="HP",
    featuresCol="characteristics",
)

In [0]:
# Train the linear regression on the training split
model = lin_reg.fit(dataset=cars_train)

In [0]:
# Score the test split with the fitted linear model and preview eight rows
# of the result
predict = model.transform(dataset=cars_test)
predict.show(n=8)

In [0]:
# Evaluator that compares the real HP value with the "prediction" column
# using the root-mean-squared error (RMSE)
evaluate = RegressionEvaluator(
    metricName="rmse",
    labelCol="HP",
    predictionCol="prediction",
)

In [0]:
# RMSE of the linear regression predictions
rmse = evaluate.evaluate(dataset=predict)
print(rmse)

In [0]:
# Chain the feature assembler and the linear regression into one pipeline
pipelines = Pipeline(
    stages=[
        vec_characteristics,
        lin_reg,
    ]
)

In [0]:
# Fitting the pipeline model
model_pipeline = pipelines.fit(cars)

In [0]:
# Consulting the Pred Result
predict_pipeline = model_pipeline.transform(cars)
predict_pipeline.show()

In [0]:
# RMSE of the pipeline predictions, for comparison with the other models
rmse_pipeline = evaluate.evaluate(dataset=predict_pipeline)
print(rmse_pipeline)

In [0]:
# Random forest estimator: predicts HP from the "characteristics" vector.
# The algorithm is randomized, so an explicit seed is set to make the
# trained model (and its RMSE) reproducible on a re-run.
forest = RandomForestRegressor(
    featuresCol="characteristics",
    labelCol="HP",
    seed=42,
)

In [0]:
# Train the random forest on the training split
model_forest = forest.fit(dataset=cars_train)

In [0]:
# Score the test split with the fitted random forest and display the result
predict_forest = model_forest.transform(dataset=cars_test)
predict_forest.show()

In [0]:
# RMSE of the random forest predictions
rmse_forest = evaluate.evaluate(dataset=predict_forest)
print(rmse_forest)