In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("TesteNotebook").getOrCreate()

df = spark.createDataFrame([
    (1, "Teste"),
    (2, "Spark")
], ["id", "nome"])

df.show()


+---+-----+
| id| nome|
+---+-----+
|  1|Teste|
|  2|Spark|
+---+-----+



In [5]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler

In [6]:
carros_temp = spark.read.csv('/home/robson/curso_spark_pyspark_udemy/Carros.csv', 
                             inferSchema=True, 
                             header=True,
                             sep=';')


                                                                                

In [7]:
carros_temp.show()

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
|    181|        6|        225|            276| 346| 2022|        1|          0|      3|          1|105|
|    143|        8|        360|            321| 357| 15

In [8]:
carros = carros_temp.select("Consumo", "Cilindros", "Cilindradas", "HP")
carros.show()

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
|    181|        6|        225|105|
|    143|        8|        360|245|
|    244|        4|       1467| 62|
|    228|        4|       1408| 95|
|    192|        6|       1676|123|
|    178|        6|       1676|123|
|    164|        8|       2758|180|
|    173|        8|       2758|180|
|    152|        8|       2758|180|
|    104|        8|        472|205|
|    104|        8|        460|215|
|    147|        8|        440|230|
|    324|        4|        787| 66|
|    304|        4|        757| 52|
|    339|        4|        711| 65|
+-------+---------+-----------+---+
only showing top 20 rows


In [10]:
vec_caracteristicas = VectorAssembler(
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="caracteristicas"
)

In [11]:
carros = vec_caracteristicas.transform(carros)

In [None]:
carros.show()

                                                                                

+-------+---------+-----------+---+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|
+-------+---------+-----------+---+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|     21|        6|        160|110|  [21.0,6.0,160.0]|
|    228|        4|        108| 93| [228.0,4.0,108.0]|
|    214|        6|        258|110| [214.0,6.0,258.0]|
|    187|        8|        360|175| [187.0,8.0,360.0]|
|    181|        6|        225|105| [181.0,6.0,225.0]|
|    143|        8|        360|245| [143.0,8.0,360.0]|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|
|    178|        6|       1676|123|[178.0,6.0,1676.0]|
|    164|        8|       2758|180|[164.0,8.0,2758.0]|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|
|    152|        8|       2758|180|[152.0,8.0,2758.0]|
|    104|        8|        472|205| [104.0,8.0,472.0]|
|    104| 

In [15]:
carros_treino, carros_teste = carros.randomSplit([0.7, 0.3])

In [18]:
print(carros_treino.count(), carros_teste.count())

                                                                                

21 11


In [35]:
reglin = LinearRegression(
    featuresCol="caracteristicas",
    labelCol="HP",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8
)

In [36]:
modelo = reglin.fit(carros_treino)

In [37]:
previsao = modelo.transform(carros_teste)

In [38]:
previsao.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|200.74494786684545|
|     21|        6|        160|110|  [21.0,6.0,160.0]|200.74494786684545|
|    104|        8|        460|215| [104.0,8.0,460.0]| 225.9417736798748|
|    147|        8|        440|230| [147.0,8.0,440.0]|212.36124616417095|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|171.51636583344725|
|    192|        8|        400|175| [192.0,8.0,400.0]|198.41614194726964|
|    214|        4|        121|109| [214.0,4.0,121.0]| 82.92685431999386|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]|  67.4789074338943|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|   60.389409402291|
|    304|        4|        951|113| [304.0,4.0,951.0]| 42.29192178774559|
|    339|        4|        711| 65| [3

In [39]:
avaliar = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="HP",
    metricName="rmse"
)

In [40]:
rmse = avaliar.evaluate(previsao)

In [41]:
print(rmse)

48.81074385527301


In [42]:
rfreg = RandomForestRegressor(
    featuresCol="caracteristicas",
    labelCol="HP",
    numTrees=10,
    maxDepth=5,
    seed=42
)

In [43]:
modelo2 = rfreg.fit(carros_treino) 

25/07/26 17:59:07 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 21 (= number of training instances)


In [44]:
previsao2 = modelo2.transform(carros_teste)

In [45]:
previsao2.show()

[Stage 59:>                                                         (0 + 1) / 1]

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|             188.1|
|     21|        6|        160|110|  [21.0,6.0,160.0]|             188.1|
|    104|        8|        460|215| [104.0,8.0,460.0]|            188.95|
|    147|        8|        440|230| [147.0,8.0,440.0]|185.11666666666665|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|            179.45|
|    192|        8|        400|175| [192.0,8.0,400.0]|            171.26|
|    214|        4|        121|109| [214.0,4.0,121.0]|             99.25|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]|              84.5|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|              77.1|
|    304|        4|        951|113| [304.0,4.0,951.0]|             64.19|
|    339|        4|        711| 65| [3

                                                                                

In [46]:
previsao.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|200.74494786684545|
|     21|        6|        160|110|  [21.0,6.0,160.0]|200.74494786684545|
|    104|        8|        460|215| [104.0,8.0,460.0]| 225.9417736798748|
|    147|        8|        440|230| [147.0,8.0,440.0]|212.36124616417095|
|    173|        8|       2758|180|[173.0,8.0,2758.0]|171.51636583344725|
|    192|        8|        400|175| [192.0,8.0,400.0]|198.41614194726964|
|    214|        4|        121|109| [214.0,4.0,121.0]| 82.92685431999386|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]|  67.4789074338943|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]|   60.389409402291|
|    304|        4|        951|113| [304.0,4.0,951.0]| 42.29192178774559|
|    339|        4|        711| 65| [3

In [47]:
rmse2 = avaliar.evaluate(previsao2)

                                                                                

In [49]:
print(rmse2)
print(rmse)

40.30685584714874
48.81074385527301
