#  Transformação com RFormula
### Inicializando o PySpark

In [3]:
import findspark

# findspark.init() has to run BEFORE pyspark is imported: it locates the Spark
# installation and makes its Python libraries importable from this kernel.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the rest of this notebook.
spark = SparkSession.builder.appName("RFormula").getOrCreate()

### Carregando a Base de Dados usada nesta Aula

In [6]:
# Read the semicolon-separated CSV: column names come from the header row and
# column types are inferred by Spark.
# NOTE(review): the displayed values (e.g. Peso 262 vs 2875) suggest decimal
# separators may have been lost in the source file -- confirm against the CSV.
carros = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema=True, header=True)
    .load("Material_do_Curso/Carros.csv")
)
# Preview the first five rows.
carros.show(5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



### Aplicando o RFormula ao Conjunto de Dados Carros

In [7]:
from pyspark.ml.feature import RFormula

In [8]:
# Describe the transformation with an R-style formula: HP is the label, and
# Consumo, Cilindros and Cilindradas are assembled into the feature vector.
# The output columns are given custom names instead of the defaults.
rformula = RFormula(
    formula="HP ~ Consumo + Cilindros + Cilindradas",
    featuresCol="Independente",
    labelCol="Dependente",
)

# Fit the formula on the data, then apply the fitted model to the same data.
modelo_rf = rformula.fit(carros)
carros_rf = modelo_rf.transform(carros)

# Show only the two columns produced by the transformation.
carros_rf.select("Independente", "Dependente").show()

[Stage 9:>                                                          (0 + 1) / 1]

+------------------+----------+
|      Independente|Dependente|
+------------------+----------+
|  [21.0,6.0,160.0]|     110.0|
|  [21.0,6.0,160.0]|     110.0|
| [228.0,4.0,108.0]|      93.0|
| [214.0,6.0,258.0]|     110.0|
| [187.0,8.0,360.0]|     175.0|
| [181.0,6.0,225.0]|     105.0|
| [143.0,8.0,360.0]|     245.0|
|[244.0,4.0,1467.0]|      62.0|
|[228.0,4.0,1408.0]|      95.0|
|[192.0,6.0,1676.0]|     123.0|
|[178.0,6.0,1676.0]|     123.0|
|[164.0,8.0,2758.0]|     180.0|
|[173.0,8.0,2758.0]|     180.0|
|[152.0,8.0,2758.0]|     180.0|
| [104.0,8.0,472.0]|     205.0|
| [104.0,8.0,460.0]|     215.0|
| [147.0,8.0,440.0]|     230.0|
| [324.0,4.0,787.0]|      66.0|
| [304.0,4.0,757.0]|      52.0|
| [339.0,4.0,711.0]|      65.0|
+------------------+----------+
only showing top 20 rows



                                                                                