#  Divisor de Vetores com VectorSlicer
### Inicializando o PySpark

In [13]:
import findspark

# findspark.init() adds the local Spark installation to sys.path, so it must
# run BEFORE the first `import pyspark`. Calling it afterwards is either too
# late (the import already failed) or has no effect.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName("VectorSlicer").getOrCreate()

### Carregando a Base de Dados usada nesta Aula

In [14]:
# Read the semicolon-separated CSV, taking column names from the header row and
# letting Spark infer each column's type from the data.
# NOTE(review): the preview shows values of very different magnitude within a
# single column (e.g. Peso 262 vs 2875) - decimal separators may have been lost
# in the CSV; confirm against the source data.
carros = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema=True, header=True)
    .load("Material_do_Curso/Carros.csv")
)
carros.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



### Aplicando o RFormula ao Conjunto de Dados Carros

In [15]:
from pyspark.ml.feature import RFormula

In [16]:
# Instanciando um Objeto RFormula e aplicando a transformação dos dados
# "HP ~ ." makes HP the label and packs every other column into one feature
# vector. The vector is written to "Independente" and the label to "Dependente".
rformula = RFormula(
    formula="HP ~ .",
    featuresCol="Independente",
    labelCol="Dependente",
)

# Fit the formula on the data, then apply the fitted model to the same frame.
modelo_rf = rformula.fit(carros)
carros_rf = modelo_rf.transform(carros)

# Show only the two generated columns, without truncating the vectors.
carros_rf.select(["Independente", "Dependente"]).show(truncate=False)

+-----------------------------------------------------+----------+
|Independente                                         |Dependente|
+-----------------------------------------------------+----------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |110.0     |
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |110.0     |
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |93.0      |
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|110.0     |
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |175.0     |
|[181.0,6.0,225.0,276.0,346.0,2022.0,1.0,0.0,3.0,1.0] |105.0     |
|[143.0,8.0,360.0,321.0,357.0,1584.0,0.0,0.0,3.0,4.0] |245.0     |
|[244.0,4.0,1467.0,369.0,319.0,20.0,1.0,0.0,4.0,2.0]  |62.0      |
|[228.0,4.0,1408.0,392.0,315.0,229.0,1.0,0.0,4.0,2.0] |95.0      |
|[192.0,6.0,1676.0,392.0,344.0,183.0,1.0,0.0,4.0,4.0] |123.0     |
|[178.0,6.0,1676.0,392.0,344.0,189.0,1.0,0.0,4.0,4.0] |123.0     |
|[164.0,8.0,2758.0,307.0,407.0,174.0,0.0,0.0,3.0,3.0] |180.0  

# Fatiando DataFrame Spark com VectorSlicer

O `VectorSlicer` recebe uma coluna com um vetor de atributos;

Cria uma nova coluna contendo apenas os atributos selecionados pelos índices informados (a contagem dos índices começa em 0).
    


In [17]:
from pyspark.ml.feature import VectorSlicer

In [18]:
# Keep only positions 1, 2 and 6 (zero-based) of the "Independente" vector.
# Going by the column order shown when the data was loaded, these are
# Cilindros, Cilindradas and TipoMotor.
fatia = VectorSlicer(
    inputCol="Independente",
    outputCol="Caracteristicas",
    indices=[1, 2, 6],
)

# VectorSlicer is a plain transformer, so no fit step is needed.
carros_fat = fatia.transform(carros_rf)

# Compare the full vector with its slice on the first five rows.
carros_fat.select(["Independente", "Caracteristicas"]).show(n=5, truncate=False)

+-----------------------------------------------------+---------------+
|Independente                                         |Caracteristicas|
+-----------------------------------------------------+---------------+
|[21.0,6.0,160.0,39.0,262.0,1646.0,0.0,1.0,4.0,4.0]   |[6.0,160.0,0.0]|
|[21.0,6.0,160.0,39.0,2875.0,1702.0,0.0,1.0,4.0,4.0]  |[6.0,160.0,0.0]|
|[228.0,4.0,108.0,385.0,232.0,1861.0,1.0,1.0,4.0,1.0] |[4.0,108.0,1.0]|
|[214.0,6.0,258.0,308.0,3215.0,1944.0,1.0,0.0,3.0,1.0]|[6.0,258.0,1.0]|
|[187.0,8.0,360.0,315.0,344.0,1702.0,0.0,0.0,3.0,2.0] |[8.0,360.0,0.0]|
+-----------------------------------------------------+---------------+
only showing top 5 rows

