# Categorical Encoding com One Hot Encoding
### Inicializando o PySpark

In [9]:
# findspark has to run BEFORE the first `import pyspark`: it is the bridge
# between the local Spark installation and the Jupyter kernel, making the
# pyspark package importable. Importing pyspark first only works when pyspark
# happens to be installed directly into the kernel's environment.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Start (or reuse, if one already exists) the Spark session for this notebook.
spark = SparkSession.builder.appName("One Hot Encoding").getOrCreate()

In [10]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

### Carregando a Base de Dados usada nesta Aula

In [11]:
# Load the Churn dataset: a semicolon-separated CSV whose first row holds the
# column names; column types are inferred from the data.
churn = (
    spark.read
    .format("csv")
    .options(sep=";", inferSchema=True, header=True)
    .load("Material_do_Curso/Churn.csv")
)

# Preview the first five rows.
churn.show(5)

+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure| Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|       0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1| 8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8| 1596608|            3|        1|             0|       11393157|     1|
|        699|   France|Female| 39|     1|       0|            2|        0|             0|        9382663|     0|
|        850|    Spain|Female| 43|     2|12551082|            1|        1|             1|         790841|     0|
+-----------+---------+------+---+------+--------+-------------+---------+--------------+---------------+------+

### Indexação das Colunas ***Geography*** e ***Gender***
Aplicando a indexação nas colunas ***Geography*** e ***Gender*** do conjunto de dados ***Churn***

In [25]:
# One StringIndexer per categorical column: each one turns the string labels of
# its input column into numeric indices stored in the given output column.
indice_onehot_geo = StringIndexer(inputCol="Geography", outputCol="Indexer_c1")
indice_onehot_gen = StringIndexer(inputCol="Gender", outputCol="Indexer_c2")

# Fit both indexers on the original data ...
modelo_indice_geo = indice_onehot_geo.fit(churn)
modelo_indice_gen = indice_onehot_gen.fit(churn)

# ... then chain the two transformations so the result carries both the
# Geography index (Indexer_c1) and the Gender index (Indexer_c2).
indice_transf = modelo_indice_gen.transform(modelo_indice_geo.transform(churn))

# Show the original Geography and Gender values next to their indices.
(
    indice_transf
    .select(["Geography", "Indexer_c1", "Gender", "Indexer_c2"])
    .show(10)
)

+---------+----------+------+----------+
|Geography|Indexer_c1|Gender|Indexer_c2|
+---------+----------+------+----------+
|   France|       0.0|Female|       1.0|
|    Spain|       2.0|Female|       1.0|
|   France|       0.0|Female|       1.0|
|   France|       0.0|Female|       1.0|
|    Spain|       2.0|Female|       1.0|
|    Spain|       2.0|  Male|       0.0|
|   France|       0.0|  Male|       0.0|
|  Germany|       1.0|Female|       1.0|
|   France|       0.0|  Male|       0.0|
|   France|       0.0|  Male|       0.0|
+---------+----------+------+----------+
only showing top 10 rows



# Aplicando o One Hot Encoding

In [28]:
# A single OneHotEncoder handles both index columns at once, writing one
# encoded vector column per input column.
onehot = OneHotEncoder(
    inputCols=["Indexer_c1", "Indexer_c2"],
    outputCols=["onehot_c1", "onehot_c2"],
)

# Fit the encoder on the indexed data, then apply the fitted model to it.
modelo = onehot.fit(indice_transf)
onehot_out = modelo.transform(indice_transf)

# Display each index beside its encoded vector, without truncating the cells.
# Note: in the displayed output the highest index of each column appears as an
# empty sparse vector, e.g. (2,[],[]).
(
    onehot_out
    .select(["Indexer_c1", "onehot_c1", "Indexer_c2", "onehot_c2"])
    .show(truncate=False)
)

+----------+-------------+----------+-------------+
|Indexer_c1|onehot_c1    |Indexer_c2|onehot_c2    |
+----------+-------------+----------+-------------+
|0.0       |(2,[0],[1.0])|1.0       |(1,[],[])    |
|2.0       |(2,[],[])    |1.0       |(1,[],[])    |
|0.0       |(2,[0],[1.0])|1.0       |(1,[],[])    |
|0.0       |(2,[0],[1.0])|1.0       |(1,[],[])    |
|2.0       |(2,[],[])    |1.0       |(1,[],[])    |
|2.0       |(2,[],[])    |0.0       |(1,[0],[1.0])|
|0.0       |(2,[0],[1.0])|0.0       |(1,[0],[1.0])|
|1.0       |(2,[1],[1.0])|1.0       |(1,[],[])    |
|0.0       |(2,[0],[1.0])|0.0       |(1,[0],[1.0])|
|0.0       |(2,[0],[1.0])|0.0       |(1,[0],[1.0])|
|0.0       |(2,[0],[1.0])|0.0       |(1,[0],[1.0])|
|2.0       |(2,[],[])    |0.0       |(1,[0],[1.0])|
|0.0       |(2,[0],[1.0])|1.0       |(1,[],[])    |
|0.0       |(2,[0],[1.0])|1.0       |(1,[],[])    |
|2.0       |(2,[],[])    |1.0       |(1,[],[])    |
|1.0       |(2,[1],[1.0])|0.0       |(1,[0],[1.0])|
|1.0       |

In [None]:
#