# Plantilla de Pre Procesado - Datos Categóricos


# Cómo importar las librerías


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importar el data set


In [3]:
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

In [4]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


# Codificar datos categóricos

In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

Convirtamos las categorías de **Country** en números

In [6]:
labelencoder_X = LabelEncoder() # Creación del objeto
X[:, 0] = labelencoder_X.fit_transform(X[:, 0]) # Transformación de la columna
print(X)

[[0 44.0 72000.0]
 [2 27.0 48000.0]
 [1 30.0 54000.0]
 [2 38.0 61000.0]
 [1 40.0 nan]
 [0 35.0 58000.0]
 [2 nan 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]]


Como la variable **Country** es categórico no ordinal, tenemos que crear una columna  por categoría:

In [8]:
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],    # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                         # Leave the rest of the columns untouched
) # Creación del objeto

#onehotencoder = OneHotEncoder(categorical_features=[0])
#X = onehotencoder.fit_transform(X).toarray()

En general [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html) puede transformar de distintas maneras una columna, como normalizarla.

In [9]:
X = np.array(ct.fit_transform(X), dtype=np.float)
print(X)
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
print(y)

[[1.0e+00 0.0e+00 0.0e+00 4.4e+01 7.2e+04]
 [0.0e+00 0.0e+00 1.0e+00 2.7e+01 4.8e+04]
 [0.0e+00 1.0e+00 0.0e+00 3.0e+01 5.4e+04]
 [0.0e+00 0.0e+00 1.0e+00 3.8e+01 6.1e+04]
 [0.0e+00 1.0e+00 0.0e+00 4.0e+01     nan]
 [1.0e+00 0.0e+00 0.0e+00 3.5e+01 5.8e+04]
 [0.0e+00 0.0e+00 1.0e+00     nan 5.2e+04]
 [1.0e+00 0.0e+00 0.0e+00 4.8e+01 7.9e+04]
 [0.0e+00 1.0e+00 0.0e+00 5.0e+01 8.3e+04]
 [1.0e+00 0.0e+00 0.0e+00 3.7e+01 6.7e+04]]
[0 1 0 0 1 1 0 1 0 1]


En el caso de la variable ```y```, basta una columna para indicar si es *Sí* o *No*.

In [10]:
print(X.shape)

(10, 5)


In [11]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [13]:
result = pd.DataFrame({'France': X[:, 0], 'Spain': X[:, 1],'Germany': X[:, 2], 'Age': X[:, 3],'Salary': X[:, 4],'Purchased': y[:]})
display(result)

Unnamed: 0,France,Spain,Germany,Age,Salary,Purchased
0,1.0,0.0,0.0,44.0,72000.0,0
1,0.0,0.0,1.0,27.0,48000.0,1
2,0.0,1.0,0.0,30.0,54000.0,0
3,0.0,0.0,1.0,38.0,61000.0,0
4,0.0,1.0,0.0,40.0,,1
5,1.0,0.0,0.0,35.0,58000.0,1
6,0.0,0.0,1.0,,52000.0,0
7,1.0,0.0,0.0,48.0,79000.0,1
8,0.0,1.0,0.0,50.0,83000.0,0
9,1.0,0.0,0.0,37.0,67000.0,1
