## Uso de bibliotecas

In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Ver dataset

In [5]:
#--> Importar dataset
data_original = pd.read_csv("Data.csv")
data_original


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Imputación (completar datos faltantes)

#### Imputación sin aplicar clases

In [5]:
# -------------| Opcion 1 - fit_transform |-------------
from sklearn.impute import SimpleImputer

# --> Indicaciones del imputer
imputer2 = SimpleImputer(missing_values=np.nan, strategy='mean')

# --> Entrena y transforma los datos
data_original.iloc[:, 1:3] = imputer2.fit_transform(data_original.iloc[:, 1:3])
data_original


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#### Imputación por clases

In [6]:
from sklearn.impute import SimpleImputer

# -------------| Clase 'No' |-------------
# --> Convertir dataframe a arreglo
data_no = data_original[data_original["Purchased"] == "No"].iloc[:, :].to_numpy()
# --> Especificar imputer
imputer_no = SimpleImputer(missing_values=np.nan, strategy="mean")
# --> Entrenar y remplazar datos de los rasgos
data_no[:, 1:3] = imputer_no.fit_transform(data_no[:, 1:3])

# -------------| Para clase 'YES' |-------------
data_yes = data_original[data_original["Purchased"] == "Yes"].iloc[:, :].to_numpy()
imputer_yes = SimpleImputer(missing_values=np.nan, strategy="mean")
data_yes[:, 1:3] = imputer_no.fit_transform(data_yes[:, 1:3])

# --> Unir arreglos de dataframes
nuevo_arreglo = np.concatenate((data_yes, data_no))

# --> Crear dataframe
data_imputacion = pd.DataFrame(data=nuevo_arreglo, columns=["Country", "Age", "Salary", "Purchased"])
data_imputacion

Unnamed: 0,Country,Age,Salary,Purchased
0,Spain,27.0,48000.0,Yes
1,Germany,40.0,63000.0,Yes
2,France,35.0,58000.0,Yes
3,France,48.0,79000.0,Yes
4,France,37.0,67000.0,Yes
5,France,44.0,72000.0,No
6,Germany,30.0,54000.0,No
7,Spain,38.0,61000.0,No
8,Spain,40.5,52000.0,No
9,Germany,50.0,83000.0,No


## Datos categóricos

#### En rasgos

In [6]:
# ----------------| Para solo una columna |----------------
from sklearn.preprocessing import OneHotEncoder

# --> Objeto de la clase
one_hot_encoder = OneHotEncoder()
resultado = one_hot_encoder.fit_transform(data_imputacion[["Country"]])

print(resultado.toarray())
print(one_hot_encoder.categories_)

# --> Aplicacion de resultado a dataset original
# data_imputacion[one_hot_encoder.categories_[0]] = resultado.toarray()
# print(data_imputacion)


[[0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]]
[array(['France', 'Germany', 'Spain'], dtype=object)]


In [7]:
# ----------------| Para varias columnas |----------------
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# --> Transformar varias columnas
ct = ColumnTransformer(
    # Columnas para transformar
    [('one_hot_encoder', OneHotEncoder(), [0])],
    # Lo que pasara con las demas columnas
    remainder="passthrough"
)

rasgos = ct.fit_transform(data_imputacion.iloc[:, :-1])
print(rasgos)



[[0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 40.0 63000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 40.5 52000.0]
 [0.0 1.0 0.0 50.0 83000.0]]


#### En clases

In [8]:
# --> LabelEncoder para las clases
from sklearn.preprocessing import LabelEncoder

# --> Objeto label_encoder
label_encoder = LabelEncoder()

clases = label_encoder.fit_transform(data_imputacion.iloc[:, -1])
print(clases)

[1 1 1 1 1 0 0 0 0 0]


## Dividir dataset en entrenamiento y prueba

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(rasgos, clases, test_size=0.2, random_state=0)
x_train

array([[1.0, 0.0, 0.0, 37.0, 67000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 40.0, 63000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 44.0, 72000.0]], dtype=object)

## Escalado de valores (Eliminar valores atípicos)

In [16]:
from sklearn.preprocessing import StandardScaler

# --> Objeto para estandarizar
sc_x = StandardScaler()

# --> Aplicar estandarizacion
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)




array([[ 1.29099445, -0.77459667, -0.57735027, -0.56200266, -0.70546456],
       [-0.77459667, -0.77459667,  1.73205081,  0.1652949 , -1.24296137]])