# Importar bibliotecas

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Importar Conjunto de Datos

In [2]:
# Load the raw dataset from the relative path 'datos.csv'.
df = pd.read_csv(filepath_or_buffer='datos.csv')

In [3]:
# Display the loaded DataFrame for a quick visual check of its contents.
df

Unnamed: 0,Pais,fecha,Salario,Seleccionado
0,Francia,44.0,72000.0,No
1,Mexico,27.0,48000.0,Si
2,Alemania,30.0,54000.0,No
3,Mexico,38.0,61000.0,No
4,Alemania,40.0,,Si
5,Francia,35.0,58000.0,Si
6,Mexico,,52000.0,No
7,Francia,48.0,79000.0,Si
8,Alemania,50.0,83000.0,No
9,Francia,37.0,67000.0,Si


# Asignar las variables

In [4]:
# Feature matrix: every column except the last one, converted to a NumPy array.
X = df.iloc[:, :-1].to_numpy()

In [5]:
# Inspect the feature matrix right after extraction (before imputation).
X

array([['Francia', 44.0, 72000.0],
       ['Mexico', 27.0, 48000.0],
       ['Alemania', 30.0, 54000.0],
       ['Mexico', 38.0, 61000.0],
       ['Alemania', 40.0, nan],
       ['Francia', 35.0, 58000.0],
       ['Mexico', nan, 52000.0],
       ['Francia', 48.0, 79000.0],
       ['Alemania', 50.0, 83000.0],
       ['Francia', 37.0, 67000.0]], dtype=object)

In [6]:
# Target vector: the last column of the DataFrame.
# Selecting it relative to the end (-1) mirrors the `:-1` slice used for X, so
# features and target stay complementary even if the file's column count changes.
y = df.iloc[:, -1].values

In [7]:
# Inspect the target vector (still the original string labels at this point).
y

array(['No', 'Si', 'No', 'No', 'Si', 'Si', 'No', 'Si', 'No', 'Si'],
      dtype=object)

# Imputación

In [8]:
# Imputer that replaces missing entries with the mean of their column.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
inputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [9]:
# Learn the means of the two numeric columns (indices 1 and 2) and write the
# imputed values back into X in a single step.
X[:, 1:3] = inputer.fit_transform(X[:, 1:3])

In [10]:
# Inspect X after imputation: the former NaN entries now hold column means.
X

array([['Francia', 44.0, 72000.0],
       ['Mexico', 27.0, 48000.0],
       ['Alemania', 30.0, 54000.0],
       ['Mexico', 38.0, 61000.0],
       ['Alemania', 40.0, 63777.77777777778],
       ['Francia', 35.0, 58000.0],
       ['Mexico', 38.77777777777778, 52000.0],
       ['Francia', 48.0, 79000.0],
       ['Alemania', 50.0, 83000.0],
       ['Francia', 37.0, 67000.0]], dtype=object)

# Datos categóricos

In [11]:
# Encoder that maps each distinct label to an integer code.
etiqueta = LabelEncoder()

In [12]:
# Replace the country names in column 0 with their integer codes.
codigos_pais = etiqueta.fit_transform(X[:, 0])
X[:, 0] = codigos_pais

In [13]:
# Inspect X after label encoding: column 0 now holds integer codes.
X

array([[1, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [0, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [0, 40.0, 63777.77777777778],
       [1, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [1, 48.0, 79000.0],
       [0, 50.0, 83000.0],
       [1, 37.0, 67000.0]], dtype=object)

In [14]:
# One-hot encode column 0 and pass every other column through unchanged.
ls = ColumnTransformer(
    transformers=[('State', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough',
)


In [15]:
# Apply the one-hot encoding. X is overwritten with the transformed array, so
# re-running this cell alone would feed the already encoded X back in.
X = ls.fit_transform(X)

In [16]:
# Inspect X after one-hot encoding: the encoded columns come first, followed
# by the passthrough numeric columns.
X

array([[0.0, 1.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [1.0, 0.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [1.0, 0.0, 0.0, 40.0, 63777.77777777778],
       [0.0, 1.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [0.0, 1.0, 0.0, 48.0, 79000.0],
       [1.0, 0.0, 0.0, 50.0, 83000.0],
       [0.0, 1.0, 0.0, 37.0, 67000.0]], dtype=object)

In [21]:
# Encode the string target labels as integers.
# NOTE(review): this refits the same `etiqueta` instance that was fitted on the
# country column earlier, so the classes learned there are replaced by the
# target classes. Use a dedicated encoder if the country mapping is still needed.
# NOTE(review): the recorded outputs of the split cells still show 'Si'/'No'
# labels, which suggests this cell was executed after the split. Run the
# notebook top to bottom so y_train / y_test receive the encoded values.
y = etiqueta.fit_transform(y)

# Dividir el conjunto de datos

In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
particion = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = particion

In [19]:
# Show the training features together with their labels.
(X_train, y_train)

(array([[1.0, 0.0, 0.0, 40.0, 63777.77777777778],
        [0.0, 1.0, 0.0, 37.0, 67000.0],
        [0.0, 0.0, 1.0, 27.0, 48000.0],
        [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
        [0.0, 1.0, 0.0, 48.0, 79000.0],
        [0.0, 0.0, 1.0, 38.0, 61000.0],
        [0.0, 1.0, 0.0, 44.0, 72000.0],
        [0.0, 1.0, 0.0, 35.0, 58000.0]], dtype=object),
 array(['Si', 'Si', 'Si', 'No', 'Si', 'No', 'No', 'Si'], dtype=object))

In [20]:
# Show the held-out test features together with their labels.
(X_test, y_test)

(array([[1.0, 0.0, 0.0, 30.0, 54000.0],
        [1.0, 0.0, 0.0, 50.0, 83000.0]], dtype=object),
 array(['No', 'No'], dtype=object))

# Escalado de valores

In [24]:
# Standardise the features. The scaler learns its statistics from the training
# rows only and the same fitted scaler is then applied to both partitions.
# Every column is scaled, including the one-hot encoded ones.
escalado = StandardScaler().fit(X_train)
X_train = escalado.transform(X_train)
X_test = escalado.transform(X_test)

In [25]:
# Inspect the standardised training features.
X_train

array([[ 2.64575131, -1.        , -0.77459667,  0.26306757,  0.12381479],
       [-0.37796447,  1.        , -0.77459667, -0.25350148,  0.46175632],
       [-0.37796447, -1.        ,  1.29099445, -1.97539832, -1.53093341],
       [-0.37796447, -1.        ,  1.29099445,  0.05261351, -1.11141978],
       [-0.37796447,  1.        , -0.77459667,  1.64058505,  1.7202972 ],
       [-0.37796447, -1.        ,  1.29099445, -0.0813118 , -0.16751412],
       [-0.37796447,  1.        , -0.77459667,  0.95182631,  0.98614835],
       [-0.37796447,  1.        , -0.77459667, -0.59788085, -0.48214934]])

In [26]:
# Inspect the test features after applying the scaler fitted on the training set.
X_test

array([[ 2.64575131, -1.        , -0.77459667, -1.45882927, -0.90166297],
       [ 2.64575131, -1.        , -0.77459667,  1.98496442,  2.13981082]])