# Herramientas para preprocesar datos

## Importar las librerías

In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importar el conjunto de datos

In [9]:
# Load the raw table. Every column except the last one is kept as a feature
# (X); the last column is the target (y).
datos = pd.read_csv(filepath_or_buffer='Datos.csv')
X, y = datos.iloc[:, :-1], datos.iloc[:, -1]

In [11]:
datos.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [13]:
type(X)

pandas.core.frame.DataFrame

In [15]:
X.head(10)

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [17]:
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [19]:
type(y)

pandas.core.series.Series

In [21]:
y.head(10)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [23]:
print(y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


## Ver qué hacer con datos faltantes

In [26]:
# Fill every missing numeric entry with the mean of its own column; fillna
# aligns the Series of means with the DataFrame by column label.
X = X.fillna(value=X.mean(numeric_only=True).loc["Age":"Salary"])

In [28]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


## Codificar datos categóricos

### Codificar la(s) variable(s) independiente(s)

In [32]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


In [34]:
# One indicator (dummy) column per distinct value found in 'Country'.
comodin = pd.get_dummies(data=X['Country'])

In [36]:
print(comodin)

   France  Germany  Spain
0       1        0      0
1       0        0      1
2       0        1      0
3       0        0      1
4       0        1      0
5       1        0      0
6       0        0      1
7       1        0      0
8       0        1      0
9       1        0      0


In [38]:
# Place the dummy columns to the right of the original features (row-aligned).
concatenado = pd.concat(objs=[X, comodin], axis='columns')

In [40]:
print(concatenado)

   Country        Age        Salary  France  Germany  Spain
0   France  44.000000  72000.000000       1        0      0
1    Spain  27.000000  48000.000000       0        0      1
2  Germany  30.000000  54000.000000       0        1      0
3    Spain  38.000000  61000.000000       0        0      1
4  Germany  40.000000  63777.777778       0        1      0
5   France  35.000000  58000.000000       1        0      0
6    Spain  38.777778  52000.000000       0        0      1
7   France  48.000000  79000.000000       1        0      0
8  Germany  50.000000  83000.000000       0        1      0
9   France  37.000000  67000.000000       1        0      0


In [42]:
# The text column is now redundant: its information lives in the dummies.
concatenado = concatenado.drop(columns=['Country'])

In [44]:
print(concatenado)

         Age        Salary  France  Germany  Spain
0  44.000000  72000.000000       1        0      0
1  27.000000  48000.000000       0        0      1
2  30.000000  54000.000000       0        1      0
3  38.000000  61000.000000       0        0      1
4  40.000000  63777.777778       0        1      0
5  35.000000  58000.000000       1        0      0
6  38.777778  52000.000000       0        0      1
7  48.000000  79000.000000       1        0      0
8  50.000000  83000.000000       0        1      0
9  37.000000  67000.000000       1        0      0


In [46]:
def codif_y_ligar(dataframe_original, variables_a_codificar):
    """One-hot encode columns of a DataFrame, replacing them with their dummies.

    Args:
        dataframe_original: DataFrame holding the columns to encode. It is
            not modified; a new DataFrame is returned.
        variables_a_codificar: A single column label, or a list of column
            labels, to encode.

    Returns:
        A new DataFrame with the untouched columns first, followed by one
        ``<column>_<value>`` indicator column per distinct value of each
        encoded column. The encoded source columns are removed.

    Raises:
        KeyError: If a requested column is not present in the DataFrame.
    """
    # A bare label is the original calling convention; wrap it so that a
    # single label and a list of labels share one code path.
    if isinstance(variables_a_codificar, list):
        columnas = variables_a_codificar
    else:
        columnas = [variables_a_codificar]

    if not columnas:
        # Nothing to encode: still hand back an independent object.
        return dataframe_original.copy()

    # ``columns=`` forces the listed columns to be encoded whatever their
    # dtype. Without it, a numeric column would be left as-is by get_dummies
    # and then silently dropped from the result.
    return pd.get_dummies(dataframe_original, columns=columnas)

In [48]:
# Encode each listed column in turn; X is rebound to the encoded result on
# every iteration, so later columns are encoded on top of earlier ones.
variables_a_codificar = ['Country']   #  This is a list of variables
for variable in variables_a_codificar:
    X = codif_y_ligar(X, variable)

In [50]:
print(X)

         Age        Salary  Country_France  Country_Germany  Country_Spain
0  44.000000  72000.000000               1                0              0
1  27.000000  48000.000000               0                0              1
2  30.000000  54000.000000               0                1              0
3  38.000000  61000.000000               0                0              1
4  40.000000  63777.777778               0                1              0
5  35.000000  58000.000000               1                0              0
6  38.777778  52000.000000               0                0              1
7  48.000000  79000.000000               1                0              0
8  50.000000  83000.000000               0                1              0
9  37.000000  67000.000000               1                0              0


### Codificar la variable dependiente

In [57]:
# Expand the target labels into indicator columns, one per distinct label.
# NOTE(review): for a two-valued target this yields two complementary
# columns; a single 0/1 column may be what downstream estimators expect.
# Confirm against the model code before changing.
y = pd.get_dummies(data=y)

In [59]:
print(y)

   No  Yes
0   1    0
1   0    1
2   1    0
3   1    0
4   0    1
5   0    1
6   1    0
7   0    1
8   1    0
9   0    1


## Dividir el conjunto de datos en uno de Entrenamiento y otro para Pruebas

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_entreno, X_prueba, y_entreno, y_prueba = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [64]:
print(X_entreno)

         Age        Salary  Country_France  Country_Germany  Country_Spain
6  38.777778  52000.000000               0                0              1
4  40.000000  63777.777778               0                1              0
0  44.000000  72000.000000               1                0              0
3  38.000000  61000.000000               0                0              1
1  27.000000  48000.000000               0                0              1
7  48.000000  79000.000000               1                0              0
8  50.000000  83000.000000               0                1              0
5  35.000000  58000.000000               1                0              0


In [66]:
print(X_prueba)

    Age   Salary  Country_France  Country_Germany  Country_Spain
2  30.0  54000.0               0                1              0
9  37.0  67000.0               1                0              0


In [68]:
print(y_entreno)

   No  Yes
6   1    0
4   0    1
0   1    0
3   1    0
1   0    1
7   0    1
8   1    0
5   0    1


In [70]:
print(y_prueba)

   No  Yes
2   1    0
9   0    1


## Escalamiento de características (Feature Scaling)

In [73]:
from sklearn.preprocessing import MinMaxScaler

# Fit ONE scaler on both numeric columns of the training split. Min-max
# scaling is computed per column, so the scaled training values are the same
# as when each column is fitted separately, but the fitted scaler now keeps
# the min/max of 'Age' AND 'Salary'. Re-fitting the same scaler on 'Salary'
# after 'Age' would discard the 'Age' parameters and make it unusable for
# scaling other data consistently.
columnas_numericas = ['Age', 'Salary']
escalador = MinMaxScaler()

# Work on an explicit copy so the assignment below writes into a frame we own
# instead of into an object derived from the split (avoids pandas'
# chained-assignment warning).
X_entreno = X_entreno.copy()
X_entreno[columnas_numericas] = escalador.fit_transform(X_entreno[columnas_numericas])

# The test split is not modified in this cell. Before feeding it to a model,
# scale it with the parameters learned above:
#     escalador.transform(X_prueba[columnas_numericas])
# Use transform, never fit_transform, so no test-set information is learned.


In [75]:
print(X_entreno)

        Age    Salary  Country_France  Country_Germany  Country_Spain
6  0.512077  0.114286               0                0              1
4  0.565217  0.450794               0                1              0
0  0.739130  0.685714               1                0              0
3  0.478261  0.371429               0                0              1
1  0.000000  0.000000               0                0              1
7  0.913043  0.885714               1                0              0
8  1.000000  1.000000               0                1              0
5  0.347826  0.285714               1                0              0


In [77]:
print(X_prueba)

    Age   Salary  Country_France  Country_Germany  Country_Spain
2  30.0  54000.0               0                1              0
9  37.0  67000.0               1                0              0
