# Procesamiento con pipelines

In [13]:
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
# Importamos la librería que necesitamos
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score # métrica de evaluación
from sklearn.metrics import classification_report
from sklearn import metrics
import sys
sys.path.append('../Code')
from CustomTransformers import CollinearityDropper, ColumnSelector, ColumnScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
import numpy as np


## Preprocesamiento usando transformadores "customizados"

In [40]:
# Load the NASA asteroids dataset and hold out 30% of the rows for testing.
DATA_URL = 'https://raw.githubusercontent.com/pokengineer/DataScience/main/datasets/asteroids_nasa.csv'
df = pd.read_csv(DATA_URL)

# "Hazardous" is the target; every other column is a candidate feature.
y = df["Hazardous"]
X = df.drop(columns="Hazardous")

# Stratify on the target so both splits keep the same class proportions;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)


In [41]:
# Odd values of k from 3 to 21 to be tried by the grid search.
param_grid = {'n_neighbors': list(range(3, 22, 2))}

# The two features kept for the classifier.
selected_columns = ['Absolute Magnitude', 'Minimum Orbit Intersection']

# Final step: KNN tuned over param_grid with 5-fold cross-validation, scored by accuracy.
# NOTE(review): the preceding steps are fitted once on the whole training set before
# this search runs its folds, so the inner CV scores see data scaled with statistics
# from the validation folds too. Wrapping the pipeline in GridSearchCV would avoid
# that -- confirm whether it matters for this exercise.
knn_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

pl = Pipeline(steps=[
    # Custom transformer; presumably drops columns whose pairwise correlation
    # exceeds min_coef -- verify against CustomTransformers.
    ("QuitarColineales", CollinearityDropper(columns=None, min_coef=0.9)),
    ("SeleccionoColumnas", ColumnSelector(columns=selected_columns)),
    # Calling StandardScaler directly loses the column names. That is not wrong,
    # but to keep them the custom scaler can be used instead:
    #   ("Escalado", ColumnScaler(scaler=StandardScaler()))  # with columns=None, all columns are scaled
    ("Escalado", StandardScaler()),
    ("Clasificador", knn_search),
])

In [42]:
# Fit every pipeline step on the training split, then score on the held-out test split.
pl.fit(X_train, y_train)
y_pred = pl.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", test_accuracy)

Accuracy:  0.992181947405828


## Hacemos lo mismo con column transformers

`ColumnTransformer` puede ser incluso una opción más eficiente, solo que durante el proceso de construcción del pipeline las pruebas pueden resultar más confusas.

In [43]:
# Reload the asteroids dataset so this section can be run on its own,
# using the same 70/30 stratified split and seed as before.
df = pd.read_csv(
    'https://raw.githubusercontent.com/pokengineer/DataScience/main/datasets/asteroids_nasa.csv'
)

target_column = "Hazardous"
X = df.drop(columns=target_column)
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [44]:
# Candidate neighbour counts for KNN: the odd numbers from 3 to 21 inclusive.
param_grid = {'n_neighbors': list(range(3, 22, 2))}

In [45]:
# Standardize only the two numeric features of interest; remainder='passthrough'
# keeps every other column in the output, untouched.
scaled_columns = ['Absolute Magnitude', 'Minimum Orbit Intersection']
ct = ColumnTransformer(
    transformers=[('num', StandardScaler(), scaled_columns)],
    remainder='passthrough',
)

In [46]:
# Fit the transformer on the training split and show the result: the scaled
# columns come first, followed by the passed-through columns.
X_train_passthrough = ct.fit_transform(X_train)
X_train_passthrough

array([[-1.230835536269324, 0.04602355587216602, 2053409, ...,
        323.2554578178543, 0.323556652405288, 'J2000'],
       [0.49209737426665956, -0.8933159026661764, 3552665, ...,
        81.96039659073507, 0.6095236904588717, 'J2000'],
       [-0.9207076123728463, -0.5348681391881543, 2038071, ...,
        33.8430264222424, 0.3262218594829722, 'J2000'],
       ...,
       [0.733307981741697, -0.7358530383059827, 3672464, ...,
        31.11384184160581, 1.141298680706784, 'J2000'],
       [1.5603157787989699, -0.9028547164821791, 3477702, ...,
        41.19759327659477, 0.8195002348730062, 'J2000'],
       [1.698150411641848, -0.7760195864022473, 3341510, ...,
        210.0416017621267, 0.8701635643923911, 'J2000']], dtype=object)

En este caso particular no me interesan las otras columnas, así que uso `remainder='drop'` para descartarlas.

In [47]:
# Standardize the two features of interest and discard every other column
# (remainder='drop'), so the output contains only the scaled features.
scaled_columns = ['Absolute Magnitude', 'Minimum Orbit Intersection']
ct = ColumnTransformer(
    transformers=[('num', StandardScaler(), scaled_columns)],
    remainder='drop',
)

In [48]:
# Fit on the training split and show the result: only the two scaled columns remain.
X_train_scaled = ct.fit_transform(X_train)
X_train_scaled

array([[-1.23083554,  0.04602356],
       [ 0.49209737, -0.8933159 ],
       [-0.92070761, -0.53486814],
       ...,
       [ 0.73330798, -0.73585304],
       [ 1.56031578, -0.90285472],
       [ 1.69815041, -0.77601959]])

In [49]:
# KNN tuned over param_grid with 5-fold cross-validation, scored by accuracy.
knn_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Two-step pipeline: the ColumnTransformer prepares the features, then the
# grid-searched classifier is trained on its output.
pl = Pipeline(steps=[
    ("transformacion 1", ct),
    ("Clasificador", knn_search),
])

In [50]:
# Display the pipeline object so its steps can be inspected before fitting.
pl

In [51]:
# Train the ColumnTransformer-based pipeline and report accuracy on the test split.
pl.fit(X_train, y_train)
y_pred = pl.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", test_accuracy)

Accuracy:  0.992181947405828
