# Construcción del Pipeline del Modelo

- Identificar las variables a ser ignoradas
- Implementar los mecanismos de imputación de datos nulos
- Implementar las transformaciones de datos (Escalamientos y Codificaciones)
- Implementar el modelo elegido

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

In [3]:
data = pd.read_csv('../Datasets/garments_worker_productivity.csv')
target = 'actual_productivity'
X = data.drop(columns=[target])
y = data[[target]]

In [4]:
all_num = X.select_dtypes(include = ['int64', 'float64']).columns.to_list()
all_cat = X.select_dtypes(include = ['object']).columns.to_list()
ignored = ['date', 'quarter']
num_features = [c for c in all_num if c not in ignored]
cat_features = [c for c in all_cat if c not in ignored]

In [5]:
# Crear los pipelines de imputación y transformación
# Imputador por vecindad a las variables numéricas
from sklearn.preprocessing import MinMaxScaler

num_pipe = Pipeline(steps = [('imputer', KNNImputer(n_neighbors=10)), ('scaler', MinMaxScaler())])
cat_pipe = Pipeline(steps = [('imputer', SimpleImputer(strategy = 'most_frequent')), ('encoder', OneHotEncoder())])

In [6]:
num_pipe

In [7]:
cat_pipe

In [8]:
preprocessor = ColumnTransformer(transformers= [('num', num_pipe, num_features), ('cat', cat_pipe, cat_features)], remainder = 'drop')
model = GradientBoostingRegressor(criterion = 'squared_error', n_estimators = 500)
pipe_model = Pipeline(steps = [('preprocess', preprocessor), ('model', model)])
pipe_model

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 123)
pipe_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?


In [10]:
y_pred = pipe_model.predict(X_test)
y_pred

array([0.64272033, 0.71829382, 0.66385325, 0.69976923, 0.70221856,
       0.6811756 , 0.87640343, 0.35535763, 0.64867317, 0.65222159,
       0.90003288, 0.7834772 , 0.95239708, 0.71740634, 0.67506168,
       0.67625274, 0.92560976, 0.88483834, 0.64495211, 0.3396428 ,
       0.64215345, 0.68184559, 0.79897962, 0.78884868, 0.83674941,
       0.8511789 , 0.72263797, 0.82832656, 0.54411591, 0.45264162,
       0.77189842, 0.73061364, 0.82251235, 0.55353795, 0.72820657,
       0.73481497, 0.59299455, 0.84251766, 0.78617607, 0.85391609,
       0.44597054, 0.9040417 , 0.88717984, 0.76835439, 0.70522807,
       0.64398958, 0.82633762, 0.92331684, 0.49313939, 0.8019704 ,
       0.46525177, 0.59212709, 0.91203551, 0.66562783, 0.77668106,
       0.45338098, 1.00556882, 0.86500127, 0.6872558 , 1.01755229,
       0.7690236 , 0.88082822, 0.5257217 , 0.27302633, 0.79879889,
       0.55649126, 0.80215557, 0.79904889, 0.76853628, 0.52798444,
       0.66683909, 0.67132371, 0.55548097, 0.82063912, 0.88575

In [11]:
import joblib
joblib.dump(pipe_model, '../Regresion/regression_model.pkl')

['../Regresion/regression_model.pkl']

In [12]:
modelo = joblib.load('regression_model.pkl')
modelo.predict(X_test)

array([0.64272033, 0.71829382, 0.66385325, 0.69976923, 0.70221856,
       0.6811756 , 0.87640343, 0.35535763, 0.64867317, 0.65222159,
       0.90003288, 0.7834772 , 0.95239708, 0.71740634, 0.67506168,
       0.67625274, 0.92560976, 0.88483834, 0.64495211, 0.3396428 ,
       0.64215345, 0.68184559, 0.79897962, 0.78884868, 0.83674941,
       0.8511789 , 0.72263797, 0.82832656, 0.54411591, 0.45264162,
       0.77189842, 0.73061364, 0.82251235, 0.55353795, 0.72820657,
       0.73481497, 0.59299455, 0.84251766, 0.78617607, 0.85391609,
       0.44597054, 0.9040417 , 0.88717984, 0.76835439, 0.70522807,
       0.64398958, 0.82633762, 0.92331684, 0.49313939, 0.8019704 ,
       0.46525177, 0.59212709, 0.91203551, 0.66562783, 0.77668106,
       0.45338098, 1.00556882, 0.86500127, 0.6872558 , 1.01755229,
       0.7690236 , 0.88082822, 0.5257217 , 0.27302633, 0.79879889,
       0.55649126, 0.80215557, 0.79904889, 0.76853628, 0.52798444,
       0.66683909, 0.67132371, 0.55548097, 0.82063912, 0.88575