# Procesamiento de tablas MPG

## 1. Cargando librerías y conjunto de datos

In [18]:
import math
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

SEED = 42
np.random.seed(SEED)


mpg_url = "https://raw.githubusercontent.com/scidatmath2020/DL_Py_25/refs/heads/main/Data/mpg.csv"
df = pd.read_csv(mpg_url)

df.sample(5)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
78,21.0,4,120.0,87.0,2979,19.5,72,europe
275,17.0,6,163.0,125.0,3140,13.6,78,europe
247,39.4,4,85.0,70.0,2070,18.6,78,japan
55,27.0,4,97.0,60.0,1834,19.0,71,europe
388,26.0,4,156.0,92.0,2585,14.5,82,usa


> Se ha importado correctamente los datos

## 2. Selección de columna target $y$

In [19]:
target = "mpg"

y = df[target]
X = df.drop(columns = [target])

In [20]:
## Separación train_full / test y train / val 
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,y, test_size= 0.2, random_state=SEED
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=SEED
)


## 3. Selección de columnas numéricas y categóricas

In [14]:
X.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model_year', 'origin'],
      dtype='object')

In [23]:
cols_num = ['displacement', 'horsepower', 'weight', 'acceleration', 'cylinders']
cols_cat = ['model_year', 'origin', 'cylinders']



X_train_num = X_train[cols_num]
X_train_cat = X_train[cols_cat]

X_val_num = X_val[cols_num]
X_val_cat = X_val[cols_cat]

X_test_num = X_test[cols_num]
X_test_cat = X_test[cols_cat]

## 4. Selección de nominales y ordinales


In [22]:
df.columns 

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin'],
      dtype='object')

In [24]:
# df.origin.unique() # out: ['usa', 'japan', 'europe']
# df.model_year.unique() #out: [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82]
# df.cylinders.unique() #out: [8, 4, 6, 3, 5])
cols_onehot = ['origin']
cols_ordinal = ['cylinders','model_year']

categorias_ordinales = [[3,4,5,6,8],[70,71,72,73,74,75,76,77,78,79,80,81,82]]


preprocessor_cat = ColumnTransformer(
    transformers=[(
        'onehot', 
        Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")), 
            ("encoder", OneHotEncoder(sparse_output=False, drop=None, handle_unknown="ignore"))
        ]),
        cols_onehot
    ),(
        'ordinal',
        Pipeline(steps =[ 
            ('imputer', SimpleImputer(strategy="most_frequent")), 
            ('encoder', OrdinalEncoder(
                categories = categorias_ordinales, 
                handle_unknown="use_encoded_value", 
                unknown_value=-1   
            ))
        ]),
        cols_ordinal
    )],
    remainder='drop',
    verbose_feature_names_out=False
    )


preprocessor_cat.fit(X_train_cat)

X_train_cat_proc = preprocessor_cat.transform(X_train_cat)
X_val_cat_proc   = preprocessor_cat.transform(X_val_cat)
X_test_cat_proc  = preprocessor_cat.transform(X_test_cat)

cols_out_cat = list(preprocessor_cat.get_feature_names_out())

# -----------------------------------------
# Renombrar One-Hot a formato col___categoria
# ---------------------------------------- 
rename_map = {}
if len(cols_onehot) > 0:
    ohe = preprocessor_cat.named_transformers_["onehot"].named_steps["encoder"]
    ohe_names = list(ohe.get_feature_names_out(cols_onehot))

    for name in ohe_names:
        for col in cols_onehot:
            prefix = col + "_"
            if name.startswith(prefix):
                cat = name[len(prefix):]
                rename_map[name] = f"{col}___{cat}"
                break

cols_out_cat = [rename_map.get(c, c) for c in cols_out_cat]

df_train_cat_encode = pd.DataFrame(X_train_cat_proc, columns=cols_out_cat, index=X_train_cat.index)
df_val_cat_encode   = pd.DataFrame(X_val_cat_proc,   columns=cols_out_cat, index=X_val_cat.index)
df_test_cat_encode  = pd.DataFrame(X_test_cat_proc,  columns=cols_out_cat, index=X_test_cat.index)

df_train_cat_encode


Unnamed: 0,origin___europe,origin___japan,origin___usa,cylinders,model_year
297,1.0,0.0,0.0,2.0,9.0
380,0.0,1.0,0.0,1.0,12.0
136,0.0,0.0,1.0,4.0,4.0
103,0.0,0.0,1.0,4.0,3.0
68,0.0,0.0,1.0,4.0,2.0
...,...,...,...,...,...
371,0.0,0.0,1.0,1.0,12.0
10,0.0,0.0,1.0,4.0,0.0
228,0.0,0.0,1.0,3.0,7.0
134,0.0,0.0,1.0,3.0,4.0


## 5. Númericas: Imputación + Escalado

In [25]:
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

num_pipe.fit(X_train_num)

T_train_num = num_pipe.transform(X_train_num)
T_val_num   = num_pipe.transform(X_val_num)
T_test_num  = num_pipe.transform(X_test_num)

num_cols_out = X_train_num.columns

T_train_num = pd.DataFrame(T_train_num, columns=num_cols_out, index=X_train_num.index)
T_val_num   = pd.DataFrame(T_val_num,   columns=num_cols_out, index=X_val_num.index)
T_test_num  = pd.DataFrame(T_test_num,  columns=num_cols_out, index=X_test_num.index)

T_train_num

Unnamed: 0,displacement,horsepower,weight,acceleration,cylinders
297,-0.110919,-0.712732,0.641443,1.575175,-0.280986
380,-0.723442,-0.418496,-1.008749,-0.404144,-0.873653
136,1.046070,0.972440,1.377404,-0.580869,1.497016
103,1.998884,1.239928,2.408473,-0.580869,1.497016
68,1.512755,1.373672,1.812236,-0.757594,1.497016
...,...,...,...,...,...
371,-0.577604,-0.525491,-0.569100,0.126031,-0.873653
10,1.833600,1.774903,0.681192,-1.994668,1.497016
228,0.540495,-0.151008,0.635420,1.186380,0.311682
134,0.618276,0.169977,0.764304,0.832931,0.311682


## 6. Unión de categorias 

In [None]:
# =========================================================
# Unir numéricas + categóricas
# =========================================================
X_train_final_df = pd.concat([T_train_num, df_train_cat_encode], axis=1)
X_val_final_df   = pd.concat([T_val_num,   df_val_cat_encode],   axis=1)
X_test_final_df  = pd.concat([T_test_num,  df_test_cat_encode],  axis=1)

# A numpy float32 para Keras (si se ocupa después)
X_train_final = X_train_final_df.to_numpy(dtype=np.float32)
X_val_final   = X_val_final_df.to_numpy(dtype=np.float32)
X_test_final  = X_test_final_df.to_numpy(dtype=np.float32)

# =========================================================
# Devolver la columna objetivo (alineando por índice)
# =========================================================
train_final = X_train_final_df.copy()
train_final["target"] = y_train.loc[X_train_final_df.index].to_numpy()

val_final = X_val_final_df.copy()
val_final["target"] = y_val.loc[X_val_final_df.index].to_numpy()

test_final = X_test_final_df.copy()
test_final["target"] = y_test.loc[X_test_final_df.index].to_numpy()

print("X_train_final:", X_train_final.shape)
print("X_val_final  :", X_val_final.shape)
print("X_test_final :", X_test_final.shape)

X_train_final: (251, 10)
X_val_final  : (63, 10)
X_test_final : (79, 10)


## 7. Guardado de archivosa 

In [27]:
# =====================================================
# Guardar artefactos + datasets finales
# =====================================================

import os
import joblib
import json

ARTIFACT_DIR = "artifacts_preprocesamiento"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# 1) Transformadores
joblib.dump(num_pipe, f"{ARTIFACT_DIR}/num_pipe.joblib")
joblib.dump(preprocessor_cat, f"{ARTIFACT_DIR}/cat_preprocessor.joblib")
joblib.dump(list(X_train_final_df.columns), f"{ARTIFACT_DIR}/feature_names.joblib")

# 2) Datasets finales CON target
train_final.to_csv(f"{ARTIFACT_DIR}/train_final.csv", index=False)
val_final.to_csv(f"{ARTIFACT_DIR}/val_final.csv", index=False)
test_final.to_csv(f"{ARTIFACT_DIR}/test_final.csv", index=False)

metadata = {
    "cols_num": cols_num,
    "cols_cat": cols_cat,
    "cols_onehot": cols_onehot,
    "cols_ordinal": cols_ordinal,
    "cat_out_cols": list(X_train_final_df.columns[len(cols_num):]),  # solo las categóricas ya renombradas
    "feature_names": list(X_train_final_df.columns)                  # num + cat, orden final
}

with open(f"{ARTIFACT_DIR}/metadata_preprocesamiento.json", "w") as f:
    json.dump(metadata, f, indent=2)


print("Todos los artefactos y datasets fueron guardados correctamente.")

Todos los artefactos y datasets fueron guardados correctamente.


In [None]:
## 8. Creando el script 

In [None]:

import zipfile
import os

ARTIFACT_DIR = "artifacts_preprocesamiento"
ZIP_NAME = "artifacts_preprocesamiento.zip"

with zipfile.ZipFile(ZIP_NAME, "w", zipfile.ZIP_DEFLATED) as z:
    for root, _, files in os.walk(ARTIFACT_DIR):
        for f in files:
            full_path = os.path.join(root, f)
            z.write(full_path, arcname=os.path.relpath(full_path, ARTIFACT_DIR))

print("ZIP final creado correctamente:", ZIP_NAME)
