# Procesamiento de tablas MPG

## 1. Cargando librerías y conjunto de datos

In [4]:
import math
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

# Seed reused wherever a random_state is needed in this notebook.
SEED = 42
np.random.seed(SEED)  # also seed NumPy's legacy global RNG


# Raw CSV with the MPG table, served from GitHub.
mpg_url = "https://raw.githubusercontent.com/scidatmath2020/DL_Py_25/refs/heads/main/Data/mpg.csv"
df = pd.read_csv(filepath_or_buffer=mpg_url)

# Quick sanity check: show five randomly chosen rows of the loaded table.
df.sample(n=5)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
78,21.0,4,120.0,87.0,2979,19.5,72,europe
275,17.0,6,163.0,125.0,3140,13.6,78,europe
247,39.4,4,85.0,70.0,2070,18.6,78,japan
55,27.0,4,97.0,60.0,1834,19.0,71,europe
388,26.0,4,156.0,92.0,2585,14.5,82,usa


> Se han importado correctamente los datos

## 2. Selección de columna target $y$

In [5]:
# Separate the table into predictors (X) and the column to be predicted (y).
target = "mpg"

X = df.drop(columns=target)
y = df[target]

In [8]:
# Two-stage hold-out split: train_full / test first, then train / val.
# Both stages hold out 20% with the same seed, so the validation set is
# 20% of the remaining 80% (i.e. 16% of all rows).
split_kwargs = dict(test_size=0.2, random_state=SEED)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, **split_kwargs)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, **split_kwargs
)


## 3. Selección de columnas numéricas y categóricas

In [10]:
# List the predictor columns still present in X after dropping the target.
X.columns

Index(['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
       'model_year', 'origin'],
      dtype='object')

In [22]:
# Columns handled as numeric vs. categorical in the preprocessing below.
cols_num = ['displacement', 'horsepower', 'weight', 'acceleration', 'cylinders']
cols_cat = ['model_year', 'origin']

# Take the same column subsets from each split (train, val, test order).
X_train_num, X_val_num, X_test_num = (
    split[cols_num] for split in (X_train, X_val, X_test)
)
X_train_cat, X_val_cat, X_test_cat = (
    split[cols_cat] for split in (X_train, X_val, X_test)
)

## 4. Selección y codificación de variables nominales y ordinales

In [None]:
# Categorical preprocessing: one-hot encode the nominal columns and
# ordinal-encode the ordered ones. Everything is fitted on the training
# split only and then applied to the validation and test splits.
#
# Values previously observed in the dataset:
#   df.origin.unique()     -> ['usa', 'japan', 'europe']
#   df.model_year.unique() -> [70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82]

cols_onehot = ['origin']
cols_ordinal = ['model_year']

# One ordered category list per column in cols_ordinal (same order as cols_ordinal).
categorias_ordinales = [[70,71,72,73,74,75,76,77,78,79,80,81,82]]


preprocessor_cat = ColumnTransformer(
    transformers=[(
        'onehot',
        Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            # handle_unknown="ignore": categories not seen during fit do not raise.
            ("encoder", OneHotEncoder(sparse_output=False, drop=None, handle_unknown="ignore"))
        ]),
        cols_onehot
    ),(
        'ordinal',
        Pipeline(steps =[
            ('imputer', SimpleImputer(strategy="most_frequent")),
            # Categories outside categorias_ordinales are encoded as -1 instead of raising.
            ('encoder', OrdinalEncoder(
                categories = categorias_ordinales,
                handle_unknown="use_encoded_value",
                unknown_value=-1
            ))
        ]),
        cols_ordinal
    )],
    remainder='drop',
    # Keep plain feature names (no "onehot__" / "ordinal__" prefixes).
    verbose_feature_names_out=False
    )


# Fit on the training split only; val/test are transformed with the fitted state.
preprocessor_cat.fit(X_train_cat)

X_train_cat_proc = preprocessor_cat.transform(X_train_cat)
X_val_cat_proc   = preprocessor_cat.transform(X_val_cat)
X_test_cat_proc  = preprocessor_cat.transform(X_test_cat)

cols_out_cat = list(preprocessor_cat.get_feature_names_out())

# ---------------------------------------------------
# Rename one-hot columns to the col___category format
# ---------------------------------------------------
rename_map = {}
if len(cols_onehot) > 0:
    ohe = preprocessor_cat.named_transformers_["onehot"].named_steps["encoder"]

    # Build the mapping from the fitted categories rather than by matching
    # "<col>_" prefixes on the generated names: prefix matching picks the wrong
    # column when one column name is a prefix of another (e.g. "origin" and
    # "origin_type"). ohe.categories_ is aligned with cols_onehot, and the
    # encoder names each output "<col>_<category>".
    for col, cats in zip(cols_onehot, ohe.categories_):
        for cat in cats:
            rename_map[f"{col}_{cat}"] = f"{col}___{cat}"

# Ordinal columns are not in rename_map and keep their original names.
cols_out_cat = [rename_map.get(c, c) for c in cols_out_cat]

# Rebuild DataFrames, preserving each split's original row index.
df_train_cat_encode = pd.DataFrame(X_train_cat_proc, columns=cols_out_cat, index=X_train_cat.index)
df_val_cat_encode   = pd.DataFrame(X_val_cat_proc,   columns=cols_out_cat, index=X_val_cat.index)
df_test_cat_encode  = pd.DataFrame(X_test_cat_proc,  columns=cols_out_cat, index=X_test_cat.index)

df_train_cat_encode


Unnamed: 0,origin___europe,origin___japan,origin___usa,model_year
297,1.0,0.0,0.0,9.0
380,0.0,1.0,0.0,12.0
136,0.0,0.0,1.0,4.0
103,0.0,0.0,1.0,3.0
68,0.0,0.0,1.0,2.0
...,...,...,...,...
371,0.0,0.0,1.0,12.0
10,0.0,0.0,1.0,0.0
228,0.0,0.0,1.0,7.0
134,0.0,0.0,1.0,4.0


array([-1.])