In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# 1. Load the dataset from CSV and preview its first rows
RUTA_DATOS = "datasets/adult_joined.csv"
df = pd.read_csv(RUTA_DATOS)
df.head()

Unnamed: 0,age,workclass,fnlwgt,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,education,education-num,income
0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,Bachelors,13,<=50K
1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,Bachelors,13,<=50K
2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,HS-grad,9,<=50K
3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,11th,7,<=50K
4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,Bachelors,13,<=50K


In [4]:
# One-hot encode the categorical columns and swap them into `df`.
categoricas = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex','native-country','education']

# After a first run the raw categorical columns have been replaced by their
# encoded versions, so re-running this cell would raise a KeyError on
# df[categoricas]. Skip the encoding when none of them is left; if only some
# are missing, df[categoricas] still fails loudly as before.
ya_codificado = not any(col in df.columns for col in categoricas)

if not ya_codificado:
    # sparse_output=False makes the encoder return a dense array instead of a sparse matrix
    encoder = OneHotEncoder(sparse_output=False)
    valores = encoder.fit_transform(df[categoricas])

    # Wrap the encoded values in a DataFrame aligned on the original index
    columnas_codificadas = encoder.get_feature_names_out(categoricas)
    df_codificado = pd.DataFrame(valores, columns=columnas_codificadas, index=df.index)

    # Replace the raw categorical columns with the encoded ones
    # (`columns=` already selects the axis, so no `axis=1` is needed)
    df = pd.concat([df.drop(columns=categoricas), df_codificado], axis=1)

# Show the final DataFrame
df.head()

In [5]:
# Count the missing values in each column
df.isna().sum()

age                       0
fnlwgt                    0
capital-gain              0
capital-loss              0
hours-per-week            0
                         ..
education_HS-grad         0
education_Masters         0
education_Preschool       0
education_Prof-school     0
education_Some-college    0
Length: 108, dtype: int64

In [6]:
# 2. Drop the columns that are not used as features (e.g. ID);
#    names that are absent from the frame are skipped silently
df = df.drop(columns=["ID", "fnlwgt", "income"], errors='ignore')

# 3. Separate the target (y) from the features (X)
objetivo = "hours-per-week"
y = df[objetivo]
X = df.drop(columns=[objetivo])

# 4. Hold out 25% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 5. Fit the linear regression on the training split
model = LinearRegression().fit(X_train, y_train)

# 6. Predict the target for the test split
y_pred = model.predict(X_test)

In [7]:
# 7. Evaluate the model on the test split
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE

# Report every metric with two decimals, one per line
for etiqueta, valor in (("R²", r2), ("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
    print(f"{etiqueta}: {valor:.2f}")

R²: 0.20
MAE: 7.82
MSE: 127.08
RMSE: 11.27
