In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split

# Load the dataset.
file_path = "Vehicles_export_prices_scaled_train_eng.xlsx"
df = pd.read_excel(file_path)

# Drop rows where the target (LAID_UP_TIME) is missing.
target = "LAID_UP_TIME"
df = df.dropna(subset=[target])

# Drop columns with too many missing values.
columns_to_remove = [
    "PAINT_TYPE", "CERTIFICATE_TYPE", "RIM_KEY", "RIMS", "REAR_TIRES", "FRONT_TIRES",
    "FACTORY_NUMBER", "DEMONSTRATION_STATUS", "VARIANT", "LEASING_CONTRACT_DATE",
    "TRANSMISSION_ID", "LEASING_START", "PRICE_LIST", "TRANSMISSION", "LEASING_END",
    "ENGINE_ID", "ENGINE_ID_ALT", "DAY_OF_REGISTRATION", "CUSTOMER_SALE_GROUP_NAME"
]
df = df.drop(columns=columns_to_remove)

# Drop unique identifiers, except CHASSIS_NUMBER, which stays in `df`.
df = df.drop(columns=["RPAKREP_VEHICLE_HKEY", "COMMISSION_NUMBER"])

# CHASSIS_NUMBER is kept in `df` but must not reach the models: it is an
# un-encoded identifier column and would make every estimator's fit() fail.
id_cols = ["CHASSIS_NUMBER"]

# Split explanatory variables and target.
X = df.drop(columns=[target] + id_cols, errors="ignore")
y = df[target]

# Column groups are derived from X (not df) so that neither the target nor the
# identifier can end up in them; a target entry in numerical_cols would raise a
# KeyError when the splits are scaled below.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Split into train (70%), validation (15%) and test (15%).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Work on explicit copies so the column assignments below do not trigger
# SettingWithCopyWarning.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Target-encode the categorical columns. The encoder is fitted on the training
# split only: fitting it on the full frame would leak validation/test targets
# into the features and make the reported RMSE optimistic.
# NOTE(review): the "'super' object has no attribute '__sklearn_tags__'" error
# recorded for this cell looks like a category_encoders release that is too old
# for the installed scikit-learn -- presumably fixed by upgrading
# category_encoders (or pinning scikit-learn); confirm in the environment.
encoder = TargetEncoder(cols=categorical_cols)
if categorical_cols:
    X_train[categorical_cols] = encoder.fit_transform(X_train[categorical_cols], y_train)
    X_val[categorical_cols] = encoder.transform(X_val[categorical_cols])
    X_test[categorical_cols] = encoder.transform(X_test[categorical_cols])

# Standardise the originally-numeric columns; statistics come from the
# training split only and are re-used for validation and test.
scaler = StandardScaler()
if numerical_cols:
    X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
    X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
    X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print(f"Base processada: {X_train.shape[1]} atributos prontos para modelagem.")


AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Build and fit the Random Forest model on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate on the validation set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that keyword is deprecated and removed in recent scikit-learn releases.
rf_val_predictions = rf_model.predict(X_val)
rf_rmse = mean_squared_error(y_val, rf_val_predictions) ** 0.5

print(f"Random Forest RMSE no conjunto de validação: {rf_rmse:.2f}")

# Evaluate on the test set.
rf_test_predictions = rf_model.predict(X_test)
rf_test_rmse = mean_squared_error(y_test, rf_test_predictions) ** 0.5

print(f"Random Forest RMSE no conjunto de teste: {rf_test_rmse:.2f}")


In [None]:
from xgboost import XGBRegressor

# Build and fit the XGBoost model.
# The preprocessing cell stores the scaled features in X_train / X_val /
# X_test; no `*_scaled` variables are ever created, so those names are used here.
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate on the validation set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that keyword is deprecated and removed in recent scikit-learn releases.
xgb_val_predictions = xgb_model.predict(X_val)
xgb_rmse = mean_squared_error(y_val, xgb_val_predictions) ** 0.5

print(f"XGBoost RMSE no conjunto de validação: {xgb_rmse:.2f}")

# Evaluate on the test set.
xgb_test_predictions = xgb_model.predict(X_test)
xgb_test_rmse = mean_squared_error(y_test, xgb_test_predictions) ** 0.5

print(f"XGBoost RMSE no conjunto de teste: {xgb_test_rmse:.2f}")


In [None]:
from sklearn.neural_network import MLPRegressor

# Build and fit the MLP model.
# The preprocessing cell stores the scaled features in X_train / X_val /
# X_test; no `*_scaled` variables are ever created, so those names are used here.
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

# Evaluate on the validation set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that keyword is deprecated and removed in recent scikit-learn releases.
mlp_val_predictions = mlp_model.predict(X_val)
mlp_rmse = mean_squared_error(y_val, mlp_val_predictions) ** 0.5

print(f"MLP RMSE no conjunto de validação: {mlp_rmse:.2f}")

# Evaluate on the test set.
mlp_test_predictions = mlp_model.predict(X_test)
mlp_test_rmse = mean_squared_error(y_test, mlp_test_predictions) ** 0.5

print(f"MLP RMSE no conjunto de teste: {mlp_test_rmse:.2f}")


In [None]:
from sklearn.linear_model import Ridge

# Build and fit the Ridge regression model.
# The preprocessing cell stores the scaled features in X_train / X_val /
# X_test; no `*_scaled` variables are ever created, so those names are used here.
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Evaluate on the validation set.
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, because
# that keyword is deprecated and removed in recent scikit-learn releases.
ridge_val_predictions = ridge_model.predict(X_val)
ridge_rmse = mean_squared_error(y_val, ridge_val_predictions) ** 0.5

print(f"Ridge RMSE no conjunto de validação: {ridge_rmse:.2f}")

# Evaluate on the test set.
ridge_test_predictions = ridge_model.predict(X_test)
ridge_test_rmse = mean_squared_error(y_test, ridge_test_predictions) ** 0.5

print(f"Ridge RMSE no conjunto de teste: {ridge_test_rmse:.2f}")
