In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import root_mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Caminhos dos arquivos Excel
file_train = "Data/Vehicles_export_prices_scaled_train_eng.xlsx"
file_test = "Data/Vehicles_export_prices_scaled_stud_test_eng.xlsx"

# Carregar os dois arquivos
df_train = pd.read_excel(file_train)
df_test = pd.read_excel(file_test)

# Concatenar os dois DataFrames
df_combined = pd.concat([df_train, df_test], ignore_index=True)

# Garantir que não há valores nulos em `CHASSIS_NUMBER`
chasis_col = "CHASSIS_NUMBER"
df_combined = df_combined.dropna(subset=[chasis_col])


In [None]:
# Identificar colunas categóricas e numéricas
from sklearn.impute import SimpleImputer


num_cols = df_combined.select_dtypes(include=["float64", "int64"]).columns.tolist()
cat_cols = df_combined.select_dtypes(include=["object"]).columns.tolist()

# Identificar colunas numéricas completamente vazias
empty_cols = [col for col in num_cols if df_combined[col].isnull().all()]
print(f"Colunas completamente vazias: {empty_cols}")

# Remover colunas completamente vazias
df_combined = df_combined.drop(columns=empty_cols)
num_cols = [col for col in num_cols if col not in empty_cols]

num_imputer = SimpleImputer(strategy="median")

# Preencher valores ausentes nas colunas numéricas
df_combined[num_cols] = pd.DataFrame(
    num_imputer.fit_transform(df_combined[num_cols]),
    columns=num_cols,
    index=df_combined.index,
)
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
# Categóricas: "missing"
df_combined[cat_cols] = pd.DataFrame(
    cat_imputer.fit_transform(df_combined[cat_cols]),
    columns=cat_cols,
    index=df_combined.index,
)

# Codificar variáveis categóricas
one_hot_cols = [col for col in cat_cols if df_combined[col].nunique() <= 30]
label_cols = [col for col in cat_cols if df_combined[col].nunique() > 30]
one_hot_encoder = OneHotEncoder()

# Convert all values in one_hot_cols to strings
df_combined[one_hot_cols] = df_combined[one_hot_cols].astype(str)

# One-Hot Encoding para colunas com até 30 valores únicos
one_hot_encoded = pd.DataFrame(
    one_hot_encoder.fit_transform(df_combined[one_hot_cols]),
    columns=one_hot_encoder.get_feature_names_out(one_hot_cols),
    index=df_combined.index,
)

# Label Encoding para colunas com mais de 30 valores únicos
label_encoded = pd.DataFrame()
for col in label_cols:
    label_encoder = LabelEncoder()
    label_encoded[col] = label_encoder.fit_transform(df_combined[col])

# Combinar dados codificados com o restante do DataFrame
df_combined = pd.concat([df_combined.drop(columns=cat_cols), one_hot_encoded, label_encoded], axis=1)

# Separar os conjuntos novamente
df_train_encoded = df_combined.iloc[:len(df_train)]
df_test_encoded = df_combined.iloc[len(df_train):]

# Limpar `LAID_UP_TIME` no conjunto de treino
target = "LAID_UP_TIME"
df_train_encoded = df_train_encoded.dropna(subset=[target])

# Separar variáveis explicativas e alvo
X_train = df_train_encoded.drop(columns=[target, chasis_col])
y_train = df_train_encoded[target]


Colunas completamente vazias: []


TypeError: Encoders require their input argument must be uniformly strings or numbers. Got ['int', 'str']

In [None]:

# Dividir os dados de treino em treino/validação/teste (70% treino, 15% validação, 15% teste)

X_train, X_temp, y_train, y_temp = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Escalar os dados
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Função para validação cruzada e avaliação
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse_scores = np.sqrt(-scores)
    print(f"Validação cruzada RMSE (média): {rmse_scores.mean():.2f}, (desvio padrão): {rmse_scores.std():.2f}")
    return model.fit(X, y)

# Modelagem com Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model = evaluate_model(rf_model, X_train, y_train)

# Modelagem com MLP
mlp_model = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp_model = evaluate_model(mlp_model, X_train, y_train)

# Avaliação no conjunto de teste
rf_test_pred = rf_model.predict(X_test)
mlp_test_pred = mlp_model.predict(X_test)

rf_rmse = root_mean_squared_error(y_test, rf_test_pred, squared=False)
mlp_rmse = root_mean_squared_error(y_test, mlp_test_pred, squared=False)

print(f"RMSE no conjunto de teste - Random Forest: {rf_rmse:.2f}")
print(f"RMSE no conjunto de teste - MLP: {mlp_rmse:.2f}")
