In [13]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error
import numpy as np


In [14]:

# Seção 1: Pré-processamento do Dataset
# Default per-column treatment applied by process_dataset.
# Keys are column names of the spreadsheet; values are one of:
#   "drop"      -> remove the column
#   "label"     -> integer-encode with a fitted sklearn LabelEncoder
#   "embedding" -> integer-encode with pandas category codes
#   "exclude"   -> remove from the processed frame but return it separately
#   "same"      -> leave the column untouched
#   "target"    -> leave the column untouched (marks the prediction target)
_DEFAULT_COLUMN_ACTIONS = {
    "RPAKREP_VEHICLE_HKEY": "drop",
    "COMPANY": "embedding",
    "OFFICE": "label",
    "OFFICE_MAIN_BRAND": "embedding",
    "CHASSIS_NUMBER": "exclude",
    "MANUFACTURER_SHORT": "label",
    "MANUFACTURER": "drop",
    "VEHICLE_GROUP": "label",
    "VEHICLE_TYPE": "label",
    "MODEL_CODE": "drop",
    "VARIANT": "label",
    "MILEAGE": "same",
    "OPERATING_HOURS": "drop",
    "MILAGE_IN_FIELD": "drop",
    "MILAGE_SALES": "same",
    "OPERATING_HOURS_SALES": "drop",
    "RIM_KEY": "drop",
    "COLOR_CODE": "drop",
    "COLOR_CODE_NAME": "drop",
    "COLOR": "label",
    "COLOR_TYPE": "drop",
    "UPHOLSTERY": "embedding",
    "UPHOLSTERY_CODE": "drop",
    "UPHOLSTERY_CODE_ALT": "drop",
    "CERTIFICATE_TYPE": "drop",
    "CERTIFICATE_TYPE_DATE": "drop",
    "FACTORY_NUMBER": "drop",
    "ENGINE_ID": "drop",
    "ENGINE_TYPE": "label",
    "ENGINE_ID_ALT": "drop",
    "TRANSMISSION": "drop",
    "TRANSMISSION_TYPE": "label",
    "TRANSMISSION_ID": "drop",
    "TRANSMISSION_SHORT": "embedding",
    "TRANSMISSION_NAME": "embedding",
    "RIMS": "drop",
    "FRONT_TIRES": "drop",
    "FRONT_TIRES_CONDITION": "drop",
    "REAR_TIRES": "drop",
    "REAR_TIRES_CONDITION": "drop",
    "NUMBER_DOORS": "embedding",
    "NUMBER_SEATS": "embedding",
    "PERMITTED_TOTAL_WEIGHT": "drop",
    "MAX_TRAILOR_LOAD": "drop",
    "CURB_WEIGHT": "same",
    "YEAR_CONSTRUCTION": "same",
    "CONSTRUCTION_MONTH": "same",
    "NUMBER_AXLE": "embedding",
    "NUMBER_ENGINE_CYLINDER": "same",
    "REPAIR_RKZ": "drop",
    "OPTICAL_CONDITION": "drop",
    "TECHNICAL_CONDITION": "drop",
    "ACCIDENT_VEHICLE": "embedding",
    "COMMISSION_NUMBER": "drop",
    "HORSEPOWER": "same",
    "KW": "same",
    "CCM": "same",
    "NUMBER_OWNERS": "same",
    "IS_USED_CAR": "embedding",
    "LEASING_CONTRACT_DATE": "drop",
    "LEASING_START": "drop",
    "LEASING_END": "drop",
    "LEASING_MILAGE": "same",
    "PAINT_TYPE": "drop",
    "FINANCING_TYPE": "embedding",
    "FINANCING_TYPE_NAME": "drop",
    "KAT_VEHICLE": "embedding",
    "FUEL_TYPE": "drop",
    "FUEL_TYPE_NAME": "embedding",
    "DRIVE_TYPE": "embedding",
    "DRIVE_TYPE_NAME": "drop",
    "VEHICLE_MODEL_ID": "embedding",
    "VEHICLE_MODEL_ID_NAME": "drop",
    "COMMISSION_TYPE": "embedding",
    "COMMISSION_TYPE_NAME": "drop",
    "DEMONSTRATION_STATUS": "drop",
    "PURCHASE_DATE": "drop",
    "PURCHASE_BOOKING_DATE": "drop",
    "PURCHASE_MILAGE": "same",
    "PURCHASE_OPERATION_HOURS": "drop",
    "PRICE_LIST": "drop",
    "DAY_OF_REGISTRATION": "drop",
    "AT_LOCATION_SINCE": "same",
    "LAID_UP_TIME": "target",
    "SOLD_CUSTOMER_ID": "drop",
    "SOLD_INVOICE_COSTUMER_ID": "drop",
    "MILAGE_SALE": "same",
    "OPERATION_HOURS_SALE": "drop",
    "SOLD_INVOICE_COSTUMER_ID2": "drop",
    "CUSTOMER_TYPE": "embedding",
    "CUSTOMER_GROUP": "embedding",
    "CUSTOMER_GROUP_NAME": "drop",
    "CUSTOMER_FEATURE": "embedding",
    "CUSTOMER_FEATURE_NAME": "drop",
    "SALE_CUSTOMER_ID2": "drop",
    "CUSTOMER_SALE_GROUP": "embedding",
    "CUSTOMER_SALE_GROUP_NAME": "drop",
    "CUSTOMER_SALE_GROUP2": "embedding",
    "CUSTOMER_SALE_GROUP2_NAME": "drop",
    "SCALED_CURRENT_VALUE": "same",
    "SCALED_INVENTURAL_VALUE": "same",
    "SCALED_REPORT_VALUE": "same",
    "SCALED_VALUATION_PRICE": "same",
    "SCALED_GUIDE_PRICE": "same",
    "SCALED_TOTAL_SALES_PRICE_BASIS": "same",
    "SCALED_TOTAL_SALE_PRICE": "same"
}

# Every action string process_dataset knows how to handle.
_VALID_COLUMN_ACTIONS = frozenset(
    {"drop", "label", "embedding", "exclude", "same", "target"}
)


def process_dataset(train_path, column_actions=None):
    """Load a spreadsheet and apply a per-column preprocessing plan.

    Args:
        train_path: Path (or any object accepted by ``pd.read_excel``) of the
            spreadsheet to load.
        column_actions: Optional mapping ``column name -> action``. When omitted
            the built-in ``_DEFAULT_COLUMN_ACTIONS`` table is used. Columns of
            the sheet that are not listed in the mapping are kept unchanged.

    Returns:
        A tuple ``(processed_df, encoders, excluded_data)``:
            * ``processed_df`` - the frame after dropping / encoding columns.
            * ``encoders`` - dict mapping each "label" column to its fitted
              ``LabelEncoder``.
            * ``excluded_data`` - copy of the "exclude" columns, removed from
              ``processed_df``.

    Raises:
        ValueError: If ``column_actions`` contains an action that is not one of
            the supported action names.

    Notes:
        A column listed in the plan but absent from the sheet is skipped for
        every action (previously only "drop" tolerated this; the other actions
        failed with ``KeyError``). Skipped "label" / "embedding" / "exclude"
        columns are reported through a single ``UserWarning``.
    """
    import warnings

    actions = _DEFAULT_COLUMN_ACTIONS if column_actions is None else column_actions

    # Fail early on typos instead of silently treating them as "keep as is".
    unknown_actions = {
        col: action
        for col, action in actions.items()
        if action not in _VALID_COLUMN_ACTIONS
    }
    if unknown_actions:
        raise ValueError(f"Unknown column actions: {unknown_actions}")

    train_df = pd.read_excel(train_path)

    encoders = {}            # fitted LabelEncoder per "label" column
    columns_to_drop = []     # "drop" columns that exist in the sheet
    excluded_columns = []    # "exclude" columns that exist in the sheet
    missing_columns = []     # planned non-drop columns absent from the sheet

    for col, action in actions.items():
        if action in ("same", "target"):
            continue

        if col not in train_df.columns:
            # Absent "drop" columns need no action; anything else is reported.
            if action != "drop":
                missing_columns.append(col)
            continue

        if action == "drop":
            columns_to_drop.append(col)
        elif action == "label":
            encoder = LabelEncoder()
            # Cast to str so mixed-type / missing cells become ordinary labels.
            train_df[col] = encoder.fit_transform(train_df[col].astype(str))
            encoders[col] = encoder
        elif action == "embedding":
            train_df[col] = train_df[col].astype("category").cat.codes
        else:  # "exclude"
            excluded_columns.append(col)

    if missing_columns:
        warnings.warn(
            f"Columns listed in the preprocessing plan but missing from the "
            f"dataset were skipped: {missing_columns}",
            stacklevel=2,
        )

    excluded_data = train_df[excluded_columns].copy()
    # One drop for everything that must leave the processed frame.
    train_df = train_df.drop(columns=columns_to_drop + excluded_columns)

    return train_df, encoders, excluded_data

# Path of the training spreadsheet (relative path, resolved against the
# current working directory).
train_file = "Data/Vehicles_export_prices_scaled_train_eng.xlsx"

# Run the preprocessing. The call returns the processed frame, the fitted
# label encoders, and the columns that were set aside from the features.
train_processed, encoders, excluded_data = process_dataset(train_file)

In [15]:
# Seção 2: Definição do Dataset para PyTorch
class TabularDataset(Dataset):
    """Dataset that serves (feature row, target) pairs from a tabular frame.

    Args:
        data: Frame holding the feature columns together with the target
            column; it is read but not modified.
        target_column: Name of the column in ``data`` used as the target.

    Attributes:
        features: float32 array with every column except the target.
        targets: float32 array with the target column.
    """

    def __init__(self, data, target_column):
        feature_frame = data.drop(columns=[target_column])
        target_series = data[target_column]

        # Both arrays are cast to float32 up front so indexing stays cheap.
        self.features = feature_frame.values.astype(np.float32)
        self.targets = target_series.values.astype(np.float32)

    def __len__(self):
        """Number of rows (samples) held by the dataset."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        row = self.features[idx]
        label = self.targets[idx]
        return row, label

In [16]:
# Seção 3: Definição do Modelo com Embeddings - MLP
class EmbeddingModel(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 32 -> 16 -> 8 -> 1.

    A ReLU follows every linear layer except the last one.

    Args:
        input_dim: Number of input features per sample.
        embedding_dim: Accepted for interface compatibility; it is not used
            by this implementation (the network has no embedding layers).
    """

    def __init__(self, input_dim, embedding_dim):
        super().__init__()

        # Widths of the successive hidden representations, input first.
        widths = [input_dim, 64, 32, 16, 8]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Output head: a single unit with no activation.
        stack.append(nn.Linear(widths[-1], 1))

        self.fc = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw output."""
        return self.fc(x)

In [17]:
def train_model(train_dataset, model, excluded_data, epochs=20, batch_size=32,
                learning_rate=0.001, output_path="predicted_results.xlsx"):
    """Train ``model`` with Adam / MSE and export predictions on the training set.

    Args:
        train_dataset: Dataset yielding ``(features, target)`` pairs; it must
            also expose ``features`` and ``targets`` arrays, which are used for
            the final prediction pass.
        model: ``nn.Module`` mapping a feature batch to one value per sample.
        excluded_data: Frame with a ``"CHASSIS_NUMBER"`` column, row-aligned
            with ``train_dataset``.
        epochs: Number of passes over the dataset.
        batch_size: Mini-batch size for the ``DataLoader``.
        learning_rate: Learning rate given to Adam.
        output_path: Destination of the Excel report. Pass ``None`` to skip
            writing the file.

    Returns:
        DataFrame with columns ``CHASSIS_NUMBER``, ``PREDICTED`` and ``REAL``.

    Raises:
        ValueError: If the dataset is empty, if a batch contains NaN, or if
            the loss becomes NaN.
    """
    sample_count = len(train_dataset)
    if sample_count == 0:
        # Avoids a division by zero when averaging the epoch error.
        raise ValueError("train_dataset is empty")

    dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0          # sum of per-batch mean losses (as reported before)
        squared_error_sum = 0.0   # sum of squared errors over all samples

        for features, targets in dataloader:
            if torch.isnan(features).any() or torch.isnan(targets).any():
                raise ValueError("NaN detected in features or targets")

            optimizer.zero_grad()
            # Drop only the trailing unit dimension: a bare squeeze() would also
            # collapse a batch of size 1 to a 0-d tensor and make MSELoss
            # broadcast against the (1,) target.
            outputs = model(features).squeeze(-1)
            loss = criterion(outputs, targets)
            if torch.isnan(loss):
                raise ValueError("Loss became NaN")

            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            # Weight each batch mean by its size so a short final batch does
            # not skew the epoch RMSE.
            squared_error_sum += batch_loss * targets.shape[0]

        rmse = (squared_error_sum / sample_count) ** 0.5
        print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, RMSE: {rmse:.4f}")

    # Final predictions on the full training set.
    model.eval()
    with torch.no_grad():
        inputs = torch.as_tensor(train_dataset.features)
        predictions = model(inputs).squeeze(-1).numpy()

    results = pd.DataFrame({
        # Positional values: the identifiers are matched to predictions by row
        # order, independent of the index carried by excluded_data.
        "CHASSIS_NUMBER": excluded_data["CHASSIS_NUMBER"].to_numpy(),
        "PREDICTED": predictions,
        "REAL": train_dataset.targets,
    })

    if output_path is not None:
        results.to_excel(output_path, index=False)
        print(f"Resultados salvos em {output_path}")

    return results


In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

target_column = "LAID_UP_TIME"

# Identify numeric and categorical columns.
num_cols = train_processed.select_dtypes(include=[np.number]).columns
cat_cols = train_processed.select_dtypes(include=['object', 'category']).columns

# Impute missing numeric FEATURES with the sentinel -1 (not the mean).
# The target is left out on purpose: filling a missing target with -1 would
# turn unlabeled rows into fake training labels.
feature_num_cols = num_cols.drop(target_column, errors="ignore")
train_processed[feature_num_cols] = train_processed[feature_num_cols].fillna(-1)

# Impute missing categorical values with the literal "unknown".
train_processed[cat_cols] = train_processed[cat_cols].fillna('unknown')

# Only rows with a known target are usable for supervised training.
# train_processed itself keeps every row, so it stays row-aligned with
# excluded_data.
labelled_rows = train_processed.dropna(subset=[target_column])

X = labelled_rows.drop(columns=[target_column])  # Features
y = labelled_rows[target_column]  # Target

# Train / validation split (80 / 20, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the model.
rf_model = RandomForestRegressor(
    n_estimators=100,  # Number of trees
    max_depth=100,  # Maximum depth (tune as needed)
    random_state=42,  # For reproducibility
    n_jobs=-1  # Use every available core
)

# Fit the model.
rf_model.fit(X_train, y_train)

# Predict on the validation split.
y_pred = rf_model.predict(X_val)

# Evaluate with RMSE.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE no conjunto de validação: {rmse:.4f}")

RMSE no conjunto de validação: 97.9643


In [None]:
# train_dataset = TabularDataset(train_processed, target_column)
# model = EmbeddingModel(input_dim=train_dataset.features.shape[1], embedding_dim=10)
# train_model(train_dataset, model, excluded_data)