# 1. Import de Bibliotecas e de Algoritmos (v4 - Melhorado)

# In [1]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import os
import json
from category_encoders import TargetEncoder
from sklearn.linear_model import Ridge

# Importar modelos
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

# Tuning settings shared by the model factory and the stacking routine
N_ESTIMATORS_XGB = 2000 # Raised to pair with the low learning rate
LEARNING_RATE_XGB = 0.01 # Lowered for better precision
N_ESTIMATORS_GBM = 1500
LEARNING_RATE_GBM = 0.01
N_ESTIMATORS_RF = 500
KFOLD_SPLITS = 5 # Use K-Fold for a more robust validation

# 2. FEATURE ENGINEERING - Extração e Limpeza de Dados

# In [2]:
def feature_engeneering(df):
    """Derive modelling features from a raw car-listing DataFrame.

    The input is not modified; a copy with extra columns is returned.

    Columns read: ``engine``, ``model_year``, ``milage``, ``fuel_type``,
    ``transmission``, ``ext_col``, ``int_col``, ``clean_title``, ``accident``.
    ``milage`` and ``model_year`` are used in arithmetic, so they are assumed
    to be numeric already.

    Columns added: ``hp``, ``liters``, ``car_age``, ``cylinders``,
    ``is_turbo``, ``turbo_type``, ``valve_train``, ``fuel_injection``,
    ``miles_p_year``, ``transmission_type``, ``ext_col_simple``,
    ``int_col_simple``, ``accident_clean``, ``hp_per_liter``,
    ``hp_per_cylinder``, ``milage_log``, ``price_depreciation_indicator``.
    ``fuel_type`` and ``clean_title`` are normalised in place.

    Duplicate rows are removed only when a ``price`` column is present
    (labelled data). An unlabelled frame keeps every row, because each of
    its rows needs its own prediction.

    Args:
        df: Raw listing data.

    Returns:
        A new DataFrame with the engineered columns.
    """
    df_eng = df.copy()

    # --- Initial cleanup ---
    # Only labelled data is de-duplicated; dropping rows from the frame to be
    # predicted would leave some ids without a prediction.
    if 'price' in df_eng.columns:
        df_eng = df_eng.drop_duplicates()

    # Every derived column is read from the cleaned copy so that the values
    # and the rows they are assigned to always share the same index.
    engine = df_eng['engine']

    df_eng['hp'] = engine.str.extract(r'(\d+\.?\d*)HP', expand=False).astype(float)
    # Accepts "1.6L", "1.6 L" and "3.5 Liter", including at the end of the text.
    df_eng['liters'] = engine.str.extract(
        r'(\d+\.?\d*)\s*L(?:iter)?\b', expand=False
    ).astype(float)

    # --- Age and usage ---
    var_ano_atual = date.today().year
    # Clamp to at least one year: a current-year car would divide by zero and
    # a next-model-year car would otherwise get a negative age.
    df_eng['car_age'] = (var_ano_atual - df_eng['model_year']).clip(lower=1)

    # --- Cylinder count ---
    # Prefer the explicit "N Cylinder" wording, fall back to the "V<N>" layout.
    cilindros = engine.str.extract(r'(\d+)\s+Cylinder', expand=False)
    cilindros = cilindros.fillna(engine.str.extract(r'V(\d+)', expand=False))
    df_eng['cylinders'] = cilindros.astype(float)

    # --- Engine technology flags ---
    df_eng['is_turbo'] = engine.str.contains(r'(?i)turbo', na=False).astype(int)
    df_eng['turbo_type'] = engine.str.extract(r'(Twin Turbo|Turbo)', expand=False)
    df_eng['valve_train'] = engine.str.extract(r'(DOHC|OHV|SOHC)', expand=False)
    df_eng['fuel_injection'] = engine.str.extract(r'(PDI|GDI|MPFI)', expand=False)

    # Miles per year
    df_eng['miles_p_year'] = df_eng['milage'] / df_eng['car_age']

    # --- Fuel type ---
    def clean_fuel(val):
        """Collapse hybrid variants to 'Hybrid' and 'not supported' to 'EV'."""
        s = str(val).lower()
        if 'hybrid' in s:
            return 'Hybrid'
        if 'not supported' in s:
            return 'EV'
        return val
    df_eng['fuel_type'] = df_eng['fuel_type'].apply(clean_fuel)

    # --- Transmission type ---
    def clean_transmission(val):
        """Bucket the free-text transmission into three coarse labels."""
        s = str(val).lower()
        if 'automatic' in s or 'a/t' in s or 'cvt' in s:
            return 'Automatico'
        if 'manual' in s or 'm/t' in s:
            return 'Manual'
        return 'Outro'
    df_eng['transmission_type'] = df_eng['transmission'].apply(clean_transmission)

    # --- Colours ---
    # Kept at full cardinality; the *_simple columns are the ones selected
    # for encoding downstream.
    df_eng['ext_col_simple'] = df_eng['ext_col']
    df_eng['int_col_simple'] = df_eng['int_col']

    # --- Missing values in text columns ---
    cols_texto = df_eng.select_dtypes(include=['object']).columns
    df_eng[cols_texto] = df_eng[cols_texto].replace('-', 'Unknown').fillna('Unknown')
    # A missing clean-title flag is treated as "No".
    df_eng['clean_title'] = df_eng['clean_title'].replace('Unknown', 'No')

    # --- Accident flag ---
    def verificar_acidente(valor):
        """Return 0 when the text contains 'None', otherwise 1."""
        return 0 if 'None' in str(valor) else 1
    df_eng['accident_clean'] = df_eng['accident'].apply(verificar_acidente)

    # 1. Power per litre; a zero displacement is nudged to avoid dividing by zero.
    df_eng['hp_per_liter'] = df_eng['hp'] / (df_eng['liters'].replace(0, 0.001))

    # 2. Power per cylinder, with the same zero guard.
    df_eng['hp_per_cylinder'] = df_eng['hp'] / (df_eng['cylinders'].replace(0, 0.001))

    # 3. Log-transformed mileage.
    df_eng['milage_log'] = np.log1p(df_eng['milage'])

    # 4. Depreciation indicator: mileage relative to age and power.
    df_eng['price_depreciation_indicator'] = df_eng['milage'] / (
        df_eng['car_age'] * df_eng['hp'].replace(0, 1)
    )

    return df_eng

# 3. PREPARAÇÃO DE DADOS (Com Target Encoding)

# In [3]:
def _target_encode_coluna(X_cat, y_log, X_test_cat, col, kf):
    """Target-encode one categorical column without using a row's own target.

    Training rows receive out-of-fold encodings: each fold is encoded by an
    encoder fitted on the remaining folds. Test rows are encoded by an
    encoder fitted on the whole training set, which is also returned.
    """
    treino_col = X_cat[[col]]
    codificado = pd.Series(np.nan, index=X_cat.index, dtype=float)

    for idx_fit, idx_apply in kf.split(treino_col):
        te_fold = TargetEncoder(cols=[col], smoothing=0.2)
        te_fold.fit(treino_col.iloc[idx_fit], y_log.iloc[idx_fit])
        codificado.iloc[idx_apply] = (
            te_fold.transform(treino_col.iloc[idx_apply])[col].to_numpy()
        )

    te = TargetEncoder(cols=[col], smoothing=0.2)
    te.fit(treino_col, y_log)
    teste_codificado = te.transform(X_test_cat[[col]])[col]

    return codificado, teste_codificado, te


def preparar_dados(df_treino, df_teste):
    """Build the numeric train/test matrices used for modelling.

    Runs ``feature_engeneering`` on both frames, log-transforms the ``price``
    target with ``log1p``, fills missing numeric features with 0 and
    target-encodes the categorical features.

    Training rows are encoded out-of-fold. Fitting the encoder on a row's own
    target would leak that target into its features and make the later
    K-Fold validation scores look better than they are. The folds use the
    same ``KFold`` settings (``KFOLD_SPLITS``, shuffle, ``random_state=42``)
    as the stacking routine.

    Args:
        df_treino: Raw training data, including the ``price`` column.
        df_teste: Raw data to predict on.

    Returns:
        Tuple ``(X_final, y_log, X_test_final, encoders)`` where ``encoders``
        maps each categorical column name to the ``TargetEncoder`` fitted on
        the full training set.
    """

    # Apply feature engineering
    df_treino_eng = feature_engeneering(df_treino)
    df_teste_eng = feature_engeneering(df_teste)

    # Split off the target and move it to log space
    y = df_treino_eng['price']
    y_log = np.log1p(y)
    X = df_treino_eng.drop('price', axis=1)
    X_test = df_teste_eng.copy()

    # Features kept for modelling
    features_numericas = ['hp', 'liters', 'car_age', 'cylinders', 'miles_p_year',
                          'milage_log', 'model_year', 'is_turbo', 'hp_per_liter',
                          'hp_per_cylinder', 'price_depreciation_indicator', 'accident_clean']

    features_categoricas = ['brand', 'model', 'fuel_type', 'transmission_type',
                           'ext_col_simple', 'int_col_simple', 'clean_title',
                           'turbo_type', 'valve_train', 'fuel_injection']

    # Numeric block: missing values become 0
    X_num = X[features_numericas].fillna(0)
    X_test_num = X_test[features_numericas].fillna(0)

    # Categorical block: leakage-free target encoding, one encoder per column
    X_cat_bruto = X[features_categoricas]
    X_test_cat_bruto = X_test[features_categoricas]
    X_cat = pd.DataFrame(index=X.index)
    X_test_cat = pd.DataFrame(index=X_test.index)

    kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=42)

    encoders = {}
    for col in features_categoricas:
        treino_cod, teste_cod, te = _target_encode_coluna(
            X_cat_bruto, y_log, X_test_cat_bruto, col, kf
        )
        X_cat[col] = treino_cod
        X_test_cat[col] = teste_cod
        encoders[col] = te

    # Concatenate the two blocks
    X_final = pd.concat([X_num, X_cat], axis=1)
    X_test_final = pd.concat([X_test_num, X_test_cat], axis=1)

    # Safety net: any NaN left in the test matrix (for example from a category
    # the encoder could not map) is replaced by the training column mean.
    X_test_final = X_test_final.fillna(X_final.mean())

    return X_final, y_log, X_test_final, encoders

# 4. DEFINIÇÃO DE MODELOS (Otimizados para Stacking)

# In [4]:
def obter_modelos_base():
    """Create the level-0 regressors of the stacking ensemble.

    Returns:
        Dict mapping a display name to an unfitted estimator, in the order
        XGBoost, GradientBoosting, RandomForest. Estimator counts and
        learning rates come from the module-level tuning constants.
    """
    params_xgb = dict(
        n_estimators=N_ESTIMATORS_XGB,
        learning_rate=LEARNING_RATE_XGB,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=0.7,
        # L1 / L2 regularisation
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42,
        n_jobs=-1,
    )
    params_gbm = dict(
        n_estimators=N_ESTIMATORS_GBM,
        learning_rate=LEARNING_RATE_GBM,
        max_depth=5,
        subsample=0.7,
        random_state=42,
    )
    params_rf = dict(
        n_estimators=N_ESTIMATORS_RF,
        max_depth=18,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1,
    )

    modelos = {}
    modelos['XGBoost'] = xgb.XGBRegressor(**params_xgb)
    modelos['GradientBoosting'] = GradientBoostingRegressor(**params_gbm)
    modelos['RandomForest'] = RandomForestRegressor(**params_rf)
    return modelos

def obter_meta_learner():
    """Create the level-1 (meta) model that blends the base predictions.

    Returns:
        An unfitted ``Ridge`` regressor with ``alpha=1.0``; chosen because it
        is fast and regularised.
    """
    meta_learner = Ridge(alpha=1.0)
    return meta_learner

# 5. Implementação de Stacking com K-Fold Cross-Validation

# In [5]:
def treinar_e_prever_stacking(X, y, X_test, modelos_base, meta_learner):
    """Train a two-level stacking ensemble and predict the test set.

    For each base model, out-of-fold predictions on ``X`` form one column of
    the level-1 training matrix. That model's test column is the mean of its
    per-fold predictions on ``X_test``. The meta-learner is then fitted on
    the level-1 matrix and applied to the level-1 test matrix.

    ``y`` is expected in ``log1p`` space: predictions and the reported RMSE
    values are mapped back with ``expm1``.

    Args:
        X: Training features (DataFrame; indexed positionally with ``iloc``).
        y: Log-transformed target (Series aligned with ``X``).
        X_test: Test features with the same columns as ``X``.
        modelos_base: Dict ``name -> estimator``. The estimators are fitted in
            place and end up trained on the full training set.
        meta_learner: Estimator fitted on the out-of-fold predictions.

    Returns:
        Tuple ``(final_pred_reais, modelos_finais, metricas_validacao)``:
        test predictions on the original price scale, the base models
        re-fitted on all data, and a dict with ``Stacking_RMSE`` and the
        per-model mean fold RMSE under ``Base_Model_RMSEs``.
    """

    kf = KFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=42)

    # Level-1 matrices: one column per base model.
    S_train = np.zeros((X.shape[0], len(modelos_base)))
    S_test = np.zeros((X_test.shape[0], len(modelos_base)))

    # Base models re-fitted on 100% of the data.
    modelos_finais = {}
    # Mean fold RMSE of each base model, recorded inside the loop so that
    # every model reports its own score.
    rmse_por_modelo = {}

    print(f"Iniciando Stacking com {KFOLD_SPLITS}-Fold Cross-Validation...")

    for i, (nome, modelo) in enumerate(modelos_base.items()):
        print(f"\n‚öôÔ∏è Processando Modelo Base: {nome}")

        # 1. Out-of-fold predictions (level-1 training column)
        S_test_i = np.zeros((X_test.shape[0], kf.n_splits))
        rmse_folds = []

        for j, (train_index, val_index) in enumerate(kf.split(X, y)):
            X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
            y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

            modelo.fit(X_train_fold, y_train_fold)
            val_pred = modelo.predict(X_val_fold)
            S_train[val_index, i] = val_pred

            # Test-set prediction from this fold's model
            S_test_i[:, j] = modelo.predict(X_test)

            # RMSE on the original price scale
            rmse_fold = np.sqrt(mean_squared_error(np.expm1(y_val_fold), np.expm1(val_pred)))
            rmse_folds.append(rmse_fold)
            print(f"   Fold {j+1} RMSE: ${rmse_fold:,.2f}")

        S_test[:, i] = S_test_i.mean(axis=1)
        rmse_media = np.mean(rmse_folds)
        rmse_por_modelo[nome] = float(rmse_media)
        print(f"   M√©dia RMSE K-Fold: ${rmse_media:,.2f}")

        # 2. Re-fit the base model on all data (kept for later use)
        print(f"   üîÑ Re-treinando {nome} em 100% dos dados...")
        modelo.fit(X, y)
        modelos_finais[nome] = modelo

    # 3. Fit the meta-learner on the out-of-fold predictions
    print("\nüèÜ Treinando Meta-Learner (Ridge) no N√≠vel 1...")
    meta_learner.fit(S_train, y)

    # 4. Final prediction, mapped back from log space
    final_pred_log = meta_learner.predict(S_test)
    final_pred_reais = np.expm1(final_pred_log)

    # Stacking RMSE on the out-of-fold base predictions. The meta-learner was
    # fitted on these same rows, so this figure is in-sample for level 2.
    val_pred_stacking = meta_learner.predict(S_train)
    rmse_stacking = np.sqrt(mean_squared_error(np.expm1(y), np.expm1(val_pred_stacking)))

    metricas_validacao = {
        'Stacking_RMSE': rmse_stacking,
        'Base_Model_RMSEs': rmse_por_modelo
    }

    return final_pred_reais, modelos_finais, metricas_validacao

# 6. Funções Auxiliares (Manter a função de salvar)

# In [6]:
def salvar_submissao_log(df_sub, modelos_finais, metricas):
    """Write a numbered submission CSV and a JSON file describing it.

    The files go to ``submissoes_v4/`` (created when missing). The number
    used is one more than the highest ``submission_<n>.csv`` already there,
    or 1 when no such file exists.

    Args:
        df_sub: DataFrame written as the CSV, without its index.
        modelos_finais: Dict ``name -> estimator``; each estimator's
            ``get_params()`` is stored in the JSON.
        metricas: Validation metrics stored in the JSON as given.
    """
    pasta = 'submissoes_v4'
    os.makedirs(pasta, exist_ok=True)

    def _ids_usados():
        # Yield the numeric id of every well-formed submission CSV in the folder.
        for nome_arquivo in os.listdir(pasta):
            eh_csv_submissao = (
                nome_arquivo.startswith('submission_') and nome_arquivo.endswith('.csv')
            )
            if not eh_csv_submissao:
                continue
            miolo = nome_arquivo.replace('submission_', '').replace('.csv', '')
            try:
                numero = int(miolo)
            except ValueError:
                # Not a plain number between prefix and suffix; ignore the file.
                continue
            yield numero

    next_id = max(_ids_usados(), default=0) + 1

    # Output paths
    filename_csv = f"{pasta}/submission_{next_id}.csv"
    filename_json = f"{pasta}/submission_{next_id}_params.json"

    # Predictions
    df_sub.to_csv(filename_csv, index=False)

    # Run description
    hiperparametros = {}
    for nome, modelo in modelos_finais.items():
        hiperparametros[nome] = modelo.get_params()

    metadata = {
        "id": next_id,
        "modelo": "Stacking_XGB_GB_RF_Ridge",
        "performance_validacao": metricas,
        "hiperparametros_base": hiperparametros,
    }

    # Values json cannot encode natively are written as their str() form.
    with open(filename_json, 'w', encoding='utf-8') as saida:
        json.dump(metadata, saida, indent=4, default=str)

    print(f"\n‚úÖ Submiss√£o #{next_id} salva com sucesso!")
    print(f"   üìÇ {filename_csv}")

# 7. Execução do Main (Novo Pipeline)

# In [7]:
if __name__ == "__main__":
    # 1. Load data
    print("üìÇ Carregando dados...")
    # NOTE: 'dados/train.csv' and 'dados/test.csv' must be available.
    try:
        df_treino = pd.read_csv('dados/train.csv', index_col='id')
        df_teste = pd.read_csv('dados/test.csv', index_col='id')
    except FileNotFoundError:
        print("ERRO: Ficheiros de dados n√£o encontrados. Certifique-se de que est√£o em 'dados/train.csv' e 'dados/test.csv'.")
        # Exit with a failure status. The bare exit() helper is injected by
        # the site module and reports success (status 0).
        raise SystemExit(1)

    # 2. Preparation (target encoding and log target included)
    print("üîÑ Preparando dados com Target Encoding...")
    X, y_log, X_test_final, encoders = preparar_dados(df_treino, df_teste)

    # 3. Build the models
    modelos_base = obter_modelos_base()
    meta_learner = obter_meta_learner()

    # 4. Training and prediction with stacking and K-Fold
    previsoes_finais, modelos_finais, metricas_validacao = treinar_e_prever_stacking(
        X, y_log, X_test_final, modelos_base, meta_learner
    )

    # 5. Submission
    print("\nüèÜ Gerando submiss√£o Stacking...")
    # Take the ids from the matrix that was actually predicted, so ids and
    # predictions always have the same length and order.
    df_submissao = pd.DataFrame({
        'id': X_test_final.index,
        'price': previsoes_finais
    })

    salvar_submissao_log(df_submissao, modelos_finais, metricas_validacao)