In [5]:

!git clone https://github.com/PriorLabs/TabPFN.git
!pip install -e "TabPFN[dev]"

Cloning into 'TabPFN'...


Obtaining file:///C:/Users/rayba/Music/mestrado_dados/vale-nova/codigo/TabPFN
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Collecting pre-commit (from tabpfn==2.0.7)
  Downloading pre_commit-4.2.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting ruff (from tabpfn==2.0.7)
  Downloading ruff-0.11.2-py3-none-win_amd64.whl.metadata (26 kB)
Collecting mypy (from tabpfn==2.0.7)
  Downloading mypy-1.15.0-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting pytest (from tabpfn==2.0.7)
  Using cached pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
C

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tabpfn-extensions 0.1.0 requires numpy<2,>=1.21.0, but you have numpy 2.2.4 which is incompatible.


In [6]:
from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Assuming there is a TabPFNRegressor (if not, a different regressor should be used)
from tabpfn import TabPFNRegressor  

# Fetch OpenML dataset 531 (Boston Housing) with pandas containers.
df = fetch_openml(data_id=531, as_frame=True)
X, y = df.data, df.target.astype(float)  # cast the target to float for regression

# Hold out half of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Fit a TabPFN regressor with its default settings on the training half.
regressor = TabPFNRegressor()
regressor.fit(X_train, y_train)

# Predict the held-out half and score the predictions.
predictions = regressor.predict(X_test)
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print("Mean Squared Error (MSE):", mse)
print("R² Score:", r2)

ModuleNotFoundError: No module named 'tabpfn'

In [None]:
import os
import gc
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Modelos
from tabpfn import TabPFNRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor  # <-- importando XGBRegressor

###############################################################################
# 1) Load and pre-process the dataset
###############################################################################
caminho_arquivo = "Flotacao_conceicao_II_all.csv"
df = pd.read_csv(caminho_arquivo, sep=';', low_memory=True)

# Parse the timestamp column and use it as the index.
df['Data'] = pd.to_datetime(df['Data'], format='%d/%m/%Y %H:%M')
df = df.set_index('Data')

# Decimal commas -> decimal points, cast every column to float32, then make
# sure the rows are in chronological order.
df = (
    df.replace(',', '.', regex=True)
    .astype(np.float32)
    .sort_index()
)

# Example filters: mask the data where the feed-flow column is <= 350 or the
# feed-density column is <= 1.3, then drop every row containing a missing value.
df = df.mask(df['Vazão de alimentação da flotação (m³/h)'] <= 350)
df = df.mask(df['Densid alim flot (t/m³)'] <= 1.3).dropna()

# Treat values <= 0 as missing and fill them from neighbouring rows
# (forward fill first, then backward fill).
df = df.mask(df <= 0).ffill().bfill()

target_col = 'SiO2 C flot (%)'

# Columns that must NOT be used as features (the target itself is the last entry).
features_remove = [
    '+0,15mm alim flot (%)', 'Fe alim flot (%)', 'Est Esp Cleaner 2 (m/s)',
    'SiO2 alim flot  (%)', 'Est Esp Cleaner 1 (m/s)', 'Fe (%)',
    'Tem Res Rougher 2 (min)', 'MgO alim flot (%)', 'Tem Res Cleaner 1 (min)',
    'Tem Res Re-Cleaner  (min)', 'Tem Res Cleaner 2 (min)',
    'Pressão desl 3ª S (kgf/cm²)', '2Cleaner (cm)', 'Est Esp Re-Cleaner (m/s)',
    '+0,25mm alim flot (%)', 'SiO2  (%)', '+6,3mm (%)',
    'Al2O3 alim flot (%)', 'Densid alim flot (t/m³)',
    'Pressão desl 2ª S (kgf/cm²)', 'Fe C flot (%)', 'SiO2 C flot (%)',
]

###############################################################################
# 2) Lagged copies of the target column (currently disabled)
###############################################################################
# Example: create 3 lags (t-1, t-2, t-3)
# n_lags = 3
# for i in range(1, n_lags + 1):
#     df[f'{target_col}_lag{i}'] = df[target_col].shift(i)

# Lag creation would leave NaN in the first rows (no earlier value exists),
# so rows with missing values are dropped here.
df = df.dropna()

###############################################################################
# 3) Define X and y
###############################################################################
# 'SiO2 C flot (%)' is the column to predict; every column not listed in
# features_remove (including lag columns, when enabled) becomes a feature.
y = df[target_col]
X = df.drop(columns=features_remove, errors='ignore')

###############################################################################
# 4) Train/test split (last 20% of the rows held out for testing)
###############################################################################
# shuffle=False keeps the row order, so the test set is the tail of the data.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False, random_state=42
)

# Standardise the features using statistics from the training part only.
scaler = StandardScaler().fit(X_train_full)
X_train_full_scaled = scaler.transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

###############################################################################
# 5) Função de Inductive Conformal Prediction
###############################################################################
def inductive_conformal_prediction(model, X_train_prop, y_train_prop,
                                   X_cal, y_cal, X_test, alpha=0.05):
    """
    Inductive (split) conformal prediction with absolute-residual scores.

    The model is fitted on the proper-training data, its absolute errors on the
    calibration data are computed, and a quantile of those errors (``q_hat``)
    is used as a symmetric half-width around the test predictions.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_prop, y_train_prop : data used to fit ``model``.
    X_cal, y_cal : calibration data; must not be empty.
    X_test : data to predict and build intervals for.
    alpha : target miscoverage level (intervals aim at ``1 - alpha`` coverage).

    Returns
    -------
    tuple
        ``(y_pred_test, lower, upper, q_hat, training_time)`` where ``lower``
        and ``upper`` are ``y_pred_test -/+ q_hat`` and ``training_time`` is
        the duration of ``model.fit`` in seconds.

    Raises
    ------
    ValueError
        If the calibration set is empty.
    """
    import warnings

    n_cal = len(y_cal)
    if n_cal == 0:
        raise ValueError("calibration set is empty; cannot compute a conformal quantile")

    # perf_counter is a monotonic clock, better suited to measuring durations.
    start_time = time.perf_counter()
    model.fit(X_train_prop, y_train_prop)
    training_time = time.perf_counter() - start_time

    # Absolute residuals on the calibration set. Both sides are flattened to
    # plain arrays so the subtraction is positional (no index alignment and no
    # accidental (n,) vs (n, 1) broadcasting).
    y_cal_pred = model.predict(X_cal)
    cal_errors = np.abs(np.asarray(y_cal).ravel() - np.asarray(y_cal_pred).ravel())

    # Finite-sample corrected quantile level for 1 - alpha coverage.
    alpha_adj = np.ceil((n_cal + 1) * (1 - alpha)) / n_cal
    if alpha_adj > 1.0:
        # With too few calibration points the corrected level exceeds 1, which
        # np.quantile rejects. Fall back to the largest calibration error; the
        # nominal coverage is not guaranteed in that case.
        warnings.warn(
            f"calibration set too small (n_cal={n_cal}) for alpha={alpha}; "
            "using the maximum calibration error as q_hat",
            RuntimeWarning,
            stacklevel=2,
        )
        alpha_adj = 1.0
    q_hat = np.quantile(cal_errors, alpha_adj)

    # Test predictions and symmetric intervals.
    y_pred_test = model.predict(X_test)
    lower = y_pred_test - q_hat
    upper = y_pred_test + q_hat

    return y_pred_test, lower, upper, q_hat, training_time

###############################################################################
# 6) Split the training part (80%) into proper-train and calibration sets
###############################################################################
# shuffle=False keeps the row order, so the calibration set is the tail of the
# training data.
X_train_prop, X_cal, y_train_prop, y_cal = train_test_split(
    X_train_full_scaled,
    y_train_full,
    shuffle=False,
    random_state=42,
    test_size=0.1,  # fraction of the training data reserved for calibration
)

# Report the size of each partition.
print("Tamanhos:")
print("  Proper Train =", X_train_prop.shape)
print("  Calib        =", X_cal.shape)
print("  Test         =", X_test_scaled.shape)

###############################################################################
# 7) (Opcional) Função para limitar tamanho do treinamento TabPFN
###############################################################################
def limit_training_size(X_data, y_data, max_size=10000):
    """
    Cap a training set at ``max_size`` entries.

    When ``X_data`` holds more than ``max_size`` entries, the first
    ``max_size`` entries of both ``X_data`` and ``y_data`` are returned
    (via slicing); otherwise both inputs are returned unchanged.
    """
    # Nothing to trim: hand the inputs back as they are.
    if len(X_data) <= max_size:
        return X_data, y_data
    return X_data[:max_size], y_data[:max_size]

###############################################################################
# 8) Model registry (including XGBRegressor)
###############################################################################
models = {
    "TabPFN": TabPFNRegressor(ignore_pretraining_limits=True),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "XGB": XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1),
}

###############################################################################
# 9) Fit every model and build its conformal intervals
###############################################################################
alpha = 0.05  # miscoverage level -> 95% target coverage
results = {}

for model_name, model in models.items():
    print(f"\n=== {model_name} ===")

    # Only TabPFN gets its training set capped; RandomForest and XGB use all rows.
    if model_name != "TabPFN":
        X_train_prop_used, y_train_prop_used = X_train_prop, y_train_prop
    else:
        X_train_prop_used, y_train_prop_used = limit_training_size(
            X_train_prop, y_train_prop, max_size=10000
        )

    y_pred_test, lower_test, upper_test, q_hat, training_time = inductive_conformal_prediction(
        model,
        X_train_prop_used, y_train_prop_used,
        X_cal, y_cal,
        X_test_scaled,
        alpha=alpha,
    )

    # Point-prediction metrics on the test set.
    mse = mean_squared_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)
    rmse = np.sqrt(mse)

    print(f"{model_name} => MSE: {mse:.4f} | RMSE: {rmse:.4f} | R2: {r2:.4f} | Tempo treino: {training_time:.2f}s | q_hat={q_hat:.2f}")

    # Empirical coverage: share of test targets inside [lower, upper] (bounds inclusive).
    within_interval = y_test.between(lower_test, upper_test)
    coverage = within_interval.mean()
    print(f"Coverage (95% esperado): {coverage:.3f}")

    # Keep targets, predictions and interval bounds together, indexed like y_test.
    df_res = pd.DataFrame(
        {
            'actual': y_test,
            'predicted': y_pred_test,
            'lower': lower_test,
            'upper': upper_test,
        },
        index=y_test.index,
    )
    results[model_name] = df_res

###############################################################################
# 10) Plot the test-set results
###############################################################################
# One figure per model: actual values, predictions and the conformal band.
for model_name, df_res in results.items():
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(df_res.index, df_res['actual'], label='Valor Real', color='blue')
    ax.plot(df_res.index, df_res['predicted'], label=f'{model_name}_Pred', color='red')
    ax.fill_between(
        df_res.index,
        df_res['lower'],
        df_res['upper'],
        color='gray',
        alpha=0.2,
        label='Conformal Interval',
    )
    ax.set_xlabel('Data')
    ax.set_ylabel('SiO2 C flot (%)')
    ax.set_title(f'Conformal Prediction - {model_name} (com lags)')
    ax.legend()
    fig.tight_layout()
    plt.show()

# Optional: write each model's result table to its own CSV file.
for model_name, df_res in results.items():
    csv_name = f"predictions_{model_name}_lags.csv"
    df_res.to_csv(csv_name)
    print(f"Arquivo salvo: {csv_name}")

print("\nPipeline concluído!")