In [11]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [1]:
# Colab-specific imports: `auth` for the sign-in call below, `bigquery` for the
# client that a later cell instantiates.
from google.colab import auth
from google.cloud import bigquery
# Sign the current Colab user in. Presumably this has to run before the
# BigQuery client is created so it can pick up the credentials — TODO confirm.
auth.authenticate_user()



In [4]:
# 2. BigQuery connection (replace with your own project ID)
PROJECT_ID = "dynamic-camp-468900-t5"
client = bigquery.Client(project=PROJECT_ID)


def _run_query(sql):
    """Run `sql` through the shared client and return the result as a DataFrame."""
    return client.query(sql).to_dataframe()


# 3. Fetch the data
# NOTE(review): the table paths repeat the project ID literally instead of using
# PROJECT_ID, and LIMIT is applied without ORDER BY, so which 1000 rows come
# back is left to BigQuery — confirm this sample is what the analysis wants.
query_train = "SELECT * FROM `dynamic-camp-468900-t5.enem.ml_treino` LIMIT 1000"
query_test = "SELECT * FROM `dynamic-camp-468900-t5.enem.ml_teste` LIMIT 1000"

df_train = _run_query(query_train)
df_test = _run_query(query_test)

In [5]:
# Show the training frame as the cell output (pandas truncates the repr).
# NOTE(review): `df_train.head()` would keep the saved output smaller.
df_train

Unnamed: 0,NU_INSCRICAO,NU_ANO,CO_MUNICIPIO_RESIDENCIA,NO_MUNICIPIO_RESIDENCIA,CO_UF_RESIDENCIA,SG_UF_RESIDENCIA,NU_IDADE,TP_SEXO,TP_ESTADO_CIVIL,TP_COR_RACA,...,Q018,Q019,Q020,Q021,Q022,Q023,Q024,Q025,Q026,Q027
0,180012620312,2018,1100015,Alta Floresta D'Oeste,11,RO,19,F,0,3,...,A,A,A,A,D,A,B,B,A,A
1,180011030871,2018,1100015,Alta Floresta D'Oeste,11,RO,19,F,0,3,...,A,B,B,A,C,A,B,B,A,A
2,180012122612,2018,1100023,Ariquemes,11,RO,21,F,0,5,...,A,C,B,A,D,A,A,A,A,A
3,180008657039,2018,1100023,Ariquemes,11,RO,19,M,0,3,...,A,B,A,A,C,B,B,B,A,A
4,180013153931,2018,1100023,Ariquemes,11,RO,21,M,0,3,...,A,B,A,A,C,A,B,B,A,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,180008324503,2018,1501402,Belém,15,PA,23,F,0,3,...,A,B,B,A,B,A,A,A,A,A
996,180012348388,2018,1501402,Belém,15,PA,23,F,0,1,...,A,B,B,A,C,A,B,A,A,A
997,180012558346,2018,1501402,Belém,15,PA,29,F,1,2,...,A,B,B,B,B,A,A,B,A,A
998,180007861870,2018,1501402,Belém,15,PA,23,F,0,3,...,A,B,A,A,D,A,B,B,A,C


In [6]:
# 4. Column selection
# Feature list: the year, the Q001..Q027 columns (generated rather than typed
# out), and four coded attributes. Order matches the original hand-written list.
questionario = [f"Q{numero:03d}" for numero in range(1, 28)]
colunas = [
    "NU_ANO",
    *questionario,
    "TP_ESTADO_CIVIL",
    "TP_COR_RACA",
    "TP_ESCOLA",
    "CO_UF_PROVA",
]
alvo = "NU_NOTA_MT"

# Split each frame into its feature matrix and target column.
X_train, y_train = df_train[colunas], df_train[alvo]
X_test, y_test = df_test[colunas], df_test[alvo]

In [16]:
def _features_as_strings(features):
    """Cast every feature column to str while keeping missing cells as np.nan.

    A plain ``astype(str)`` turns missing values into literal text such as
    "nan", "<NA>" or "None". Those are ordinary strings, so an imputer looking
    for np.nan would never see them. The missing-value mask is taken before the
    cast and re-applied afterwards, which also makes this safe to re-run.
    """
    is_present = features.notna()
    return features.astype(str).where(is_present, np.nan)


def _numeric_target(target):
    """Return the target as numbers; missing or unparseable entries become NaN."""
    return pd.to_numeric(target.replace({pd.NA: np.nan}), errors="coerce")


# 1. Make the targets numeric (pd.NA / bad values -> NaN)
y_train = _numeric_target(y_train)
y_test = _numeric_target(y_test)

# 2. Drop rows without a target: the regressor cannot be fitted on a NaN target
#    and the metrics cannot be computed against one. X and y share the index of
#    the frame they were sliced from, so the same boolean mask filters both.
train_has_target = y_train.notna()
test_has_target = y_test.notna()
X_train, y_train = X_train.loc[train_has_target], y_train.loc[train_has_target]
X_test, y_test = X_test.loc[test_has_target], y_test.loc[test_has_target]

# 3. Treat every feature as a categorical string (avoids mixed-type columns),
#    leaving missing cells as np.nan so the pipeline's imputer can fill them.
X_train = _features_as_strings(X_train)
X_test = _features_as_strings(X_test)

In [17]:
# 5. Preprocessing pipeline
# Every selected column goes through the same two steps: fill missing cells
# with the constant "0", then map each category to an integer code. Categories
# not seen during fit are encoded as -1.
etapas_categoricas = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="0")),
        ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)
preprocessador = ColumnTransformer(
    transformers=[("cat", etapas_categoricas, colunas)]
)

# 6. Random Forest model (fixed seed, all CPU cores)
floresta = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
modelo = Pipeline(
    steps=[
        ("preprocessador", preprocessador),
        ("regressor", floresta),
    ]
)

# 7. Training (left as the last expression so the fitted pipeline is displayed)
modelo.fit(X_train, y_train)

In [18]:
# 8. Predictions: run the fitted pipeline on the test features
y_pred = modelo.predict(X_test)

In [20]:
# 9. Evaluation
# The metric functions come from the notebook's top import cell.
# Score only rows whose true target is known: a NaN in y_test would make every
# metric below raise. y_pred is positional, so it is filtered with the same
# boolean mask converted to a plain array.
tem_nota = y_test.notna().to_numpy()
y_true_avaliado = y_test.loc[tem_nota]
y_pred_avaliado = np.asarray(y_pred)[tem_nota]

mae = mean_absolute_error(y_true_avaliado, y_pred_avaliado)
rmse = np.sqrt(mean_squared_error(y_true_avaliado, y_pred_avaliado))
r2 = r2_score(y_true_avaliado, y_pred_avaliado)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.3f}")

MAE: 64.51
RMSE: 80.94
R²: 0.056
