In [None]:
# --- 1. Libraries ---
import pandas as pd
import numpy as np
import os
# Set the credentials variable before the BigQuery client is imported/created.
# NOTE(review): this is an absolute, machine-specific path to a key file; the
# notebook will not run on another machine as-is. Consider exporting
# GOOGLE_APPLICATION_CREDENTIALS outside the notebook instead of hardcoding it.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"C:\Users\Eros\Documents\__Proyecto Integral__Kaggle__ASHRAE Energy Predictor\ashrae-bq-project-f47cdb69be15.json"
from google.cloud import bigquery
# Modelling utilities: data splitting, the four regressors compared below, and metrics.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# --- 2. Connection to BigQuery ---
client = bigquery.Client(location="EU")

# --- 3. Query ---
query = """
SELECT *
FROM `ashrae-bq-project.ashrae_dataset.silver_norm_features_v5`
LIMIT 1000000
"""

# Run the query and load the result into a DataFrame. The "data_split" column
# was created at the origin to split train/test and is not needed here, so it
# is dropped when present.
df = (
    client.query(query)
    .to_dataframe()
    .drop(columns=["data_split"], errors="ignore")
)

print("Downloaded Data:", df.shape)

In [None]:
# --- 4. Basic preprocessing ---
# Discard every row containing a null, then one-hot encode the categorical
# variables (the first level of each is dropped).
categorical_cols = ["primary_use", "primary_use_grouped", "hour_block"]
df = pd.get_dummies(df.dropna(), columns=categorical_cols, drop_first=True)

# Target & features
y = df["meter_reading_log"]
X = df.drop(columns="meter_reading_log")

In [None]:
# --- 5. Function for training & evaluating a model ---
def evaluate_model(model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a random train split and score it on the held-out split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(r2, rmse)`` computed on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # The `squared=False` argument of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
    # gives the same RMSE for a single-output target on every version.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, rmse

In [4]:
# --- 6. Evaluation with all the data ---

r2_lin, rmse_lin = evaluate_model(LinearRegression(), X, y)
r2_rf, rmse_rf = evaluate_model(RandomForestRegressor(n_estimators=50, random_state=42), X, y)
r2_xgb, rmse_xgb = evaluate_model(XGBRegressor(tree_method="hist", n_estimators=300, learning_rate=0.1, max_depth=8, random_state=42), X, y)
r2_gb, rmse_gb = evaluate_model(GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42), X, y)

# Dictionary with the results (one entry per model, in the same order in every list)
results = {
    "Model": ["LinearRegression", "RandomForest", "XGBRegressor", "GradientBoostingRegressor"],
    "R2_Score": [r2_lin, r2_rf, r2_xgb, r2_gb],
    "RMSE": [rmse_lin, rmse_rf, rmse_xgb, rmse_gb]
}

df_results = pd.DataFrame(results)

# Show as a table sorted by R2, best model first.
# The sort key must be the column name defined in `results` ("R2_Score");
# sorting by "R2" raises KeyError because no such column exists.
df_results = df_results.sort_values(by="R2_Score", ascending=False).reset_index(drop=True)
display(df_results)

Unnamed: 0,Modelo,R2,RMSE
0,XGBRegressor,0.980956,0.241944
1,GradientBoostingRegressor,0.978828,0.255106
2,RandomForest,0.977836,0.261013
3,LinearRegression,0.685116,0.983821
