In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Read the whole CSV in one pass; low_memory=False asks pandas not to parse it
# in chunks, which is what can otherwise produce mixed-type columns.
df = pd.read_csv("/content/ashrae_db2.01.csv", low_memory=False)

# Rows without a PMV value have no target to learn from, so they are removed
df = df.dropna(subset=["PMV"])

# Object columns that parse cleanly as numbers are converted as a whole;
# a column containing any unparseable value is left exactly as it was.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    try:
        converted = pd.to_numeric(df[name])
    except ValueError:
        continue  # Not convertible: keep the original object column
    df[name] = converted

In [None]:
# Column the models are trained to predict
target_col = "PMV"

# The target is pulled out as a Series; every other column becomes a feature.
# NOTE(review): all non-target columns are kept as features. If the file holds
# quantities computed from PMV (e.g. a PPD-style column), they would leak the
# target into X -- confirm against the dataset's column list.
y = df[target_col]
X = df.drop(columns=target_col)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Group the training columns by dtype so each group gets a suitable transformer.
# "number" selects every integer and float width rather than only int64/float64,
# so a numeric column stored with another width is no longer skipped.
numeric_features = X_train.select_dtypes(include="number").columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Columns listed in neither group are discarded by the ColumnTransformer below
# (remainder="drop"). Report them so the omission is visible instead of silent.
assigned_features = set(numeric_features) | set(categorical_features)
unassigned_features = [
    column for column in X_train.columns if column not in assigned_features
]
if unassigned_features:
    print(f"Columns excluded from preprocessing (unhandled dtype): {unassigned_features}")

# Numeric columns: fill gaps with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" lets categories that appear only at predict time pass
# through without raising an error.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Combine both branches; remainder="drop" is the default, spelled out here so
# the handling of unlisted columns is explicit.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="drop",
)



In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Candidate regressors, keyed by the label used in the results table.
# Every estimator is seeded so repeated runs give the same numbers.
models = dict(
    AdaBoost=AdaBoostRegressor(n_estimators=30, random_state=69),
    DecisionTree=DecisionTreeRegressor(max_depth=3, random_state=69),
    ElasticNet=ElasticNet(random_state=69),
)

In [None]:
# Fit every candidate on the training split and score it on the held-out split
results = []

for name, model in models.items():
    # Preprocessing and estimator are chained so the transformers are fitted on
    # the training rows only. The same preprocessor object is reused by each
    # pipeline, so it is re-fitted on every iteration.
    clf = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model),
    ])
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # One row per model, with metrics rounded to four decimals for the table
    results.append({"Model": name, "MSE": round(mse, 4), "R² Score": round(r2, 4)})

# Collect the per-model rows into a single table and show it
results_df = pd.DataFrame(results)
print(results_df)

          Model     MSE  R² Score
0      AdaBoost  0.0323    0.9648
1  DecisionTree  0.0506    0.9447
2    ElasticNet  0.4728    0.4841
