In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import SGDRegressor, ElasticNet, HuberRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor  # Ensure to install xgboost

# Load datasets
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

# Data preprocessing
for dataset in [train_data, test_data]:
    dataset['DATE'] = pd.to_datetime(dataset['DATE'], dayfirst=True)
    dataset.drop(columns=['PRCP_A', 'PRCP_B', 'PRCP_C', 'Unnamed: 0', 'SNWD_A', 'SNWD_B', 'SNWD_C', 'ELEVATION_A', 'ELEVATION_B', 'ELEVATION_C'], errors='ignore', inplace=True)

temp_columns = ['TMAX_A', 'TMIN_A', 'TAVG_A', 'TMAX_B', 'TMIN_B', 'TAVG_B', 'TMAX_C', 'TMIN_C', 'TAVG_C']
imputer = SimpleImputer(strategy='median')
train_data[temp_columns] = imputer.fit_transform(train_data[temp_columns])
test_data[temp_columns] = imputer.transform(test_data[temp_columns])

train_data['TAVG_A'] = train_data[['TAVG_A', 'TMAX_A']].min(axis=1)
test_data['TAVG_A'] = test_data[['TAVG_A', 'TMAX_A']].min(axis=1)

# Function to extract date features
def extract_date_features(dataset):
    dataset['YEAR'] = dataset['DATE'].dt.year
    dataset['MONTH'] = dataset['DATE'].dt.month
    dataset['DAY'] = dataset['DATE'].dt.day
    dataset['DAYOFWEEK'] = dataset['DATE'].dt.dayofweek
    dataset['WEEKEND'] = (dataset['DAYOFWEEK'] >= 5).astype(int)
    dataset['DAYOFYEAR'] = dataset['DATE'].dt.dayofyear
    dataset['MONTH_DAY'] = dataset['MONTH'] * dataset['DAY']

for dataset in [train_data, test_data]:
    extract_date_features(dataset)

X = train_data.drop(columns=['DATE', 'TAVG'])
y = train_data['TAVG']
X_test = test_data.drop(columns=['INDEX', 'DATE'])

# Numeric transformer pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler())
])

X = numeric_transformer.fit_transform(X)
X_test = numeric_transformer.transform(X_test)

# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature selection using GradientBoostingRegressor
selection_model = GradientBoostingRegressor(random_state=0)
selection_model.fit(X_train, y_train)
selector = SelectFromModel(selection_model, threshold="mean", prefit=True)

X_train_selected = selector.transform(X_train)
X_valid_selected = selector.transform(X_valid)
X_test_selected = selector.transform(X_test)

# Define models with Huber loss and other models
sgd_best = SGDRegressor(alpha=0.001, eta0=0.01, learning_rate='invscaling', loss='huber', penalty='elasticnet', max_iter=1000, tol=1e-3, random_state=0)
mlp_best = MLPRegressor(hidden_layer_sizes=(100, 100), alpha=0.1, learning_rate_init=0.01, solver='adam', learning_rate='adaptive', max_iter=500, random_state=0)  # Adam optimizer is used here
svr_best = SVR(C=10, epsilon=0.1, kernel='rbf')
xgb_best = XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=200, objective='reg:squarederror', random_state=0)
elastic_best = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=1000, tol=0.01, random_state=0)

ada_best = AdaBoostRegressor(learning_rate=1.0, n_estimators=200, random_state=0)
rf_best = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=0)
gbr_best = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, max_depth=5, loss='huber', random_state=0)

# Stacking Regressor with the models
stacked_model = StackingRegressor(
    estimators=[
        ('mlp', mlp_best),
        ('xgb', xgb_best),
        ('gbr', gbr_best),
        ('rf', rf_best),
        ('elastic', elastic_best),
        ('huber', HuberRegressor(alpha=10, epsilon=1.5))
    ],
    final_estimator=SVR(C=10, epsilon=0.1, kernel='rbf')
)

# Fit the stacked model and predict
stacked_model.fit(X_train_selected, y_train)
y_valid_pred = stacked_model.predict(X_valid_selected)

# Evaluation metrics
mae = mean_absolute_error(y_valid, y_valid_pred)
mse = mean_squared_error(y_valid, y_valid_pred)
r2 = r2_score(y_valid, y_valid_pred)

print(f"Stacking Regressor - MAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")

# Predict on the test set and save submission
y_test_pred = stacked_model.predict(X_test_selected)
submission = sample_submission.copy()
submission['TAVG'] = y_test_pred
submission.to_csv('/content/submission.csv', index=False)

Stacking Regressor - MAE: 1.6799, MSE: 4.5854, R²: 0.9747
