In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import StackingRegressor
import tensorflow as tf

# Data Loading and Preparation
train_df = pd.read_csv('/content/transformed_train_2.csv')
test_df = pd.read_csv('/content/transformed_test_2.csv')
train_df = train_df.drop(columns=['DATE'])

# Feature Selection
features = [col for col in train_df.columns if col != 'TAVG' and col != 'Unnamed: 0']
X_train = train_df[features]
y_train = train_df['TAVG']
X_test = test_df[features]

# Feature Importance Analysis using ExtraTreesRegressor
from sklearn.ensemble import ExtraTreesRegressor

extra_trees = ExtraTreesRegressor(random_state=42)
extra_trees.fit(X_train, y_train)
feature_importances = extra_trees.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print("Feature Importance:")
print(importance_df)

# Feature Reduction
top_features = importance_df.head(20)['Feature'].tolist()
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

corr_matrix = X_train_reduced.corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.9)]
print(f"Features to drop due to high correlation: {to_drop}")
X_train_reduced = X_train_reduced.drop(columns=to_drop)
X_test_reduced = X_test_reduced.drop(columns=to_drop)

# Recursive Feature Elimination (RFE)
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)
fit = rfe.fit(X_train_reduced, y_train)
selected_features_rfe = X_train_reduced.columns[fit.support_]
print(f"Selected features using RFE: {selected_features_rfe}")

X_train_reduced = X_train_reduced[selected_features_rfe]
X_test_reduced = X_test_reduced[selected_features_rfe]

# Define NAG optimizer using TensorFlow
class NAGOptimizer(tf.keras.optimizers.Optimizer):
    def __init__(self, learning_rate=0.01, momentum=0.9, name="NAGOptimizer", **kwargs):
        super().__init__(name, **kwargs)
        self._learning_rate = learning_rate
        self._momentum = momentum

    def _resource_apply_dense(self, grad, var, apply_state=None):
        momentum_var = self.add_weight(name="momentum", shape=var.shape, initializer="zeros", trainable=False)
        momentum = self._momentum * momentum_var + grad
        var.assign_sub(self._learning_rate * (grad + self._momentum * momentum))
        momentum_var.assign(momentum)

    def get_config(self):
        config = super().get_config()
        config.update({"learning_rate": self._learning_rate, "momentum": self._momentum})
        return config

# Hyperparameter Tuning with RandomizedSearchCV and GridSearchCV
# RandomForestRegressor
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2']
}

rf_random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=10, cv=3, random_state=42
)
rf_random_search.fit(X_train_reduced, y_train)
best_rf_params = rf_random_search.best_params_

rf_grid_search = GridSearchCV(
    RandomForestRegressor(**best_rf_params, random_state=42),
    param_grid={'bootstrap': [True, False]},
    cv=3
)
rf_grid_search.fit(X_train_reduced, y_train)
best_rf_model = rf_grid_search.best_estimator_

# GradientBoostingRegressor
param_grid_gb = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7]
}

gb_random_search = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_distributions=param_grid_gb,
    n_iter=10, cv=3, random_state=42
)
gb_random_search.fit(X_train_reduced, y_train)
best_gb_params = gb_random_search.best_params_

gb_grid_search = GridSearchCV(
    GradientBoostingRegressor(**best_gb_params, random_state=42),
    param_grid={'subsample': [0.8, 1.0]},
    cv=3
)
gb_grid_search.fit(X_train_reduced, y_train)
best_gb_model = gb_grid_search.best_estimator_

# SGDRegressor with ADAM-like optimization
adam_sgd = SGDRegressor(
    max_iter=1000,
    tol=1e-3,
    learning_rate='constant',
    eta0=0.01,  # Learning rate
    penalty='elasticnet',  # ADAM-like approach using elasticnet
    random_state=42
)

param_grid_sgd = {
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': [0.15, 0.3, 0.5],
    'learning_rate': ['constant', 'optimal', 'invscaling']
}

sgd_random_search = RandomizedSearchCV(
    adam_sgd,
    param_distributions=param_grid_sgd,
    n_iter=10, cv=3, random_state=42
)
sgd_random_search.fit(X_train_reduced, y_train)
best_sgd_params = sgd_random_search.best_params_

sgd_grid_search = GridSearchCV(
    adam_sgd.set_params(**best_sgd_params),
    param_grid={'eta0': [0.01, 0.05, 0.1]},
    cv=3
)
sgd_grid_search.fit(X_train_reduced, y_train)
best_sgd_model = sgd_grid_search.best_estimator_

# Model Initialization and Stacking
base_models = [
    ('rf', best_rf_model),
    ('gb', best_gb_model),
    ('sgd', best_sgd_model)  # SGD replaces ExtraTrees in the base models
]

stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=best_sgd_model  # Using the SGD model with ADAM-like optimization
)

# Stacking Model Hyperparameter Tuning
stacking_param_grid = {
    'final_estimator__alpha': [0.0001, 0.001, 0.01],
    'final_estimator__l1_ratio': [0.15, 0.3, 0.5]
}

stacking_grid_search = GridSearchCV(
    stacking_regressor,
    param_grid=stacking_param_grid,
    cv=3
)
stacking_grid_search.fit(X_train_reduced, y_train)
best_stacking_model = stacking_grid_search.best_estimator_

# Pipeline Creation
stacking_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('stacking', best_stacking_model)
])

# Model Training and Evaluation
stacking_pipeline.fit(X_train_reduced, y_train)
y_train_pred = stacking_pipeline.predict(X_train_reduced)
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
print(f"Stacking Regressor - Mean Absolute Error on Training Set: {mae}")
print(f"Stacking Regressor - Mean Squared Error on Training Set: {mse}")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(stacking_pipeline, X_train_reduced, y_train, cv=kf, scoring='neg_mean_absolute_error')
print(f"Stacking Regressor - Cross-validated MAE: {-np.mean(cv_scores)}\n")

# Making Predictions and Saving Results
test_predictions = stacking_pipeline.predict(X_test_reduced)
submission_df = pd.DataFrame({'INDEX': test_df.index, 'TAVG': test_predictions})
submission_df.to_csv('/content/submission.csv', index=False)
print("Model training and prediction completed. Results saved to 'submission.csv'.")


Feature Importance:
                Feature  Importance
32              DAY_COS    0.254134
64          MEDIAN_TEMP    0.146496
62            MEAN_TEMP    0.100877
51       TEMP_ANOMALY_B    0.050472
7                TAVG_A    0.048787
..                  ...         ...
38         ELEV_DIFF_AB    0.000000
39         ELEV_DIFF_AC    0.000000
1           LONGITUDE_A    0.000000
53  LAT_LONG_INTERACT_A    0.000000
80     LOCATION_CLUSTER    0.000000

[81 rows x 2 columns]
Features to drop due to high correlation: ['MEAN_TEMP', 'ELEV_TEMP_INTERACT_A', 'AVG_TEMP_B', 'MONTH_COS', 'TAVG_C', 'DAY_SIN', 'TMAX_B', 'MONTH', 'TEMP_ANOMALY_A']
Selected features using RFE: Index(['DAY_COS', 'MEDIAN_TEMP', 'TEMP_ANOMALY_B', 'TAVG_A', 'TMIN_B',
       'MONTH_SIN', 'DAY_OF_YEAR', 'TMAX_A', 'TMAX_C', 'TMIN_C'],
      dtype='object')


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

Stacking Regressor - Mean Absolute Error on Training Set: 0.9326088260388901
Stacking Regressor - Mean Squared Error on Training Set: 1.2821688484146592
Stacking Regressor - Cross-validated MAE: 1.8251744386986501

Model training and prediction completed. Results saved to 'submission.csv'.
