In [None]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     cross_validate, KFold)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import (GradientBoostingRegressor, StackingRegressor,
                              ExtraTreesRegressor, RandomForestRegressor)
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer

# Read the training set, the test set and the submission template (in that order)
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/train.csv',
        '/content/test.csv',
        '/content/sample_submission.csv',
    )
)

# Columns the model does not use: precipitation (PRCP_*), snow depth (SNWD_*),
# station elevation (ELEVATION_*) and 'Unnamed: 0' (presumably a leftover CSV index).
drop_columns = [
    'PRCP_A', 'PRCP_B', 'PRCP_C',
    'Unnamed: 0',
    'ELEVATION_A', 'ELEVATION_B', 'ELEVATION_C',
    'SNWD_A', 'SNWD_B', 'SNWD_C',
]
for frame in (train_data, test_data):
    # errors='ignore' tolerates columns that are absent from one of the frames
    frame.drop(columns=drop_columns, errors='ignore', inplace=True)
    # Parse the textual DATE column as day-first dates
    frame['DATE'] = pd.to_datetime(frame['DATE'], dayfirst=True)

# Fill missing temperature values with the per-column median and correct TAVG_A.
temp_columns = ['TMAX_A', 'TMIN_A', 'TAVG_A', 'TMAX_B', 'TMIN_B',
                'TAVG_B', 'TMAX_C', 'TMIN_C', 'TAVG_C']
imputer = SimpleImputer(strategy='median')
# Learn the medians from the training data only. Re-fitting on the test set would
# fill train and test with different values, use test-set statistics, and fail if a
# test column were entirely missing (fit_transform drops all-NaN columns).
imputer.fit(train_data[temp_columns])
for dataset in [train_data, test_data]:
    dataset[temp_columns] = imputer.transform(dataset[temp_columns])  # Same medians for both frames
    dataset['TAVG_A'] = dataset[['TAVG_A', 'TMAX_A']].min(axis=1)  # Ensure TAVG_A is not greater than TMAX_A

# Derive calendar features from the 'DATE' column
def extract_date_features(dataset):
    """Add calendar-derived columns to ``dataset`` in place.

    ``dataset['DATE']`` must already be a datetime column. The columns YEAR,
    MONTH, DAY, DAYOFWEEK, WEEKEND, DAYOFYEAR and MONTH_DAY are added (in that
    order); nothing is returned.
    """
    stamps = dataset['DATE'].dt

    # Components copied straight from the datetime accessor
    for column, attribute in (('YEAR', 'year'), ('MONTH', 'month'),
                              ('DAY', 'day'), ('DAYOFWEEK', 'dayofweek')):
        dataset[column] = getattr(stamps, attribute)

    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    dataset['WEEKEND'] = dataset['DAYOFWEEK'].ge(5).astype(int)
    dataset['DAYOFYEAR'] = stamps.dayofyear
    # Product of month number and day of month
    dataset['MONTH_DAY'] = dataset['MONTH'].mul(dataset['DAY'])

# Add the calendar features to the training and test frames (both modified in place)
for frame in (train_data, test_data):
    extract_date_features(frame)

# Separate the target from the predictors
y = train_data['TAVG']  # Target: the average temperature to predict
X = train_data.drop(['DATE', 'TAVG'], axis=1)  # Training predictors: everything except the date and the target
X_test = test_data.drop(['INDEX', 'DATE'], axis=1)  # Test predictors: everything except the row id and the date

# Columns fed to the preprocessing step: the station temperatures plus the calendar features
numeric_features = [*temp_columns, 'YEAR', 'MONTH', 'DAY', 'DAYOFWEEK',
                    'WEEKEND', 'DAYOFYEAR', 'MONTH_DAY']
categorical_features = []  # Nothing categorical is used; kept so the 'cat' branch below has a column list

# Numeric branch: median imputation -> pairwise interaction terms -> standardization
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Route each column group to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Fit on the training predictors, then reuse the fitted transformer on the test predictors
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

# Hold out 20% of the rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Rank features with an ExtraTreesRegressor fitted on the training split and keep
# those whose importance is above the mean importance
selection_model = ExtraTreesRegressor(random_state=0)
selection_model.fit(X_train, y_train)
selector = SelectFromModel(selection_model, threshold="mean", prefit=True)

# Reduce every split to the selected features
X_train_selected, X_valid_selected, X_test_selected = (
    selector.transform(split) for split in (X_train, X_valid, X_test)
)

# Base learners of the stack, each with a fixed seed for reproducibility

# Gradient boosting: 200 stages, learning rate 0.1, trees up to depth 5
gb_model = GradientBoostingRegressor(
    n_estimators=200, learning_rate=0.1, max_depth=5,
    min_samples_split=5, min_samples_leaf=3, random_state=0,
)

# Multi-layer perceptron: hidden layers of 150 and 75 ReLU units, Adam solver,
# L2 penalty 0.001, initial learning rate 0.01
nn_model = MLPRegressor(
    hidden_layer_sizes=(150, 75), activation='relu', solver='adam',
    alpha=0.001, learning_rate_init=0.01, random_state=0,
)

# The two tree ensembles share the same settings: 300 trees of depth <= 7
forest_params = dict(
    n_estimators=300, max_depth=7,
    min_samples_split=5, min_samples_leaf=3, random_state=0,
)
et_model = ExtraTreesRegressor(**forest_params)
rf_model = RandomForestRegressor(**forest_params)

# Stack the four learners; a Ridge regression combines their predictions
base_learners = [('gb', gb_model), ('nn', nn_model), ('et', et_model), ('rf', rf_model)]
stacked_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=Ridge(alpha=1.0, random_state=0),
)

# Fit the stack on the selected training features
stacked_model.fit(X_train_selected, y_train)

# Score the fitted stack on the held-out validation split
y_pred = stacked_model.predict(X_valid_selected)
mae = mean_absolute_error(y_valid, y_pred)
mse = mean_squared_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

# Report the validation metrics, one per line
print(
    "Optimized Stacked Model Evaluation:",
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    sep="\n",
)

# Perform K-fold cross-validation to evaluate the model's stability.
# All three metrics come from ONE cross_validate run: the folds are the same, so
# scoring them separately with cross_val_score would refit the (expensive) stacked
# model three times per fold for identical numbers.
# NOTE(review): preprocessing and feature selection were fitted before this step,
# outside the folds, so these scores may be slightly optimistic.
kf = KFold(n_splits=5, random_state=42, shuffle=True)
cv_results = cross_validate(
    stacked_model, X_train_selected, y_train, cv=kf,
    scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'],
)
scores = cv_results['test_r2']  # Per-fold R^2 scores (name kept for later cells)

print("Cross-Validation Mean R^2 Score:", scores.mean())  # Average R^2 score across folds
# The error scorers are negated by scikit-learn convention; flip the sign back
print("Cross-Validation Mean Absolute Error:", -1 * cv_results['test_neg_mean_absolute_error'].mean())
print("Cross-Validation Mean Squared Error:", -1 * cv_results['test_neg_mean_squared_error'].mean())

# Predict TAVG for the test rows with the fitted stack
test_predictions = stacked_model.predict(X_test_selected)

# Build the submission from a copy of the template with TAVG filled in
# (rows are assumed to be in the same order as the test file)
submission = sample_submission.assign(TAVG=test_predictions)

# Write the submission without the DataFrame index
submission_file_path = '/content/sub6.csv'
submission.to_csv(submission_file_path, index=False)

print("Optimized submission file saved to:", submission_file_path)


Optimized Stacked Model Evaluation:
Mean Absolute Error: 1.6081753301507995
Mean Squared Error: 4.192502738234582
R^2 Score: 0.9768340729657486
Cross-Validation Mean R^2 Score: 0.9733306236370355
Cross-Validation Mean Absolute Error: 1.6035586132961008
Cross-Validation Mean Squared Error: 4.924674705375823
Optimized submission file saved to: /content/sub6.csv
