In [1]:
# In[0]: IMPORT AND FUNCTIONS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.exceptions import ConvergenceWarning
from scipy.stats import randint, uniform
from datetime import timedelta
import joblib
import warnings
import os

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)

os.makedirs('figures', exist_ok=True)
os.makedirs('models', exist_ok=True)
os.makedirs('saved_objects', exist_ok=True)

tscv = TimeSeriesSplit(n_splits=5)

# In[1]: LOAD DATA AND INITIAL PREPROCESSING
raw_data = pd.read_csv('datasets/NewYork.csv')
raw_data['datetime'] = pd.to_datetime(raw_data['datetime'])
raw_data.drop(columns=["name", "icon", "stations", "description"], inplace=True)

print("\nMissing values before imputation:")
print(raw_data.isnull().sum())

def remove_outliers(df, column, factor=1.5):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

raw_data = remove_outliers(raw_data, 'tempmax')
print("\nShape of data after removing outliers:", raw_data.shape)

for column in raw_data.columns:
    if raw_data[column].dtype == 'object':
        raw_data[column].fillna('Unknown', inplace=True)
    else:
        raw_data[column].fillna(raw_data[column].median(), inplace=True)

print("\nMissing values after imputation:")
print(raw_data.isnull().sum())

# In[2]: DISCOVER THE DATA
print('\n____________ Dataset info ____________')
print(raw_data.info())              
print('\n____________ Some first data examples ____________')
print(raw_data.head(3)) 
print('\n____________ Statistics of numeric features ____________')
print(raw_data.describe())    

# Visualizations
plt.figure(figsize=(12, 10))
corr = raw_data.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.savefig('figures/correlation_heatmap.png', format='png', dpi=300)
plt.close()

# Histogram of all numeric features
numeric_features = raw_data.select_dtypes(include=[np.number]).columns
n_features = len(numeric_features)
n_rows = (n_features + 1) // 2
plt.figure(figsize=(15, 5 * n_rows))
for i, feature in enumerate(numeric_features, 1):
    plt.subplot(n_rows, 2, i)
    sns.histplot(raw_data[feature], kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.savefig('figures/hist_raw_data.png', format='png', dpi=300)
plt.close()

# Scatter plots of tempmax vs other numeric features
numeric_features = [col for col in numeric_features if col != 'tempmax']
n_features = len(numeric_features)
n_rows = (n_features + 1) // 2
plt.figure(figsize=(15, 5 * n_rows))
for i, feature in enumerate(numeric_features, 1):
    plt.subplot(n_rows, 2, i)
    sns.scatterplot(data=raw_data, x=feature, y='tempmax', alpha=0.5)
    plt.title(f'Max Temperature vs {feature}')
plt.tight_layout()
plt.savefig('figures/scatter_tempmax_vs_features.png', format='png', dpi=300)
plt.close()

# Pairplot of main features
main_features = ['tempmax', 'temp', 'humidity', 'windspeed', 'cloudcover']
plt.figure(figsize=(15, 15))
sns.pairplot(raw_data[main_features], diag_kind='kde')
plt.suptitle("Pairplot of Main Features", y=1.02)
plt.savefig('figures/pairplot_main_features.png', format='png', dpi=300)
plt.close()

# In[3]: PREPARE THE DATA 
class EnhancedFeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_features=True):
        self.add_features = add_features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        if self.add_features:
            X_['day_of_year'] = X_['datetime'].dt.dayofyear
            X_['month'] = X_['datetime'].dt.month
            X_['day_of_week'] = X_['datetime'].dt.dayofweek
            X_['is_weekend'] = X_['day_of_week'].isin([5, 6]).astype(int)
        return X_.drop('datetime', axis=1)

numeric_features = ['temp', 'humidity', 'precip', 'windspeed', 'cloudcover']
categorical_features = ['conditions', 'preciptype']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

enhanced_pipeline = Pipeline([
    ('feature_adder', EnhancedFeatureAdder()),
    ('preprocessor', preprocessor)
])

X = raw_data.drop('tempmax', axis=1)
y = raw_data['tempmax']

X_processed = enhanced_pipeline.fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# In[4]: TRAIN AND EVALUATE MODELS
models = {
    'RandomForestReg': RandomForestRegressor(random_state=42),
    'GradientBoostingReg': GradientBoostingRegressor(random_state=42),
    'LGBMReg': LGBMRegressor(random_state=42),
    'XGBBoost': XGBRegressor(random_state=42),
    'SVR': SVR(),
    'ElasticNet': ElasticNet(random_state=42),
    'Lasso': Lasso(random_state=42),
    'Ridge': Ridge(random_state=42),
    'MLPRegressor': MLPRegressor(random_state=42)
}

def evaluate_model(model, data, labels): 
    prediction = model.predict(data)
    rmse = np.sqrt(mean_squared_error(labels, prediction))
    return rmse

# Store RMSE for untuned models
rmse_before_tuning = {}

print('\n____________ Train and Evaluate Models ____________')
for name, model in models.items():
    model.fit(X_train, y_train)
    rmse = evaluate_model(model, X_train, y_train)
    rmse_before_tuning[name] = rmse
    print(f'{name:<20} RMSE before tuning: {rmse:.4f}')

# In[5]: FINE-TUNE MODELS
print('\n____________ Fine-tune models ____________')

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)

param_grids = {
    'RandomForestReg': {
        'n_estimators': randint(100, 1000),
        'max_depth': randint(5, 30),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'max_features': uniform(0.1, 0.9)
    },
    'GradientBoostingReg': {
        'n_estimators': randint(100, 1000),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 20),
        'subsample': uniform(0.5, 0.5)
    },
    'LGBMReg': {
        'num_leaves': randint(20, 100),
        'learning_rate': uniform(0.01, 0.2),
        'n_estimators': randint(100, 1000),
        'min_child_samples': randint(1, 50),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5)
    },
    'XGBBoost': {
        'n_estimators': randint(100, 1000),
        'learning_rate': uniform(0.01, 0.2),
        'max_depth': randint(3, 10),
        'min_child_weight': randint(1, 10),
        'subsample': uniform(0.5, 0.5),
        'colsample_bytree': uniform(0.5, 0.5),
        'gamma': uniform(0, 0.5)
    },
    'SVR': {
        'C': uniform(0.1, 100),
        'kernel': ['rbf', 'linear', 'poly'],
        'gamma': ['scale', 'auto'] + list(uniform(0.001, 0.1).rvs(10)),
        'epsilon': uniform(0.01, 1)
    },
    'ElasticNet': {
        'alpha': uniform(0.001, 1),
        'l1_ratio': uniform(0, 1),
        'max_iter': [5000, 10000]
    },
    'Lasso': {
        'alpha': uniform(0.001, 1),
        'max_iter': [5000, 10000]
    },
    'Ridge': {
        'alpha': uniform(0.001, 1),
        'max_iter': [5000, 10000]
    },
    'MLPRegressor': {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'alpha': uniform(0.00001, 0.01),
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [1000, 2000],
        'activation': ['relu', 'tanh']
    }
}

best_models = {}

for name, model in models.items():
    print(f"\nFine-tuning {name}")
    
    grid_search = RandomizedSearchCV(model, param_distributions=param_grids[name], 
                                     n_iter=100, cv=tscv, 
                                     scoring='neg_mean_squared_error', n_jobs=-1, 
                                     random_state=42, verbose=1, error_score='raise')
    try:
        grid_search.fit(X_train, y_train)
        best_rmse = np.sqrt(-grid_search.best_score_)
        print(f"Best RMSE for {name}: {best_rmse:.4f}")
        
        # Compare RMSE before and after tuning
        if best_rmse < rmse_before_tuning[name]:
            print(f"{name} is improved after tuning. Using tuned version.")
            best_models[name] = (grid_search.best_estimator_, best_rmse)
        else:
            print(f"{name} is not improved after tuning. Using untuned version.")
            best_models[name] = (model, rmse_before_tuning[name])
        
        joblib.dump(grid_search, f'saved_objects/{name}_gridsearch.pkl')
    except Exception as e:
        print(f"An error occurred while tuning {name}: {str(e)}")
        print("Skipping this model and continuing with the next one.")

# In[6]: SELECT BEST MODEL
best_model_name = min(best_models, key=lambda name: best_models[name][1])
best_model, best_rmse = best_models[best_model_name]

print(f"\nBest model after fine-tuning: {best_model_name} with RMSE: {best_rmse:.4f}")

# Save the best model
joblib.dump(best_model, 'models/SOLUTION_model.pkl')

# In[7]: ANALYZE AND TEST THE SOLUTION
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'\nPerformance on test data: RMSE: {test_rmse:.4f}')

# In[8]: MAKE FUTURE PREDICTIONS
last_date = raw_data['datetime'].max()
future_dates = pd.date_range(start=last_date + timedelta(days=1), periods=200)
future_data = pd.DataFrame({'datetime': future_dates})

def find_closest_date(target_date, data):
    try:
        target_date = target_date.replace(year=target_date.year - 1)
    except ValueError:
        target_date = target_date.replace(year=target_date.year - 1, day=28)
    closest_date = data['datetime'].iloc[(data['datetime'] - target_date).abs().argsort()[0]]
    return data.loc[data['datetime'] == closest_date].iloc[0]

for col in raw_data.columns:
    if col not in ['datetime', 'tempmax']:
        future_data[col] = future_data['datetime'].apply(lambda x: find_closest_date(x, raw_data)[col])

future_processed = enhanced_pipeline.transform(future_data)
future_pred = best_model.predict(future_processed)
future_data['predicted_tempmax'] = future_pred

future_data[['datetime', 'predicted_tempmax']].to_csv('future_predictions_200days.csv', index=False)
print("Future predictions have been saved to 'future_predictions_200days.csv'")

plt.figure(figsize=(20, 10))
plt.plot(future_data['datetime'], future_data['predicted_tempmax'], label='Predicted Max Temperature', alpha=0.7)
plt.title('Predicted Maximum Temperature for the Next 200 Days')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('figures/future_predictions_200days_plot.png', format='png', dpi=300)
plt.close()

print("\nPredicted Max Temperature:")
print(f"Min: {future_data['predicted_tempmax'].min():.2f}°C")
print(f"Max: {future_data['predicted_tempmax'].max():.2f}°C")
print(f"Avg: {future_data['predicted_tempmax'].mean():.2f}°C")

# In[8]: CONCLUSION
print("\n____________ CONCLUSION ____________")
print(f"""
1. Data Preprocessing:
   - Removed outliers and handled missing values.
   - Added engineered features: day of year, month, day of week, is_weekend.
   - Applied scaling and one-hot encoding.

2. Model Selection and Hyperparameter Tuning:
   - Evaluated multiple models using RMSE as the sole metric.
   - Used RandomizedSearchCV for hyperparameter optimization.
   - The best performing model was: {best_model_name}

3. Model Performance:
   - Best model RMSE on test data: {test_rmse:.4f}

4. Future Predictions:
   - Generated predictions for the next 200 days.
   - The predicted temperatures range from {future_data['predicted_tempmax'].min():.2f}°C to {future_data['predicted_tempmax'].max():.2f}°C.

Suggestions for Further Improvement:
1. Collect more historical data or external data sources.
2. Experiment with more advanced time series models.
3. Implement online learning for continuous model updates.
4. Consider using deep learning models for complex pattern capture.
5. Analyze prediction intervals for more robust forecasts.
""")

print("\nPrediction and analysis complete.")



Missing values before imputation:
datetime              0
tempmax               0
tempmin               0
temp                  0
feelslikemax          0
feelslikemin          0
feelslike             0
dew                   0
humidity              0
precip                0
precipprob            0
precipcover           0
preciptype          386
snow                  0
snowdepth             0
windgust              0
windspeed             0
winddir               0
sealevelpressure      0
cloudcover            0
visibility            0
solarradiation        0
solarenergy           0
uvindex               0
severerisk           40
sunrise               0
sunset                0
moonphase             0
conditions            0
dtype: int64

Shape of data after removing outliers: (1000, 29)

Missing values after imputation:
datetime            0
tempmax             0
tempmin             0
temp                0
feelslikemax        0
feelslikemin        0
feelslike           0
dew              

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data[column].fillna(raw_data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  raw_data[column].fillna(raw_data[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate o


____________ Train and Evaluate Models ____________
RandomForestReg      RMSE before tuning: 0.5292
GradientBoostingReg  RMSE before tuning: 0.9778
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 14
[LightGBM] [Info] Start training from score 17.578500
LGBMReg              RMSE before tuning: 0.6710
XGBBoost             RMSE before tuning: 0.0964
SVR                  RMSE before tuning: 1.9776
ElasticNet           RMSE before tuning: 3.6741
Lasso                RMSE before tuning: 1.8004
Ridge                RMSE before tuning: 1.2780
MLPRegressor         RMSE before tuning: 1.3025

____________ Fine-tune models ____________

Fine-tuning RandomForestReg
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best RMSE for RandomForestReg: 1.4983


<Figure size 1500x1500 with 0 Axes>