In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

# Read the historical weather CSV and parse its 'date' column into datetimes
weather_csv = 'data/part_1/historical_weather_without_outliers.csv'
data = pd.read_csv(weather_csv).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Forecast horizon: one timestamp per day from 2019-01-01 through 2019-01-07.
# (This only builds the target dates; `data` itself is not filtered here.)
start_date, end_date = '2019-01-01', '2019-01-07'
forecast_dates = pd.date_range(start_date, end_date)

# Per-city result stores, both keyed by city_id:
#   predictions -> forecasted temperatures, rmse_scores -> test-set RMSE
predictions, rmse_scores = {}, {}

# Hyperparameter search space handed to Grid Search (three candidates each)
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Train one grid-searched XGBoost regressor per city, score it on a held-out
# split, and forecast the dates in `forecast_dates`.
#
# Results are written into the module-level dictionaries `rmse_scores` and
# `predictions`, plus `best_params_by_city` (created here), all keyed by city_id.

# Best hyperparameters must be captured per city inside the loop: after the loop
# ends, `grid_search` only refers to the search fitted for the LAST city.
best_params_by_city = {}

# The forecast features are the same for every city, so build them once.
X_forecast = forecast_dates.dayofyear.values.reshape(-1, 1)

for city_id in data['city_id'].unique():
    city_data = data[data['city_id'] == city_id]

    # Single feature: day of the year; target: average temperature
    X = city_data['date'].dt.dayofyear.values.reshape(-1, 1)
    y = city_data['avg_temp_c'].values

    # Hold out 20% of this city's rows for evaluation.
    # NOTE(review): the split is random rather than chronological — confirm this
    # is the intended evaluation scheme for date-indexed data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Base regressor whose hyperparameters are tuned by the grid search
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Exhaustive search over `param_grid` with 3-fold cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Keep this city's winning estimator and its hyperparameters
    best_model = grid_search.best_estimator_
    best_params_by_city[city_id] = grid_search.best_params_

    # RMSE of the best model on the held-out split
    y_test_pred = best_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    rmse_scores[city_id] = rmse

    # Forecast for the requested dates
    y_pred = best_model.predict(X_forecast)
    predictions[city_id] = y_pred

    print(f"City {city_id} - RMSE: {rmse:.2f}")

# Display the best parameters and RMSE for each city
for city_id, rmse in rmse_scores.items():
    # .get() keeps the summary working for entries in rmse_scores that were not
    # produced by this run of the loop (e.g. left over from an earlier execution).
    best_params = best_params_by_city.get(city_id)
    print(f"Best parameters for city {city_id}: {best_params}")
    print(f"RMSE for city {city_id}: {rmse:.2f}")

# Print each city's forecast, one "<date>: <temperature>" line per forecast day
date_labels = [day.strftime('%Y-%m-%d') for day in forecast_dates]
for city_id, y_pred in predictions.items():
    print(f"Predictions for {city_id}:")
    for label, temp in zip(date_labels, y_pred):
        print(f"{label}: {temp:.2f}°C")


City C001 - RMSE: 2.47
City C002 - RMSE: 1.93
City C003 - RMSE: 0.80
City C004 - RMSE: 3.63
City C005 - RMSE: 2.72
City C007 - RMSE: 3.01
City C008 - RMSE: 3.33
City C009 - RMSE: 1.72
City C010 - RMSE: 1.37
City C011 - RMSE: 4.16
City C012 - RMSE: 3.31
City C013 - RMSE: 3.84
City C014 - RMSE: 3.39
City C015 - RMSE: 1.57
City C016 - RMSE: 1.20
City C017 - RMSE: 1.06
City C018 - RMSE: 4.35
City C020 - RMSE: 1.27
City C022 - RMSE: 2.85
City C023 - RMSE: 1.12
City C024 - RMSE: 3.41
City C025 - RMSE: 3.54
City C027 - RMSE: 5.57
City C028 - RMSE: 1.05
City C029 - RMSE: 1.26
City C030 - RMSE: 0.75
City C031 - RMSE: 3.67
City C033 - RMSE: 3.50
City C034 - RMSE: 3.15
City C035 - RMSE: 0.77
City C036 - RMSE: 3.34
City C037 - RMSE: 3.58
City C038 - RMSE: 0.71
City C039 - RMSE: 2.52
City C040 - RMSE: 0.80
City C042 - RMSE: 1.15
City C043 - RMSE: 3.33
City C044 - RMSE: 2.20
City C045 - RMSE: 0.80
City C046 - RMSE: 2.79
City C047 - RMSE: 2.46
City C048 - RMSE: 2.27
City C049 - RMSE: 2.56
City C051 -

In [3]:
# Flatten the per-city forecasts (in dictionary order) into one record per
# predicted value, numbering the records consecutively from 1.
forecast_data = [
    {'submission_ID': row_number, 'avg_temp_c': round(temp, 2)}
    for row_number, temp in enumerate(
        (value for city_forecast in predictions.values() for value in city_forecast),
        start=1,
    )
]
# Next unused ID, matching what a running counter would hold after the last record
submission_id = len(forecast_data) + 1

# Build the submission table and write it to disk without the index column
forecast_df = pd.DataFrame(forecast_data)
forecast_df.to_csv('data/part_1/forcasted_data/xgboost_optimized.csv', index=False)