In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the updated dataset
data = pd.read_csv('data/part_1/historical_weather_without_outliers.csv')

# Convert the 'date' column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Filter data for the first week of 2019
start_date = '2019-01-01'
end_date = '2019-01-07'
forecast_dates = pd.date_range(start=start_date, end=end_date)

# Initialize a dictionary to store predictions for each city
predictions = {}

# Iterate over each city
for city_id in data['city_id'].unique():
    city_data = data[data['city_id'] == city_id]
    
    # Prepare the data for training
    X = city_data['date'].dt.dayofyear.values.reshape(-1, 1)
    y = city_data['avg_temp_c'].values
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train a Random Forest Regressor model
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions for the first week of 2019
    X_forecast = forecast_dates.dayofyear.values.reshape(-1, 1)
    y_pred = model.predict(X_forecast)
    
    # Store the predictions in the dictionary
    predictions[city_id] = y_pred

# Display or save the predictions
for city_id, y_pred in predictions.items():
    print(f"Predictions for {city_id}:")
    for date, temp in zip(forecast_dates, y_pred):
        print(f"{date.strftime('%Y-%m-%d')}: {temp:.2f}°C")


Predictions for C001:
2019-01-01: 5.97°C
2019-01-02: 7.81°C
2019-01-03: 8.37°C
2019-01-04: 6.58°C
2019-01-05: 6.93°C
2019-01-06: 5.79°C
2019-01-07: 7.59°C
Predictions for C002:
2019-01-01: 13.75°C
2019-01-02: 13.18°C
2019-01-03: 13.40°C
2019-01-04: 15.96°C
2019-01-05: 13.74°C
2019-01-06: 13.88°C
2019-01-07: 13.50°C
Predictions for C003:
2019-01-01: 27.15°C
2019-01-02: 26.91°C
2019-01-03: 27.19°C
2019-01-04: 26.12°C
2019-01-05: 26.66°C
2019-01-06: 27.21°C
2019-01-07: 26.88°C
Predictions for C004:
2019-01-01: -2.64°C
2019-01-02: -1.68°C
2019-01-03: -2.20°C
2019-01-04: -4.12°C
2019-01-05: -0.89°C
2019-01-06: 0.22°C
2019-01-07: -1.84°C
Predictions for C005:
2019-01-01: 22.64°C
2019-01-02: 24.63°C
2019-01-03: 25.89°C
2019-01-04: 25.41°C
2019-01-05: 24.36°C
2019-01-06: 23.66°C
2019-01-07: 24.68°C
Predictions for C007:
2019-01-01: 19.12°C
2019-01-02: 20.91°C
2019-01-03: 21.45°C
2019-01-04: 23.54°C
2019-01-05: 20.49°C
2019-01-06: 20.22°C
2019-01-07: 21.14°C
Predictions for C008:
2019-01-01: 4.

In [3]:
# Create a list to store the forecasted values
forecast_data = []

# Iterate over the forecasted values for each city
submission_id = 1
for city_id, y_pred in predictions.items():
    for temp in y_pred:
        forecast_data.append({'submission_ID': submission_id, 'avg_temp_c': round(temp,2)})
        submission_id += 1

# Convert the list into a DataFrame
forecast_df = pd.DataFrame(forecast_data)

# Save the DataFrame to a CSV file
forecast_df.to_csv('data/part_1/forcasted_data/randomforestregressor.csv', index=False)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Load the updated dataset
data = pd.read_csv('data/part_1/historical_weather_without_outliers.csv')

# Convert the 'date' column to datetime format
data['date'] = pd.to_datetime(data['date'])

# Filter data for the first week of 2019
start_date = '2019-01-01'
end_date = '2019-01-07'
forecast_dates = pd.date_range(start=start_date, end=end_date)

# Initialize a dictionary to store predictions and RMSE for each city
predictions = {}
rmse_scores = {}

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Iterate over each city
for city_id in data['city_id'].unique():
    city_data = data[data['city_id'] == city_id]
    
    # Prepare the data for training
    X = city_data['date'].dt.dayofyear.values.reshape(-1, 1)
    y = city_data['avg_temp_c'].values
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize the Random Forest Regressor
    model = RandomForestRegressor(random_state=42)
    
    # Initialize Grid Search with cross-validation
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)
    
    # Fit Grid Search to the training data
    grid_search.fit(X_train, y_train)
    
    # Get the best model from Grid Search
    best_model = grid_search.best_estimator_
    
    # Make predictions on the test set
    y_test_pred = best_model.predict(X_test)
    
    # Calculate the RMSE for the test set
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    rmse_scores[city_id] = rmse
    
    # Make predictions for the first week of 2019
    X_forecast = forecast_dates.dayofyear.values.reshape(-1, 1)
    y_pred = best_model.predict(X_forecast)
    
    # Store the predictions in the dictionary
    predictions[city_id] = y_pred

# Display the best parameters and RMSE for each city
for city_id, rmse in rmse_scores.items():
    best_params = grid_search.best_params_
    print(f"Best parameters for city {city_id}: {best_params}")
    print(f"RMSE for city {city_id}: {rmse:.2f}")

# Display or save the predictions
for city_id, y_pred in predictions.items():
    print(f"Predictions for {city_id}:")
    for date, temp in zip(forecast_dates, y_pred):
        print(f"{date.strftime('%Y-%m-%d')}: {temp:.2f}°C")

Best parameters for city C001: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C001: 2.58
Best parameters for city C002: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C002: 2.03
Best parameters for city C003: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C003: 0.83
Best parameters for city C004: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C004: 3.75
Best parameters for city C005: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C005: 2.73
Best parameters for city C007: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
RMSE for city C007: 3.1

In [4]:
# Create a list to store the forecasted values
forecast_data = []

# Iterate over the forecasted values for each city
submission_id = 1
for city_id, y_pred in predictions.items():
    for temp in y_pred:
        forecast_data.append({'submission_ID': submission_id, 'avg_temp_c': round(temp,2)})
        submission_id += 1

# Convert the list into a DataFrame
forecast_df = pd.DataFrame(forecast_data)

# Save the DataFrame to a CSV file
forecast_df.to_csv('data/part_1/forcasted_data/randomforestregressor_optimized.csv', index=False)