# Training Model

In [25]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pickle
import json

# Load the preprocessed feature file and the powerline file
features_file = pd.read_csv('Preprocess_Feature_MEX.csv')
powerline_file = pd.read_csv('Power_Satellite.csv')

# Convert 'ut_ms' columns to datetime and align both datasets
features_file['ut_ms'] = pd.to_datetime(features_file['ut_ms'])
powerline_file['ut_ms'] = pd.to_datetime(powerline_file['ut_ms'])

# Merge datasets on the timestamp ('ut_ms')
merged_data = pd.merge(features_file, powerline_file[['ut_ms', 'NPWD2372']], on='ut_ms')

# Set the timestamp as index
merged_data.set_index('ut_ms', inplace=True)

# One-hot encode categorical columns (e.g., 'type')
merged_data = pd.get_dummies(merged_data)

# Define window parameters
window_size = pd.Timedelta(hours=7)  # 7 hours window
overlap_size = pd.Timedelta(hours=1)  # 1 hour overlap
step_size = window_size - overlap_size

# Prepare a scaler to normalize the features
scaler = StandardScaler()

# Create a function to train models and return the model with the best RMSE score
def train_models(X_train, y_train, X_test, y_test):
    models = {
        'RandomForest': RandomForestRegressor(),
        'XGBoost': XGBRegressor(),
        'CatBoost': CatBoostRegressor(verbose=0)
    }
    best_model = None
    best_rmse = float('inf')
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = (name, model)
    
    return best_model, best_rmse

# Create a directory for saving pickle files
output_dir = 'models_pickle'
os.makedirs(output_dir, exist_ok=True)

# Initialize metadata dictionary
metadata = []

# Track previous best model
prev_best_model = None
current_window_data = []
file_counter = 1

# Iterate over time windows
start_time = merged_data.index.min()
end_time = merged_data.index.max()

while start_time + window_size <= end_time:
    window_end = start_time + window_size
    window_data = merged_data.loc[start_time:window_end]
    
    # Split into features (X) and target (y)
    X = window_data.drop(columns=['NPWD2372'])
    y = window_data['NPWD2372']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Normalize the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Train the models and get the best model for this window
    best_model, best_rmse = train_models(X_train, y_train, X_test, y_test)
    
    # Check if this window's model is the same as the previous window
    if prev_best_model and prev_best_model[0] == best_model[0]:
        # If the model is the same, extend the current window
        current_window_data.append((start_time, window_end, best_model[1], best_rmse))
    else:
        # If the model has changed, save the previous model data to a pickle file and update metadata
        if prev_best_model:
            # Save the pickle file
            model_file_path = os.path.join(output_dir, f'model_{prev_best_model[0]}_window_{file_counter}.pkl')
            with open(model_file_path, 'wb') as f:
                pickle.dump(prev_best_model[1], f)
            
            # Update metadata with the start and end time for the model
            metadata.append({
                'model_name': prev_best_model[0],
                'file_path': model_file_path,
                'start_time': str(current_window_data[0][0]),
                'end_time': str(current_window_data[-1][1]),
                'rmse': best_rmse
            })
        
        # Start a new window for the current model
        file_counter += 1
        prev_best_model = best_model
        current_window_data = [(start_time, window_end, best_model[1], best_rmse)]
    
    # Shift the window by the step size (7 hours - 1 hour overlap)
    start_time += step_size

# Save the final model and metadata
model_file_path = os.path.join(output_dir, f'model_{prev_best_model[0]}_window_{file_counter}.pkl')
with open(model_file_path, 'wb') as f:
    pickle.dump(prev_best_model[1], f)

# Update metadata for the final model
metadata.append({
    'model_name': prev_best_model[0],
    'file_path': model_file_path,
    'start_time': str(current_window_data[0][0]),
    'end_time': str(current_window_data[-1][1]),
    'rmse': best_rmse
})

# Save metadata to a JSON file
with open(os.path.join(output_dir, 'metadata.json'), 'w') as f:
    json.dump(metadata, f, indent=4)

print(f"Model training and saving complete. All models are saved in the '{output_dir}' folder.")




KeyboardInterrupt: 

In [6]:
pip install catboost

Collecting catboostNote: you may need to restart the kernel to use updated packages.

  Using cached catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Using cached graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Using cached catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
Using cached graphviz-0.20.3-py3-none-any.whl (47 kB)
Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/19.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/19.1 MB 1.3 MB/s eta 0:00:15
   ---------------------------------------- 0.1/19.1 MB 1.3 MB/s eta 0:00:15
   - -------------------------------------- 0.5/19.1 MB 4.5 MB/s eta 0:00:05
   ------ --------------------------------- 2.9/19.1 MB 18.4 MB/s eta 0:00:01
   -------------- ---


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# calulating rmse of overall model

In [61]:
# Variables to track overall RMSE and window sizes
total_rmse_sum = 0
total_window_size = 0

# Iterate again over the time windows to recalculate the RMSE
start_time = merged_data.index.min()
end_time = merged_data.index.max()

while start_time + window_size <= end_time:
    window_end = start_time + window_size
    window_data = merged_data.loc[start_time:window_end]
    
    # Split into features (X) and target (y)
    X = window_data.drop(columns=['NPWD2372'])
    y = window_data['NPWD2372']
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Normalize the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Train the models and get the best model for this window
    best_model, best_rmse = train_models(X_train, y_train, X_test, y_test)
    
    # Track the total RMSE and window size for calculating the final overall RMSE
    window_size_in_points = len(y)
    total_rmse_sum += best_rmse * window_size_in_points
    total_window_size += window_size_in_points
    
    # Shift the window by the step size (7 hours - 1 hour overlap)
    start_time += step_size

# Calculate overall RMSE as a weighted average
overall_rmse = total_rmse_sum / total_window_size

# Print overall RMSE
print(f"Overall RMSE for the model across all windows: {overall_rmse:.4f}")




Overall RMSE for the model across all windows: 0.0306


