In [1]:
import sys
sys.path.append('./aienginepackages')

import pandas as pd
import numpy as np
from aienginepackages import process_data, determine_feature_set

In [2]:
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import pandas as pd

In [3]:
from joblib import dump, load

In [4]:
def prepare_time_features(dates):
    """Build the numeric calendar features fed to XGBoost.

    Args:
        dates: Datetime-like values (anything ``pd.to_datetime`` accepts).

    Returns:
        DataFrame with the columns 'hour', 'day', 'month', 'year' and
        'day_of_week' (one row per input timestamp); the parsed datetime
        column itself is not included.
    """
    # (output column, attribute on the .dt accessor), in output order
    parts = (
        ('hour', 'hour'),
        ('day', 'day'),
        ('month', 'month'),
        ('year', 'year'),
        ('day_of_week', 'dayofweek'),
    )

    stamps = pd.DataFrame({'date_time': pd.to_datetime(dates)})
    clock = stamps['date_time'].dt
    derived = {column: getattr(clock, attr) for column, attr in parts}

    # Attach the derived columns, then discard the raw timestamps
    return stamps.assign(**derived).drop(columns='date_time')

In [5]:
# import pandas as pd
# from xgboost import XGBRegressor
# from sklearn.preprocessing import LabelEncoder

# # Convert datetime to numeric features
# def prepare_time_features(df):
#     """
#     Convert datetime index into useful numeric features for XGBoost
#     """
#     # Convert to pandas datetime if not already
#     df['date_time'] = pd.to_datetime(df['date_time'])
    
#     # Extract useful time features
#     df['hour'] = df['date_time'].dt.hour
#     df['day'] = df['date_time'].dt.day
#     df['month'] = df['date_time'].dt.month
#     df['year'] = df['date_time'].dt.year
#     df['day_of_week'] = df['date_time'].dt.dayofweek
    
#     # Drop original datetime column
#     return df.drop('date_time', axis=1)

# # Prepare your data
# train_data = pd.DataFrame({
#     'date_time': train_x,
#     'value': train_y
# })

# test_data = pd.DataFrame({
#     'date_time': test_x,
#     'value': test_y
# })

# # Create features
# train_features = prepare_time_features(train_data)
# test_features = prepare_time_features(test_data)

# # Separate features and target
# X_train = train_features.drop('value', axis=1)
# y_train = train_features['value']
# X_test = test_features.drop('value', axis=1)
# y_test = test_features['value']

# # Train model
# # xgb_model = XGBRegressor(eval_metric='mae')
# # xgb_model.fit(
# #     X_train,
# #     y_train,
# #     eval_set=[(X_train, y_train), (X_test, y_test)],
# #     verbose=False
# # )

In [6]:
def _prep_params():
        """
        Generates randomized hyperparameters for XGBoost within reasonable ranges.
        Returns a dictionary of hyperparameters.
        """
        param_ranges = {
            'n_estimators': (100, 1000),  
            'max_depth': (3, 10),        
            'learning_rate': (0.01, 0.3), 
            'subsample': (0.6, 1.0),     
            'colsample_bytree': (0.6, 1.0), 
            'min_child_weight': (1, 7),    
            'gamma': (0, 0.5),            
        }
        
        # Initialize parameters dictionary with random values
        params = {
            'random_state': np.random.randint(0, 10000),  # Random seed
            'eval_metric': 'mae',
            # Randomly sample continuous parameters
            'learning_rate': np.random.uniform(
                param_ranges['learning_rate'][0],
                param_ranges['learning_rate'][1]
            ),
            'subsample': np.random.uniform(
                param_ranges['subsample'][0],
                param_ranges['subsample'][1]
            ),
            'colsample_bytree': np.random.uniform(
                param_ranges['colsample_bytree'][0],
                param_ranges['colsample_bytree'][1]
            ),
            'gamma': np.random.uniform(
                param_ranges['gamma'][0],
                param_ranges['gamma'][1]
            ),
            
            # Randomly sample integer parameters
            'n_estimators': np.random.randint(
                param_ranges['n_estimators'][0],
                param_ranges['n_estimators'][1]
            ),
            'max_depth': np.random.randint(
                param_ranges['max_depth'][0],
                param_ranges['max_depth'][1]
            ),
            'min_child_weight': np.random.randint(
                param_ranges['min_child_weight'][0],
                param_ranges['min_child_weight'][1]
            ),
        }
        
        return params

In [7]:
# Load the raw series. Column names are supplied explicitly via `names`,
# so pandas reads the first line of the file as data rather than as a header.
df = pd.read_csv(
    'datasets/modifiedkaggletraffic2.csv',
    names=['date_time', 'value', 'id'],
)

In [8]:
# Run the project's preprocessing, then split index values (features) and the
# 'value' column (target) 80/20. shuffle=False keeps the existing row order,
# so the final 20% of rows form the test split.
proc_data = process_data(df, quick_start=False)
train_x, test_x, train_y, test_y = train_test_split(
    proc_data.dataset.index.values,
    proc_data.dataset['value'],
    shuffle=False,
    test_size=0.2,
)


In [None]:
# Quick check of the container type train_test_split returned for the index split.
type(train_x)

In [9]:
# Convert both timestamp splits into numeric feature frames (train first, then test);
# the target splits are reused unchanged under the conventional y_* names.
X_train, X_test = (prepare_time_features(split) for split in (train_x, test_x))
y_train, y_test = train_y, test_y

In [None]:
# Confirm what prepare_time_features returned for the training timestamps.
type(X_train)

In [17]:
# Draw one random hyperparameter configuration. _prep_params samples from NumPy's
# global RNG, so re-running this cell gives a different configuration unless the RNG is seeded.
hyperparameters = _prep_params()

In [None]:
# Inspect the type of the object returned by _prep_params.
type(hyperparameters)

In [18]:
# Build the regressor from the sampled configuration; each dict entry is passed
# to XGBRegressor as a keyword argument.
xgb_model = XGBRegressor(**hyperparameters)

In [None]:
# Persist the estimator object to disk with joblib.
# NOTE(review): in notebook order no fit() call precedes this cell, so this
# appears to store an untrained model — confirm that is intended.
model_filename = 'xgboost_model.joblib'
dump(xgb_model, model_filename)

In [None]:
# Display the estimator's repr to review its configured parameters.
xgb_model

In [None]:
# Display the estimator again (same expression as the preceding cell).
xgb_model

In [None]:
# Read back whatever object was last written to 'xgboost_model.joblib'.
# joblib files are pickle-based, so only load files that come from a trusted source.
loaded_model = load('xgboost_model.joblib')

In [None]:
# Alternative setup: a regressor with library defaults apart from the MAE evaluation metric.
# Running this cell rebinds xgb_model, replacing any estimator created earlier.
xgb_model = XGBRegressor(eval_metric='mae')

In [None]:
# Alternative setup: a regressor with the main hyperparameters spelled out by hand.
# Running this cell rebinds xgb_model, replacing any estimator created earlier.
# NOTE(review): with early_stopping_rounds set on the estimator, XGBoost needs an
# eval_set when fit() is called; the later full-data fit passes none — confirm
# before combining this configuration with that cell.
xgb_model = XGBRegressor(
    # Tree-specific parameters
    max_depth=6,          # Maximum depth of trees (3-10)
    min_child_weight=1,   # Minimum sum of instance weight in a child (1-10)
    gamma=0,              # Minimum loss reduction for split (0-1)

    # Boosting parameters
    learning_rate=0.1,    # Learning rate/eta (0.01-0.3)
    n_estimators=100,     # Number of trees/boosting rounds (50-2000)
    subsample=0.8,        # Fraction of samples used for tree building (0.5-1)
    colsample_bytree=0.8, # Fraction of features used for tree building (0.5-1)

    # Regularization parameters
    reg_alpha=0,          # L1 regularization (0-1)
    reg_lambda=1,         # L2 regularization (0-1)

    # Other parameters
    random_state=42,      # For reproducibility
    eval_metric='mae',    # Evaluation metric
    early_stopping_rounds=10  # Stop if no improvement after N rounds
)

In [19]:
# Train on the training split. Both the training split and the held-out split are
# supplied as evaluation sets; verbose=False turns off the per-iteration metric log.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False
)

In [20]:
# Score the fitted model on the held-out split using mean absolute error.
mae = mean_absolute_error(y_test, xgb_model.predict(X_test))
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 8.856323012835198


In [None]:
# Recompute and print the held-out MAE (same code as the earlier scoring cell);
# useful after xgb_model has been rebuilt or refitted.
mae = mean_absolute_error(y_test, xgb_model.predict(X_test))
print(f"Mean Absolute Error: {mae}")

In [16]:
# Bundle the estimator with its recorded MAE and write both to one joblib file.
# The same filename is used by the earlier model-only dump, so that file is overwritten.
model_filename = 'xgboost_model.joblib'
state = dict(xgb_model=xgb_model, model_error=mae)
dump(state, model_filename)

['xgboost_model.joblib']

In [22]:
# Load the object stored in 'xgboost_model.joblib'. The following cell indexes it
# with "model_error", i.e. it is expected to be the dict saved together with the model.
saved_state = load('xgboost_model.joblib')

In [25]:
# Check the type of the stored error value after the save/load round trip.
type(saved_state["model_error"])

numpy.float64

In [None]:
# Build features and target from the complete processed dataset (no train/test split).
X_full = prepare_time_features(proc_data.dataset.index.values)
y_full = proc_data.dataset['value']

In [None]:
# Refit the model on the complete dataset; no evaluation set is passed here.
# NOTE(review): the `mae` computed earlier was measured on the split-trained model
# and does not describe the model produced by this refit.
xgb_model.fit(
    X_full,
    y_full,
    # eval_set=[(X_full, y_full)],  # Use full dataset for evaluation
    verbose=False
)

In [None]:
def predict_future(model, last_date, periods=168, freq=None):  # 168 steps = 1 week at hourly frequency
    """
    Generate predictions for the `periods` timestamps that follow `last_date`.

    Args:
        model: Fitted estimator exposing ``predict``; it receives the feature
            frame produced by ``prepare_time_features``.
        last_date: Last timestamp in the training data (anything
            ``pd.Timestamp`` accepts). The forecast starts one step after it.
        periods: Number of future steps to generate. One step is one `freq`
            interval, so the default of 168 equals one week only for hourly data.
        freq: Step size as a pandas frequency (alias string, offset or
            timedelta). When omitted, the notebook-level
            ``proc_data.sampling_frequency`` is used, as before.

    Returns:
        pd.DataFrame: columns 'date_time' (future timestamps) and
        'predicted_value' (model output for each timestamp).
    """
    if freq is None:
        # Backward-compatible default: frequency of the processed dataset (notebook global)
        freq = proc_data.sampling_frequency

    # to_offset also accepts aliases without a leading number (e.g. 'D'),
    # which pd.Timedelta rejects
    step = pd.tseries.frequencies.to_offset(freq)

    # Generate future dates, starting one step after the last known timestamp
    future_dates = pd.date_range(
        start=pd.Timestamp(last_date) + step,
        periods=periods,
        freq=step
    )

    # Prepare features for future dates
    future_features = prepare_time_features(future_dates)

    # Make predictions
    predictions = model.predict(future_features)

    # Create results dataframe
    results = pd.DataFrame({
        'date_time': future_dates,
        'predicted_value': predictions
    })

    return results

# Last timestamp present in the processed dataset; the forecast starts right after it
last_date = pd.Timestamp(proc_data.dataset.index[-1])
# Forecast the default horizon (168 steps of the dataset's sampling frequency)
future_predictions = predict_future(xgb_model, last_date)

In [None]:
# Confirm that last_date is a pandas Timestamp.
type(last_date)

In [None]:
# Display the forecast frame returned by predict_future.
future_predictions

In [None]:
# Display the forecast frame again (same expression as the preceding cell).
future_predictions

In [None]:
# Number of forecast rows. `predictions` exists only as a local inside
# predict_future, so the returned frame is inspected instead.
len(future_predictions)

In [None]:
# Show the predicted values. `predictions` exists only as a local inside
# predict_future, so read them from the returned frame.
future_predictions['predicted_value']