In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor as knr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from utils.model_utils import Model_utils 
from utils.preprocess import LoadData 
from datetime import datetime

## XGBoost

#### Load and preprocess Data

The XGBoost and Random Forest models use the same preprocessing pipeline, since both are tree-based models.

In [None]:
# Fresh loader for the train/validation data and the feature/target column names.
load_data = LoadData()

# Lag the 'medio_diario' column by 1..7 rows (one lagged column per lag value).
lag_values = list(range(1, 8))
lag_columns_list = ['medio_diario'] * len(lag_values)

# Add the lagged columns to the train/validation frame, then drop the first
# 7 rows after lagging (presumably the rows without a complete lag history).
data = load_data.create_lag_columns(load_data.data, lag_columns_list, lag_values).iloc[7:]

# Column names are read only after the lag step, as in the original flow.
features = load_data.features
target = load_data.target

X, y = data[features], data[target]

# Tree-based models such as XGBoost do not need feature scaling, so both scalers stay off.
preprocessor = load_data.create_preprocessor(scale_std=False, scale_minmax=False)

# Shuffled 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Fit the preprocessor on the training split only and transform it.
X_train = preprocessor.fit_transform(X_train)

##### Train

In [None]:
# Free-text note saved with this run in the history.
comments = 'best xgboost removing last month data before shuffle'

# Name used to save the model in models/ and its metrics in history.csv.
model_name = 'xgboost'

# XGBRegressor with the selected hyper-parameters.
# - enable_categorical is a boolean flag; it was previously passed as the string
#   'True', which only behaved correctly because a non-empty string is truthy.
# - device='cuda' trains on the GPU; switch it to 'cpu' on machines without CUDA.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    enable_categorical=True,
    n_estimators=1300,
    max_depth=3,
    learning_rate=0.01,
    gamma=0,
    subsample=0.3,
    reg_alpha=0.5,
    reg_lambda=0,
    random_state=42,
    device='cuda',
)

model_utils = Model_utils()

# Train with the fixed parameters above (no grid search); the fitted
# preprocessor and the run note are handed over together with the model.
model_utils.train_model(model, X_train, y_train, model_name, preprocessor=preprocessor, grid_search=False, comments=comments)


##### Validation

In [None]:
# Reload the model and its preprocessor through Model_utils. No paths are given,
# so load_model() decides which artifact to load (presumably the one just
# trained -- TODO confirm).
model, preprocessor = model_utils.load_model()

# Apply the fitted preprocessor to the held-out validation split.
# NOTE: X_test is overwritten with its transformed version, so re-running this
# cell would pass already-transformed data through the preprocessor again.
X_test = preprocessor.transform(X_test) 

# Evaluate on the validation split; the returned value is kept as y_pred.
y_pred = model_utils.test_model(X_test, y_test)

In [None]:
# Show the value returned by test_model for the validation split.
y_pred

In [None]:
# Timestamp used to tag this run; it is taken when this cell executes.
# NOTE(review): the Test section builds the saved-model path from model_date, so
# it has to match the timestamp of the files written during training. Replace it
# with that timestamp (e.g. '2024-06-26_21-06-25') when they differ.
model_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Plot predictions for the validation split, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')

#### Test

This step is similar to the validation step, but here we load only the last month of data plus the seven days before it. The last month is not part of the training/validation data, and the seven preceding days are only used to build the lagged features. This lets us make a sequential (in-time) prediction that simulates what will be done in production.

In [None]:
# Build the raw test frame (no preprocessing happens in this cell).
load_data = LoadData()
# Last-month data, used as the sequential test set.
test_data = load_data.last_month_data
data_last_7_days = load_data.data.tail(7) # last 7 rows of the train/validation data; they only supply history for the lagged features

# Stack the 7 history rows on top of the last-month rows and renumber the index.
test_data = pd.concat([data_last_7_days,test_data], ignore_index=True)

In [None]:
# Preprocessing of the sequential test set.

# Lag 'medio_diario' by 1..7 rows, matching the lags used for training.
lag_values = list(range(1, 8))
lag_columns_list = ['medio_diario'] * len(lag_values)

# Add the lagged columns, then drop the 7 leading rows that were only
# prepended to provide history for the lags.
test_data = load_data.create_lag_columns(test_data, lag_columns_list, lag_values).iloc[7:]

# Split into model inputs and ground truth.
X_test, y_test = test_data[load_data.features], test_data[load_data.target]

# Apply the already-fitted preprocessor (imputation, plus scaling if it was used in training).
# NOTE(review): `preprocessor` is still the object loaded in the validation cell;
# the timestamped preprocessor is only loaded in the following cell.
X_test = preprocessor.transform(X_test)

In [None]:
# Load a specific saved model and its preprocessor by name and timestamp.
# NOTE(review): both paths embed model_date, so they only resolve if model_date
# equals the timestamp in the saved file names -- confirm before running.
model_path=f'models/{model_name}__{model_date}.pkl'
preprocessor_path = f'models/preprocessors/{model_name}__{model_date}_preprocessor.pkl'


model, preprocessor = model_utils.load_model(model_path=model_path, preprocessor_path=preprocessor_path)

In [None]:
# Predict on the preprocessed last-month test features.
y_pred = model.predict(X_test)

# Render the predictions in the notebook output.
display(y_pred)

In [None]:
# Plot predictions for the last-month test set, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')

## Random Forest

#### Load and preprocess Data

In [None]:
# Fresh loader for the train/validation data and the feature/target column names.
load_data = LoadData()

# Lag the 'medio_diario' column by 1..7 rows (one lagged column per lag value).
lag_values = list(range(1, 8))
lag_columns_list = ['medio_diario'] * len(lag_values)

# Add the lagged columns to the train/validation frame, then drop the first
# 7 rows after lagging (presumably the rows without a complete lag history).
data = load_data.create_lag_columns(load_data.data, lag_columns_list, lag_values).iloc[7:]

# Column names are read only after the lag step, as in the original flow.
features = load_data.features
target = load_data.target

X, y = data[features], data[target]

# Scaling is not needed for Random Forest (it is a tree-based model), so both scalers stay off.
preprocessor = load_data.create_preprocessor(scale_std=False, scale_minmax=False)

# Shuffled 80/20 train/validation split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Fit the preprocessor on the training split only and transform it.
X_train = preprocessor.fit_transform(X_train)

##### Train

In [None]:
# Free-text note saved with this run in the history. It is defined here so the
# Random Forest run does not reuse the note left over from the XGBoost section
# (and does not fail with a NameError when this section is run on its own).
comments = 'best random forest removing last month data before shuffle'

# Name under which the model is saved.
model_name = 'Random_Forest'

# Random Forest regressor with the selected hyper-parameters and a fixed seed.
model = RandomForestRegressor(
    n_estimators=400,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
)
model_utils = Model_utils()

# Train with the fixed parameters above (no grid search); the fitted
# preprocessor and the run note are handed over together with the model.
model_utils.train_model(model, X_train, y_train, model_name, preprocessor=preprocessor, grid_search=False, comments=comments)


##### Validation

In [None]:
# Reload the model and its preprocessor through Model_utils. No paths are given,
# so load_model() decides which artifact to load (presumably the one just
# trained -- TODO confirm).
model, preprocessor = model_utils.load_model()

# Apply the fitted preprocessor to the held-out validation split (only X_train
# was transformed in the load/preprocess cell).
# NOTE: X_test is overwritten with its transformed version, so re-running this
# cell would pass already-transformed data through the preprocessor again.
X_test = preprocessor.transform(X_test) 

# Evaluate on the validation split; the returned value is kept as y_pred.
y_pred = model_utils.test_model(X_test, y_test)

In [None]:
# Show the value returned by test_model for the validation split.
y_pred

In [None]:
# Timestamp used to tag this run; it is taken when this cell executes.
# NOTE(review): the Test section builds the saved-model path from model_date, so
# it has to match the timestamp of the files written during training. Replace it
# with that timestamp (e.g. '2024-06-26_21-15-21') when they differ.
model_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Plot predictions for the validation split, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')

#### Test

This step is similar to the validation step, but here we load only the last month of data plus the seven days before it. The last month is not part of the training/validation data, and the seven preceding days are only used to build the lagged features. This lets us make a sequential (in-time) prediction that simulates what will be done in production.

In [None]:
# Build the raw test frame (no preprocessing happens in this cell).
load_data = LoadData()

# Last-month data, used as the sequential test set.
test_data = load_data.last_month_data
data_last_7_days = load_data.data.tail(7) # last 7 rows of the train/validation data; they only supply history for the lagged features

# Stack the 7 history rows on top of the last-month rows and renumber the index.
test_data = pd.concat([data_last_7_days,test_data], ignore_index=True)

In [None]:
# Preprocessing of the sequential test set.

# Lag 'medio_diario' by 1..7 rows, matching the lags used for training.
lag_values = list(range(1, 8))
lag_columns_list = ['medio_diario'] * len(lag_values)

# Add the lagged columns, then drop the 7 leading rows that were only
# prepended to provide history for the lags.
test_data = load_data.create_lag_columns(test_data, lag_columns_list, lag_values).iloc[7:]

# Split into model inputs and ground truth.
X_test, y_test = test_data[load_data.features], test_data[load_data.target]

# Apply the already-fitted preprocessor (imputation, plus scaling if it was used in training).
# NOTE(review): `preprocessor` is still the object loaded in the validation cell;
# the timestamped preprocessor is only loaded in the following cell.
X_test = preprocessor.transform(X_test)

In [None]:
# Load a specific saved model and its preprocessor by name and timestamp.
# NOTE(review): both paths embed model_date, so they only resolve if model_date
# equals the timestamp in the saved file names -- confirm before running.
model_path=f'models/{model_name}__{model_date}.pkl'
preprocessor_path = f'models/preprocessors/{model_name}__{model_date}_preprocessor.pkl'


model, preprocessor = model_utils.load_model(model_path=model_path, preprocessor_path=preprocessor_path)

In [None]:
# Predict on the preprocessed last-month test features.
y_pred = model.predict(X_test)

# Render the predictions in the notebook output.
display(y_pred)

In [None]:
# Plot predictions for the last-month test set, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')

## KNN

#### Load and preprocess Data

In [None]:
# Fresh loader for the train/validation data and the feature/target column names.
load_data = LoadData()

# Train/validation frame before lagging.
data = load_data.data

# Lag 'medio_diario' by 1..7 rows and, in addition, every feature column by 1 row.
lag_columns_list = ['medio_diario'] * 7 + list(load_data.features)
lag_values = list(range(1, 8)) + [1] * len(load_data.features)

# Add the lagged columns, then drop the first 7 rows after lagging
# (presumably the rows without a complete lag history).
data = load_data.create_lag_columns(data, lag_columns_list, lag_values).iloc[7:]

# Column names are read only after the lag step, as in the original flow.
features = load_data.features
target = load_data.target

X, y = data[features], data[target]

# Unlike the tree-based models, KNN is distance-based, so standard scaling is
# switched on here (min-max scaling stays off).
preprocessor = load_data.create_preprocessor(scale_std=True, scale_minmax=False)

# 80/20 train/validation split with a fixed seed (train_test_split shuffles by default).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessor on the training split only and transform it.
X_train = preprocessor.fit_transform(X_train)

##### Train

In [None]:
# Free-text note saved with this run in the history.
comments = 'best KNN model. Removed last month data before shuffle. lagging all features.'

# Name under which the model is saved.
model_name = 'KNN'

# K-neighbours regressor with the selected hyper-parameters:
# 5 neighbours, Manhattan distance (p=1), neighbours weighted by inverse distance.
model = knr(
    n_neighbors=5,
    weights='distance',
    algorithm='auto',
    leaf_size=1,
    p=1,
)

model_utils = Model_utils()

# Train with the fixed parameters above. To search hyper-parameters instead,
# call train_model with grid_search=True and a param_grid.
model_utils.train_model(
    model, X_train, y_train, model_name,
    preprocessor=preprocessor, grid_search=False, comments=comments,
)

##### Validation

In [None]:
# Reload the model and its preprocessor through Model_utils. No paths are given,
# so load_model() decides which artifact to load (presumably the one just
# trained -- TODO confirm).
model, preprocessor = model_utils.load_model()

# Apply the fitted preprocessor to the held-out validation split.
# NOTE: X_test is overwritten with its transformed version, so re-running this
# cell would pass already-transformed data through the preprocessor again.
X_test = preprocessor.transform(X_test) 

# Evaluate on the validation split; the returned value is kept as y_pred.
y_pred = model_utils.test_model(X_test, y_test)

In [None]:
# Show the value returned by test_model for the validation split.
y_pred

In [None]:
# Timestamp used to tag this run; it is taken when this cell executes.
# NOTE(review): the Test section builds the saved-model path from model_date, so
# it has to match the timestamp of the files written during training. Replace it
# with that timestamp (e.g. '2024-06-27_13-41-11') when they differ.
model_date = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Plot predictions for the validation split, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')

#### Test

This step is similar to the validation step, but here we load only the last month of data plus the seven days before it. The last month is not part of the training/validation data, and the seven preceding days are only used to build the lagged features. This lets us make a sequential (in-time) prediction that simulates what will be done in production.

In [None]:
# Build the raw test frame (no preprocessing happens in this cell).
load_data = LoadData()
# Last-month data, used as the sequential test set.
test_data = load_data.last_month_data
data_last_7_days = load_data.data.tail(7) # last 7 rows of the train/validation data; they only supply history for the lagged features

# Stack the 7 history rows on top of the last-month rows and renumber the index.
test_data = pd.concat([data_last_7_days,test_data], ignore_index=True)

In [None]:
# Preprocessing of the sequential test set.

# Lag 'medio_diario' by 1..7 rows and every feature column by 1 row,
# matching the lags used for the KNN training data.
lag_columns_list = ['medio_diario'] * 7 + list(load_data.features)
lag_values = list(range(1, 8)) + [1] * len(load_data.features)

# Add the lagged columns, then drop the 7 leading rows that were only
# prepended to provide history for the lags.
test_data = load_data.create_lag_columns(test_data, lag_columns_list, lag_values).iloc[7:]

# Split into model inputs and ground truth.
X_test, y_test = test_data[load_data.features], test_data[load_data.target]

# Apply the already-fitted preprocessor (imputation, plus scaling if it was used in training).
# NOTE(review): `preprocessor` is still the object loaded in the validation cell;
# the timestamped preprocessor is only loaded in the following cell.
X_test = preprocessor.transform(X_test)

In [None]:
# Load a specific saved model and its preprocessor by name and timestamp.
# NOTE(review): both paths embed model_date, so they only resolve if model_date
# equals the timestamp in the saved file names -- confirm before running.
model_path=f'models/{model_name}__{model_date}.pkl'
preprocessor_path = f'models/preprocessors/{model_name}__{model_date}_preprocessor.pkl'


model, preprocessor = model_utils.load_model(model_path=model_path, preprocessor_path=preprocessor_path)

In [None]:
# Predict on the preprocessed last-month test features.
y_pred = model.predict(X_test)

# Render the predictions in the notebook output.
display(y_pred)

In [None]:
# Plot predictions for the last-month test set, labelled "<model_name>__<model_date>".
model_utils.plot_predictions(X_test, y_test, f'{model_name}__{model_date}')