In [7]:
from itertools import product

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

# Load the modified training data (train_modified.csv).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
DATA_PATH = "/home/mo/Documents/UNC/Sales_Automated_Tool/Alex's Model/Resources/train_modified.csv"
sales_predict_df = pd.read_csv(DATA_PATH)
sales_predict_df.head()


# Separate the predictors from the target. The two identifier columns and the
# target column itself are excluded from the feature set.
non_feature_columns = ['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']
features_df = sales_predict_df.drop(columns=non_feature_columns)
target_df = sales_predict_df['Item_Outlet_Sales']

# Work with the underlying arrays from here on
X = features_df.values
y = target_df.values

# Hold out a test set; random_state pins the split so re-runs are repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

# Fit the scaler on the training split only, so the scaling statistics
# are not derived from the test split.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-fitted scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)




## Hyperparameter Tuning with Nested Loops

In this section, we are using nested `for` loops to iterate through various combinations of hyperparameters for our XGBoost model. This manual grid search approach aims to find the combination that yields the highest R-squared value.

### Considerations:
- **Time Complexity**: Running this code could take a significant amount of time depending on the dataset size and the number of combinations to be tested.
- **Optimization Strategy**: The loop iterates through different combinations of `learning_rate`, `n_estimators`, `max_depth`, `min_child_weight`, and `gamma`. For each combination, the model is trained on the scaled training data and its R-squared score is calculated on the scaled test data. The combination that provides the highest R-squared is then selected.

In [12]:

# Manual grid search over XGBoost hyperparameters: fit one model per
# combination and keep the combination with the highest R^2.
#
# NOTE(review): every candidate is scored on X_test_scaled / y_test, so the
# winning R^2 is selected on the same split it is reported on and is therefore
# an optimistic estimate. Consider a separate validation split or
# cross-validation for the selection step.

# Start from -inf rather than 0: R^2 can be negative, and with a 0 baseline a
# search in which no candidate scores above 0 would leave best_params empty
# and report a best score of 0 that no model actually achieved.
best_r2 = float("-inf")
best_params = {}

# Define possible parameter values (You can extend this list)
learning_rates = [0.01, 0.015, 0.1]
n_estimators = [100, 430, 600]
max_depths = [2, 3, 10]
min_child_weights = [20, 23, 64]
gammas = [3, 4, 8]

# product() yields the combinations in the same order as five nested for
# loops would (the last list varies fastest), so with the strict '>' below
# the first combination to reach the top score is the one that is kept.
for lr, est, depth, weight, gamma in product(
    learning_rates, n_estimators, max_depths, min_child_weights, gammas
):
    params = {'learning_rate': lr, 'n_estimators': est, 'max_depth': depth, 'min_child_weight': weight, 'gamma': gamma}

    # Create and fit the regressor
    regressor = XGBRegressor(**params)
    regressor.fit(X_train_scaled, y_train)

    # Predict and calculate R^2
    preds = regressor.predict(X_test_scaled)
    current_r2 = r2_score(y_test, preds)

    # Check if this R^2 is greater than the previous best
    if current_r2 > best_r2:
        best_r2 = current_r2
        best_params = params

print(f"Best R^2 score: {best_r2}")
print(f"Best parameters: {best_params}")


Best R^2 score: 0.6276390397922874
Best parameters: {'learning_rate': 0.015, 'n_estimators': 600, 'max_depth': 2, 'min_child_weight': 64, 'gamma': 3}


In [11]:

# Hyperparameters for the final XGBoost regressor. The first five values are
# the ones the grid search reported as best; subsample and colsample_bytree
# are set here and were not part of that search.
final_params = {
    'learning_rate': .015,
    'n_estimators': 600,
    'max_depth': 2,
    'min_child_weight': 64,
    'gamma': 3,
    'subsample': .9,
    'colsample_bytree': .65,
}

# Build the model and train it on the scaled training split
regressor = XGBRegressor(**final_params)
regressor.fit(X_train_scaled, y_train)

# Predict sales for the held-out (scaled) test split
sales_data_predictions = regressor.predict(X_test_scaled)

# Score the predictions against the true test targets with R squared
r2_sales = r2_score(y_test, sales_data_predictions)
print('R Squared value = ', r2_sales)

R Squared value =  0.6261376994075071
