In [1]:
from itertools import product

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split , cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder

from xgboost import XGBRegressor

2023-08-28 10:19:34.883066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load the training data from CSV into a DataFrame, then print its schema
# (column names, non-null counts and dtypes).
# Previously used source, kept for reference: "./Resources/train_modified.csv"
sales_predict_df = pd.read_csv("../Resources/Train_Output_CSV.csv")
sales_predict_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                8523 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                8523 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [3]:
# Build the feature DataFrame by removing two columns from the raw data:
# the prediction target "Item_Outlet_Sales" and the "Item_Identifier" column.
features_df = sales_predict_df.drop(['Item_Outlet_Sales', 'Item_Identifier'], axis=1)




In [4]:
# Pull the prediction target out of the raw frame.
target_df = sales_predict_df['Item_Outlet_Sales']

# Expand the listed columns into indicator (dummy) columns with pd.get_dummies.
# NOTE(review): 'Item_Weight' is a float64 column and
# 'Outlet_Establishment_Year' an int64 column, so each distinct value of
# either one becomes its own indicator column here -- confirm this is
# intended rather than keeping them as numeric features.
features_df = pd.get_dummies(
    features_df,
    columns=[
        'Outlet_Size',
        'Item_Type',
        'Outlet_Location_Type',
        'Outlet_Establishment_Year',
        'Outlet_Identifier',
        'Outlet_Type',
        'Item_Fat_Content',
        'Item_Weight',
    ],
)

# Print the schema of the encoded feature frame.
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Columns: 465 entries, Item_Visibility to Item_Weight_21.35
dtypes: float64(2), uint8(463)
memory usage: 3.9 MB


In [5]:
# Turn the feature frame and the target series into plain NumPy arrays.
X, y = features_df.to_numpy(), target_df.to_numpy()


In [6]:
# Hold out part of the data for evaluation. random_state pins the shuffle so
# the split is the same on every run; no test_size is passed, so
# scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=3,
)

In [7]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [8]:
# Sanity check: report whether any NaN is present in the scaled training
# features or in the training targets.
print("Any missing in X_train_scaled:", np.any(np.isnan(X_train_scaled)))
print("Any missing in y_train:", np.any(np.isnan(y_train)))


Any missing in X_train_scaled: False
Any missing in y_train: False


## Hyperparameter Tuning with Exhaustive Grid Search

In this code snippet, we manually perform hyperparameter tuning for our XGBoost model using exhaustive grid search. The goal is to find the best combination of hyperparameters that yields the highest R-squared value (`R^2`).

### Steps:

1. **Initialize Variables**: Two variables `best_r2` and `best_params` are initialized to keep track of the best R-squared value and the corresponding parameters.

2. **Define Parameters**: Lists of possible values for different hyperparameters like `learning_rate`, `n_estimators`, `max_depth`, `min_child_weight`, and `gamma` are defined.

3. **Nested For Loops**: We use nested for loops to iterate through all possible combinations of the defined hyperparameters.

4. **Train and Evaluate**: Inside the loops, the XGBoost model is trained with each combination and evaluated using R-squared on the test set.

5. **Check and Update**: If the model's R-squared value is better than the previous best, we update `best_r2` and `best_params`.

### Code Execution Time:
Be cautious, as running this exhaustive search could take a considerable amount of time depending on the number of combinations.

### Results:
At the end, the code prints the best R-squared value obtained and the hyperparameters that led to it:

- **Best R^2 score:** 0.6253234489918966
- **Best parameters:** `{'learning_rate': 0.015, 'n_estimators': 450, 'max_depth': 2, 'min_child_weight': 25, 'gamma': 3}`

> **Note**: This method is computationally expensive and might not be feasible for very large datasets or a high number of hyperparameters.


In [9]:



# Manual exhaustive grid search over five XGBoost hyperparameters.
#
# Every combination is trained on the scaled training split and scored with
# R^2 on the scaled test split; the best score and its parameters are kept.
#
# NOTE(review): the candidates are compared on the same test split that is
# later used to report the final score, so that score is selected to be
# favourable -- consider choosing parameters with cross-validation on the
# training split instead.

# Start from -inf rather than 0: R^2 can be zero or negative, and with a
# starting value of 0 such a search would report a score that no model
# achieved together with an empty parameter dict.
best_r2 = float("-inf")
best_params = {}

# Candidate values for each hyperparameter (extend these lists as needed).
learning_rates = [0.01, 0.013, 0.015]
n_estimators = [400, 430, 450]
max_depths = [2, 3, 4]
min_child_weights = [20, 23, 25]
gammas = [3, 4, 5]

# product() walks the combinations in the same order as five nested loops
# (leftmost list varies slowest), so ties are resolved exactly as before.
for lr, est, depth, weight, gamma in product(
    learning_rates, n_estimators, max_depths, min_child_weights, gammas
):
    # Create and fit the regressor for this combination.
    regressor = XGBRegressor(
        learning_rate=lr,
        n_estimators=est,
        max_depth=depth,
        min_child_weight=weight,
        gamma=gamma,
    )
    regressor.fit(X_train_scaled, y_train)

    # Predict on the test split and compute R^2.
    preds = regressor.predict(X_test_scaled)
    current_r2 = r2_score(y_test, preds)

    # Keep this combination only if it strictly improves on the best so far.
    if current_r2 > best_r2:
        best_r2 = current_r2
        best_params = {
            'learning_rate': lr,
            'n_estimators': est,
            'max_depth': depth,
            'min_child_weight': weight,
            'gamma': gamma,
        }

print(f"Best R^2 score: {best_r2}")
print(f"Best parameters: {best_params}")


Best R^2 score: 0.6253234489918966
Best parameters: {'learning_rate': 0.015, 'n_estimators': 450, 'max_depth': 2, 'min_child_weight': 25, 'gamma': 3}


## Hyperparameter Tuning with Nested Loops

In this section, we are using nested `for` loops to iterate through various combinations of hyperparameters for our XGBoost model. This manual grid search approach aims to find the combination that yields the highest R-squared value.

### Considerations:
- **Time Complexity**: Running this code could take a significant amount of time depending on the dataset size and the number of combinations to be tested.
- **Optimization Strategy**: The loop iterates through different combinations of `learning_rate`, `n_estimators`, `max_depth`, `min_child_weight`, and `gamma`. For each combination, the model is trained and its R-squared score is calculated. The hyperparameters that provide the highest R-squared are then selected.

### Tips for Speeding Up Execution:
1. **Reduce Iterations**: Consider reducing the number of iterations in your loops to make the process faster.
2. **Subset of Data**: Use a smaller subset of your training data for quicker execution.
3. **Parallel Computing**: Utilize parallel computing capabilities, if available, to speed up the process.




In [10]:

# imputer = SimpleImputer(strategy="mean")
# y_train = imputer.fit_transform(y_train.reshape(-1, 1)).ravel()


## Reference
[Parameter Tuning: A Complete Guide with Python Codes](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#h-general-parameters)

In [19]:
# Define the final XGBoost regressor. learning_rate, n_estimators, max_depth,
# min_child_weight and gamma are the values printed as "Best parameters" by
# the grid-search cell; subsample and colsample_bytree are set in addition.
regressor = XGBRegressor(
    n_estimators=450,
    learning_rate=0.015,
    max_depth=2,
    min_child_weight=25,
    gamma=3,
    subsample=0.9,
    colsample_bytree=0.65,
)



## Cross-Validation Using KFold

In this section, we will apply K-Fold Cross-Validation to assess how well the model generalizes to new data.
K-Fold Cross-Validation splits the training dataset into 'K' number of subsets, or folds.
The model is trained on K-1 of these folds and validated on the remaining one.
The process is repeated K times, each time using a different fold as the validation set.
The average of all K runs gives us a more robust measure of model performance.

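We will use `KFold` from scikit-learn to create 5 folds. Our performance metric is $R^2$.

> **Note**: The cross-validation code in the next cell is currently commented out; only the final model fit is executed.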


In [20]:
# K-fold cross-validation is currently disabled. To run it, uncomment the
# block below:
#
# # Create a KFold object
# kf = KFold(n_splits=5, shuffle=True, random_state=3)
#
# # Run cross-validation
# cv_scores = cross_val_score(regressor, X_train_scaled, y_train, cv=kf, scoring='r2')
#
# # Print the mean R^2 score
# print(f'Mean R^2 Score: {np.mean(cv_scores)}')

# Fit the model to the scaled training data
regressor.fit(X_train_scaled, y_train)


In [21]:
# Run the fitted model on the scaled test split to predict sales
sales_data_predictions = regressor.predict(X_test_scaled)


In [22]:

# Predict sales for the scaled test split (repeats the previous cell's
# prediction so this cell can be run on its own).
sales_data_predictions = regressor.predict(X_test_scaled)

# Score the predictions against the true test targets with R^2 and report it.
r2_sales = r2_score(y_test, sales_data_predictions)
print('R Squared value = ', r2_sales)


R Squared value =  0.6238868891857636



1. **Grid Search in Scikit-learn**: A useful guide on how grid search is usually done in scikit-learn.  
   - [Scikit-learn GridSearchCV Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

2. **XGBoost Parameters**: Comprehensive guide on XGBoost's parameters, what they mean, and how they affect the model.
   - [XGBoost Parameters](https://xgboost.readthedocs.io/en/latest/parameter.html)

3. **R-squared metric**: Explanation and uses of the R-squared metric in regression problems.
   - [Wikipedia: Coefficient of Determination](https://en.wikipedia.org/wiki/Coefficient_of_determination)

4. **Cross-Validation**: Useful if you'd like to understand why you might opt for K-Fold cross-validation.
   - [Cross-validation: evaluating estimator performance](https://scikit-learn.org/stable/modules/cross_validation.html)

5. **Hyperparameter Optimization**: A broader look at strategies for hyperparameter optimization.
   - [Hyperparameter Optimization in Machine Learning Models](https://www.datacamp.com/community/tutorials/parameter-optimization-machine-learning-models)

6. **Computational Cost of Grid Search**: An article discussing the computational expenses and trade-offs of using Grid Search.
   - [The Computational Complexity of Grid Search](https://stats.stackexchange.com/questions/29133/the-computational-complexity-of-grid-search)
