# In this File, I've engineered a new dataset to examine possible performance gains. 
- I've dissected the 'Item_Identifier' and fine-tuned the 'Fat Content' categories.
- Zero values have been addressed for more accurate results.
- Optimal parameters obtained from grid search have been applied to enhance the analysis further.
- Additionally, I have feature-engineered the 'Item_Sales' column to create a new column for 'Item_Sales_Bins'.



In [4]:
import pandas as pd
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn import metrics

from xgboost import XGBRegressor

#read trained_modified.csv
sales_predict_df = pd.read_csv("Resources/Train_modified_Binned_400.csv")
sales_predict_df.head()


#Create features and target array
features_df = sales_predict_df.drop(columns= ['Item_Identifier','Item_Outlet_Sales'])
# One-hot encoding using pd.get_dummies
features_df = pd.get_dummies(features_df, columns=['Outlet_Identifier','Item_Type','Item_Outlet_Sales_Binned',"Item_Fat_Content","Outlet_Type" ,'Outlet_Location_Type','Outlet_Size' ,'Outlet_Establishment_Year'])

# Extract target variables
target_df = sales_predict_df['Item_Outlet_Sales']
#Set the target array values
X = features_df.values
y = target_df.values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)




In [4]:

# Uee XGBoost to define regressor model
regressor = XGBRegressor(learning_rate=.015,
    n_estimators=600,
    max_depth=2,
    min_child_weight=64,
    gamma=3,
    subsample=.9,
    colsample_bytree=.65)

# Fit the model to the training data
regressor.fit(X_train_scaled, y_train)

#Running the model on the training data to predict sales
sales_data_predictions = regressor.predict(X_test_scaled)

# In order to check the performance of the model we find the R squared Value
r2_sales = metrics.r2_score(y_test, sales_data_predictions)
print('R Squared value = ', r2_sales)

R Squared value =  0.8205777997440779
