In [1]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import pandas as pd

# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------
# Read the sales table from the CSV file in the local Resources folder.
sales_predict_df = pd.read_csv("./Resources/train_modified.csv")

# The regression target is the 'Item_Outlet_Sales' column. The features are
# all remaining columns once the two identifier columns and the target itself
# have been dropped.
target_df = sales_predict_df['Item_Outlet_Sales']
features_df = sales_predict_df.drop(
    columns=['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']
)

# From here on, work with the underlying arrays rather than pandas objects.
X, y = features_df.values, target_df.values

# Hold out a test split. No test_size is given, so the library default is
# used; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------------------------
# Hyperparameter search
# ---------------------------------------------------------------------------
# Candidate values for each Random Forest hyperparameter. The randomized
# search samples combinations from these lists instead of trying all of them.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Base estimator whose hyperparameters are tuned; seeded for reproducibility.
regressor = RandomForestRegressor(random_state=42)

# Randomized search configuration:
#   - 10 sampled parameter settings, each evaluated with 5-fold cross-validation
#   - candidates are scored with negated mean squared error
#   - n_jobs=-1 requests all available cores
#   - the seed fixes which parameter settings get sampled
random_search = RandomizedSearchCV(
    regressor,
    param_grid,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the scaled training data only.
random_search.fit(X_train_scaled, y_train)

# Keep the estimator that the search selected as best.
best_regressor = random_search.best_estimator_

# ---------------------------------------------------------------------------
# Evaluation on the held-out test split
# ---------------------------------------------------------------------------
# Predict sales for the scaled test features with the selected model.
sales_data_predictions = best_regressor.predict(X_test_scaled)

# Compare the predictions with the true test targets using R squared,
# then report the score.
r2_sales = metrics.r2_score(y_true=y_test, y_pred=sales_data_predictions)
print('R Squared value = ', r2_sales)


R Squared value =  0.5931222080674248
