# Training Datasets Generation

In [5]:
import pandas as pd
import numpy as np
import random

# Shifts in a working day and the inclusive customer-count range sampled for
# each one: shift 2 is the busiest, shift 1 is moderate, shift 3 is the quietest.
shifts = [1, 2, 3]
CUSTOMER_RANGES = {1: (40, 55), 2: (50, 70), 3: (20, 40)}

N_DATASETS = 1000  # number of synthetic groups; each group has one row per shift
SEED = 42          # fixed seed so a regenerated dataset is always the same
csv_customer_prediction = "training_data_customers.csv"


def generate_training_data(n_datasets=N_DATASETS, seed=SEED):
    """Build the synthetic customers-per-shift training table.

    Parameters
    ----------
    n_datasets : int
        Number of groups to generate. Every group contributes one row per
        entry in ``shifts``.
    seed : int
        Seed for a private ``random.Random`` instance, so the global random
        state is left untouched and the output is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``Shift`` and ``Customers``, indexed by a two-level MultiIndex
        ``("Group", <unnamed position within the group>)``.

    Raises
    ------
    ValueError
        If ``n_datasets`` is smaller than 1.
    """
    if n_datasets < 1:
        raise ValueError("n_datasets must be at least 1")

    rng = random.Random(seed)
    customers = [
        rng.randint(*CUSTOMER_RANGES[shift])
        for _ in range(n_datasets)
        for shift in shifts
    ]
    # Group keys run 1..n_datasets and every row of a group shares its key.
    # The second level is left unnamed on purpose: written to CSV and read
    # back, it becomes the "Unnamed: 1" column that is dropped below.
    index = pd.MultiIndex.from_product(
        [range(1, n_datasets + 1), range(len(shifts))],
        names=["Group", None],
    )
    return pd.DataFrame(
        {"Shift": shifts * n_datasets, "Customers": customers},
        index=index,
    )


# Import the training data. If the CSV has not been created yet (fresh
# checkout), generate it once and save it, so the notebook runs top to bottom.
try:
    df = pd.read_csv(csv_customer_prediction)
except FileNotFoundError:
    generate_training_data().to_csv(csv_customer_prediction)
    df = pd.read_csv(csv_customer_prediction)

# Drop the unnamed within-group position column written by the MultiIndex.
final_df = df.drop(columns="Unnamed: 1")

# Machine Learning Model Creation

In [6]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [7]:
# Feature matrix (double brackets keep it 2-D) and the regression target
X, y = final_df[["Shift"]], final_df["Customers"]

# First cut: 70% for training, the remaining 30% is held out
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Second cut: halve the held-out part into validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [9]:
## Hyperparameter tuning: choose the best parameters for each model

# **Feature Scaling for Linear Regression**
# Standalone scaled copies are kept for any cell that still reads them; the
# tuned linear model below no longer depends on them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# **1. Hyperparameter Tuning for Linear Regression**
# The scaler lives inside the pipeline, so the estimator always receives raw
# features: tuning, later re-fitting and prediction all apply the same
# preprocessing, and each CV fold fits its scaler on that fold's training rows
# only. NOTE: best_lin_reg is therefore a Pipeline; the fitted regressor is
# available as best_lin_reg.named_steps["model"].
linear_param_grid = {"model__fit_intercept": [True, False]}

lin_reg = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])
lin_grid = GridSearchCV(lin_reg, linear_param_grid, cv=5, scoring="r2", n_jobs=-1)
lin_grid.fit(X_train, y_train)

best_lin_reg = lin_grid.best_estimator_
# Strip the "model__" step prefix so the report shows plain parameter names.
best_lin_params = {
    key.split("__", 1)[-1]: value for key, value in lin_grid.best_params_.items()
}
print(f"Best Linear Regression Parameters: {best_lin_params}")

# **2. Hyperparameter Tuning for Random Forest**
rf_param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

rf_model = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring="r2", n_jobs=-1)
rf_grid.fit(X_train, y_train)

best_rf = rf_grid.best_estimator_
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")

# **3. Hyperparameter Tuning for XGBoost**
xgb_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42)
xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, cv=5, scoring="r2", n_jobs=-1)
xgb_grid.fit(X_train, y_train)

best_xgb = xgb_grid.best_estimator_
print(f"Best XGBoost Parameters: {xgb_grid.best_params_}")

# **Models with Best Hyperparameters**
# Display name -> tuned estimator (each already refit on X_train by GridSearchCV).
models = {
    "Linear Regression": best_lin_reg,
    "Random Forest": best_rf,
    "XGBoost": best_xgb
}

Best Linear Regression Parameters: {'fit_intercept': True}
Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best XGBoost Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}


## Model Evaluation

In [11]:
# Test-set metrics and raw test predictions, both keyed by model display name
results, predictions = {}, {}

for label, estimator in models.items():
    # Fit on the training split, then score against the held-out test split
    estimator.fit(X_train, y_train)
    test_pred = estimator.predict(X_test)
    predictions[label] = test_pred

    scores = {}
    scores["R2 Score"] = r2_score(y_test, test_pred)
    scores["MAE"] = mean_absolute_error(y_test, test_pred)
    scores["MSE"] = mean_squared_error(y_test, test_pred)
    # Mean R2 over 5 folds of the training split
    fold_r2 = cross_val_score(estimator, X_train, y_train, cv=5, scoring="r2")
    scores["Cross-Validation R2"] = fold_r2.mean()
    results[label] = scores

# Report every model's metrics, four decimals each
for label, scores in results.items():
    print(f"📊 {label} Model Performance:")
    for metric_name in scores:
        print(f"{metric_name}: {scores[metric_name]:.4f}")
    print("-" * 30)


📊 Linear Regression Model Performance:
R2 Score: 0.2548
MAE: 10.0073
MSE: 138.2747
Cross-Validation R2: 0.2750
------------------------------
📊 Random Forest Model Performance:
R2 Score: 0.8289
MAE: 4.8504
MSE: 31.7502
Cross-Validation R2: 0.8223
------------------------------
📊 XGBoost Model Performance:
R2 Score: 0.8288
MAE: 4.8516
MSE: 31.7743
Cross-Validation R2: 0.8223
------------------------------


### Example: predicting the number of customers per shift

In [12]:
# Generate predictions for a week: one expected customer count per shift.
# The models were fitted on a DataFrame with a "Shift" column, so the query is
# built as a DataFrame with the same column name; passing a bare ndarray makes
# scikit-learn warn that the input has no valid feature names.
week_1_data = pd.DataFrame({"Shift": [1, 2, 3]})
week_1_predictions = {}

for name, model in models.items():
    week_1_pred = model.predict(week_1_data)
    # Pair every queried shift with its prediction instead of hard-coding
    # positions 0..2, and round to a whole number of customers.
    week_1_predictions[name] = {
        f"Shift {shift}": round(pred)
        for shift, pred in zip(week_1_data["Shift"].tolist(), week_1_pred)
    }

# Print predicted customer numbers
for name, preds in week_1_predictions.items():
    print(f"Predicted Customers for a Week ({name}):")
    print(preds)

Predicted Customers for a Week (Linear Regression):
{'Shift 1': 55, 'Shift 2': 46, 'Shift 3': 38}
Predicted Customers for a Week (Random Forest):
{'Shift 1': 48, 'Shift 2': 60, 'Shift 3': 30}
Predicted Customers for a Week (XGBoost):
{'Shift 1': 48, 'Shift 2': 60, 'Shift 3': 30}


