In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the cleaned dataset. The path is relative to the working directory,
# like the other cells in this notebook, instead of an absolute /content path.
df = pd.read_csv("cleaned_data.csv")

# Define features and target: every column except the sales column is a feature
X = df.drop(columns=["item_outlet_sales"])
y = df["item_outlet_sales"]

# Identify numerical and categorical columns.
# NOTE: only int64/float64 columns reach the numeric branch and the categorical
# list is empty, so columns of any other dtype are left out of the model input
# (ColumnTransformer drops unlisted columns by default).
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = []  # Add categorical columns if needed

# Numeric branch: fill missing values with the column mean, then standardise
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown="ignore" avoids errors on unseen categories
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ("num", num_transformer, numerical_features),
    ("cat", cat_transformer, categorical_features)
])

# Split data: 20% of the rows are held out for testing, seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate regressors, keyed by the label used in the printed report
models = {
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(
        n_estimators=200, random_state=42
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.1, random_state=42
    ),
    "XGBoost": XGBRegressor(
        n_estimators=200, learning_rate=0.1, objective="reg:squarederror", random_state=42
    ),
}

# Fit every candidate behind the shared preprocessor, score it on the test
# split, and remember the pipeline with the highest R²
results = {}
best_model, best_score = None, -np.inf

for name, model in models.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("regressor", model)])
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = {"RMSE": rmse, "R² Score": r2}
    print(f"{name}: RMSE = {rmse:.2f}, R² Score = {r2:.4f}")

    # Strict comparison: on a tie the earlier candidate is kept
    if r2 > best_score:
        best_score, best_model = r2, pipeline

# Persist the winning pipeline (preprocessing + regressor)
joblib.dump(best_model, "best_model.pkl")
print("Best model saved as best_model.pkl")

# Recap of all candidates
print("\nFinal Model Comparison:")
for model_name, scores in results.items():
    print(f"{model_name}: RMSE = {scores['RMSE']:.2f}, R² Score = {scores['R² Score']:.4f}")


Decision Tree: RMSE = 1495.68, R² Score = 0.1769
Random Forest: RMSE = 1087.89, R² Score = 0.5646
Gradient Boosting: RMSE = 1042.79, R² Score = 0.5999
XGBoost: RMSE = 1092.16, R² Score = 0.5611
Best model saved as best_model.pkl

Final Model Comparison:
Decision Tree: RMSE = 1495.68, R² Score = 0.1769
Random Forest: RMSE = 1087.89, R² Score = 0.5646
Gradient Boosting: RMSE = 1042.79, R² Score = 0.5999
XGBoost: RMSE = 1092.16, R² Score = 0.5611


In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned data from the working directory
df = pd.read_csv("cleaned_data.csv")

# Target is the sales column; every remaining column is a feature
y = df["item_outlet_sales"]
X = df.drop(columns=["item_outlet_sales"])

# Column groups fed to the two preprocessing branches
categorical_features = []  # Add categorical columns if needed
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.to_list()

# Numeric branch: mean imputation followed by standardisation
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numerical_features),
    ("cat", cat_transformer, categorical_features),
])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define models
models = {
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

# Hyperparameter candidates per model, written with the regressor's own
# parameter names; they are prefixed for the pipeline inside the loop below.
param_dist = {
    "LightGBM": {
        "n_estimators": [200, 500, 1000],
        "learning_rate": [0.01, 0.1, 0.2],
        "num_leaves": [31, 50, 100],
        "max_depth": [-1, 10, 20]
    },
    "CatBoost": {
        "iterations": [200, 500, 1000],
        "learning_rate": [0.01, 0.1, 0.2],
        "depth": [6, 10, 12]
    }
}

# Train and evaluate each model
results = {}
best_model = None
best_score = -np.inf

for name, model in models.items():
    pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("regressor", model)
    ])

    # Tune the whole pipeline, not the bare regressor: every CV fold is then
    # imputed and scaled exactly like the final model, using statistics from
    # that fold's training part only. Parameters of a pipeline step are
    # addressed as "<step name>__<parameter>".
    search_space = {
        f"regressor__{param}": values for param, values in param_dist[name].items()
    }
    search = RandomizedSearchCV(
        pipeline, param_distributions=search_space, n_iter=10, cv=3, scoring="r2", n_jobs=-1, random_state=42
    )
    search.fit(X_train, y_train)

    # With the default refit=True the search has already re-trained a clone of
    # the pipeline on all of X_train with the best parameters, so no second
    # fit is needed and candidates do not share a fitted preprocessor.
    tuned_pipeline = search.best_estimator_

    y_pred = tuned_pipeline.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = {"RMSE": rmse, "R² Score": r2}

    print(f"{name}: RMSE = {rmse:.2f}, R² Score = {r2:.4f}")

    # Keep the tuned pipeline with the highest test R² (first one wins a tie)
    if r2 > best_score:
        best_score = r2
        best_model = tuned_pipeline

# Save the best model (preprocessing + tuned regressor in one object)
joblib.dump(best_model, "best_advanced_model.pkl")
print("Best model saved as best_advanced_model.pkl")

# Print final results
print("\nFinal Model Comparison:")
for model_name, metrics in results.items():
    print(f"{model_name}: RMSE = {metrics['RMSE']:.2f}, R² Score = {metrics['R² Score']:.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 781
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 9
[LightGBM] [Info] Start training from score 2202.365232




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 783
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 9
[LightGBM] [Info] Start training from score 2202.365232




LightGBM: RMSE = 1038.38, R² Score = 0.6033




CatBoost: RMSE = 1023.92, R² Score = 0.6143
Best model saved as best_advanced_model.pkl

Final Model Comparison:
LightGBM: RMSE = 1038.38, R² Score = 0.6033
CatBoost: RMSE = 1023.92, R² Score = 0.6143


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

# Read the cleaned data from the working directory
df = pd.read_csv("cleaned_data.csv")

# Target is the sales column; every remaining column is a feature
y = df["item_outlet_sales"]
X = df.drop(columns=["item_outlet_sales"])

# Column groups fed to the two preprocessing branches
categorical_features = []  # Add categorical columns if needed
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.to_list()

# Numeric branch: mean imputation followed by standardisation
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numerical_features),
    ("cat", cat_transformer, categorical_features),
])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer on the test rows; both names now hold the transformed features
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling are repeatable, in line with random_state=42 used for the split.
keras.utils.set_random_seed(42)

# Define Neural Network model: three ReLU hidden layers and one linear output.
# The input width is declared with keras.Input instead of input_shape= on the
# first Dense layer, which Keras discourages for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1)  # Output layer for regression
])

# Compile model: optimise mean squared error, also track mean absolute error
model.compile(optimizer="adam", loss="mse", metrics=["mae"])

# Train model. The test split is passed as validation_data for per-epoch
# monitoring only; no callback acts on it in this cell.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32, verbose=1)

# Evaluate model: evaluate() returns the loss (MSE) followed by the MAE metric
mse, mae = model.evaluate(X_test, y_test)
print(f"Test MAE: {mae:.2f}")

# Save model and preprocessing pipeline; both are needed to score new data
model.save("neural_network_model.h5")
joblib.dump(preprocessor, "preprocessing_pipeline.pkl")
print("Model and preprocessing pipeline saved.")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 7129044.5000 - mae: 2070.8801 - val_loss: 1506737.7500 - val_mae: 959.5886
Epoch 2/100
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1514639.8750 - mae: 941.4814 - val_loss: 1235328.6250 - val_mae: 821.3552
Epoch 3/100
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1383304.3750 - mae: 867.3333 - val_loss: 1177551.3750 - val_mae: 795.1107
Epoch 4/100
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1351706.7500 - mae: 850.3723 - val_loss: 1159288.6250 - val_mae: 786.9567
Epoch 5/100
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1319466.8750 - mae: 833.1027 - val_loss: 1142660.2500 - val_mae: 777.0204
Epoch 6/100
[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1299050.8750 - mae: 816.5542 - val_loss: 1122360.8750 - val_mae: 761.87



Test MAE: 717.01
Model and preprocessing pipeline saved.


In [None]:
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Read the cleaned data from the working directory
df = pd.read_csv("cleaned_data.csv")

# Target is the sales column; every remaining column is a feature
y = df["item_outlet_sales"]
X = df.drop(columns=["item_outlet_sales"])

# Column groups fed to the two preprocessing branches
categorical_features = []  # Add categorical columns if needed
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.to_list()

# Numeric branch: mean imputation followed by standardisation
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ("num", num_transformer, numerical_features),
    ("cat", cat_transformer, categorical_features),
])

# Hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer on the test rows; both names now hold the transformed features
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling are repeatable, in line with random_state=42 used
# for the data split.
keras.utils.set_random_seed(42)

# Define optimized Neural Network model: two wide ReLU blocks regularised with
# batch normalisation and dropout, two narrower ReLU layers, one linear output.
# The input width is declared with keras.Input instead of input_shape= on the
# first Dense layer, which Keras discourages for Sequential models.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),

    Dense(256, activation="relu"),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation="relu"),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation="relu"),
    Dense(32, activation="relu"),

    Dense(1)  # Output layer for regression
])

# Compile model with AdamW optimizer & MSE loss; MAE is tracked as a metric
model.compile(optimizer=keras.optimizers.AdamW(learning_rate=0.001), loss="mse", metrics=["mae"])

# Callbacks for early stopping & learning rate reduction. Both react to
# val_loss, so the validation data directly shapes the final weights.
callbacks = [
    EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True),
    ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5)
]

# Train model. Validation rows are carved out of the training data
# (validation_split takes the last 20% of the rows, before shuffling) so the
# callbacks never see the test split and the test metrics below stay unbiased.
history = model.fit(X_train, y_train.to_numpy(), validation_split=0.2,
                    epochs=200, batch_size=64, verbose=1, callbacks=callbacks)

# Evaluate model on the untouched test split: evaluate() returns the loss
# (MSE) followed by the MAE metric
y_pred = model.predict(X_test)
mse, mae = model.evaluate(X_test, y_test)
r2 = r2_score(y_test, y_pred)

print(f"Test MAE: {mae:.2f}")
print(f"Test R² Score: {r2:.4f}")

# Save model and preprocessing pipeline; both are needed to score new data
model.save("optimized_neural_network_model.h5")
joblib.dump(preprocessor, "preprocessing_pipeline.pkl")
print("Optimized model and preprocessing pipeline saved.")



Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 7361430.0000 - mae: 2133.8879 - val_loss: 6048858.0000 - val_mae: 1934.8318 - learning_rate: 0.0010
Epoch 2/200
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 3509829.0000 - mae: 1438.5391 - val_loss: 1792265.8750 - val_mae: 962.0882 - learning_rate: 0.0010
Epoch 3/200
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1687996.1250 - mae: 981.6470 - val_loss: 1231679.0000 - val_mae: 802.1645 - learning_rate: 0.0010
Epoch 4/200
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1425796.8750 - mae: 881.9928 - val_loss: 1078898.2500 - val_mae: 754.7499 - learning_rate: 0.0010
Epoch 5/200
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 1265366.5000 - mae: 805.6190 - val_loss: 1056013.3750 - val_mae: 731.6070 - learning_rate: 0.0010
Epoch 6/200
[1m107/107[0m [32m━━━━━━━━━━━



Test MAE: 712.14
Test R² Score: 0.6194
Optimized model and preprocessing pipeline saved.
