In [1]:
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
store = pd.read_csv('store.csv')
train = pd.read_csv('train.csv', low_memory=False)

# Merge the DataFrames on the 'Store' column
trainStore = train.merge(store, on='Store').dropna()

# Encode categorical variables
label_encoder = LabelEncoder()
for column in trainStore.columns:
    if trainStore[column].dtype == 'object':
        trainStore[column] = label_encoder.fit_transform(trainStore[column])

# Split the dataframe into input features (X) and target variable (y)
X = trainStore.drop(['Sales', 'Date'], axis=1)  # Remove 'Date' column as it's not used
y = trainStore['Sales']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# KNN Model with GridSearch
param_grid_knn = {'n_neighbors': [3, 5, 7]}
with tqdm(desc='KNN', total=1) as pbar_knn:
    grid_search_knn = GridSearchCV(KNeighborsRegressor(), param_grid_knn, cv=5)
    grid_search_knn.fit(X_train, y_train)
    best_knn = grid_search_knn.best_estimator_
    pbar_knn.update(1)

# RandomForest Model with GridSearch
param_grid_rf = {'n_estimators': [100, 200, 300], 'max_depth': [3, 5, 7]}
with tqdm(desc='RandomForest', total=1) as pbar_rf:
    grid_search_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, cv=5)
    grid_search_rf.fit(X_train, y_train)
    best_rf = grid_search_rf.best_estimator_
    pbar_rf.update(1)

# GradientBoosting Model with GridSearch
param_grid_gb = {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.5], 'max_depth': [3, 5, 7]}
with tqdm(desc='GradientBoosting', total=1) as pbar_gb:
    grid_search_gb = GridSearchCV(GradientBoostingRegressor(), param_grid_gb, cv=5)
    grid_search_gb.fit(X_train, y_train)
    best_gb = grid_search_gb.best_estimator_
    pbar_gb.update(1)

# DecisionTree Model with GridSearch
param_grid_dt = {'max_depth': [3, 5, 7]}
with tqdm(desc='DecisionTree', total=1) as pbar_dt:
    grid_search_dt = GridSearchCV(DecisionTreeRegressor(), param_grid_dt, cv=5)
    grid_search_dt.fit(X_train, y_train)
    best_dt = grid_search_dt.best_estimator_
    pbar_dt.update(1)

# MLPRegressor Model with GridSearch
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid_mlp = {
    'hidden_layer_sizes': [(64,), (128,), (64, 32), (128, 64)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'batch_size': [32, 64],
    'max_iter': [100, 200, 300]
}
with tqdm(desc='MLPRegressor', total=1) as pbar_mlp:
    grid_search_mlp = GridSearchCV(MLPRegressor(random_state=42), param_grid_mlp, cv=5, scoring='neg_mean_squared_error')
    grid_search_mlp.fit(X_train_scaled, y_train)
    best_mlp = grid_search_mlp.best_estimator_
    pbar_mlp.update(1)

# Predictions and Metrics
models = {'KNN': best_knn, 'RandomForest': best_rf, 'GradientBoosting': best_gb, 'DecisionTree': best_dt, 'MLPRegressor': best_mlp}
for name, model in models.items():
    if name == 'MLPRegressor':
        y_pred = model.predict(X_test_scaled)
    else:
        y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} Model Metrics:')
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R-squared (R2): {r2}')


Using device: cuda


KNN: 100%|██████████| 1/1 [00:32<00:00, 32.43s/it]
RandomForest: 100%|██████████| 1/1 [27:41<00:00, 1661.68s/it]
GradientBoosting:   0%|          | 0/1 [00:00<?, ?it/s]