In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import Callback
import tensorflow.keras.backend as K
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import BaggingRegressor
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Read the competition's training set, test set and sample submission, in that order.
data, test_data, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/ml-competition-2024-for-ukrainians/train.csv",
        "/kaggle/input/ml-competition-2024-for-ukrainians/test.csv",
        "/kaggle/input/ml-competition-2024-for-ukrainians/sample_submission.csv",
    )
)

In [None]:
# Quick look at the test set: list every column, and for low-cardinality
# columns (fewer than 30 distinct values) also show the values themselves.
for column in test_data.columns:
    distinct_values = test_data[column].unique()
    if len(distinct_values) < 30:
        print(column, distinct_values)
    else:
        print(column)

In [None]:
def compute_statistics(data):
    """Aggregate ``Item_Outlet_Sales`` per (``Item_Type``, ``Outlet_Identifier``) pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns ``Item_Type``, ``Outlet_Identifier`` and
        ``Item_Outlet_Sales``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the two key columns plus ``IOS_mean``,
        ``IOS_std``, ``IOS_min``, ``IOS_max``, ``IOS_median`` and ``IOS_count``.
    """
    stats = (
        data.groupby(['Item_Type', 'Outlet_Identifier'])['Item_Outlet_Sales']
        .agg(
            IOS_mean='mean',
            IOS_std='std',
            IOS_min='min',
            IOS_max='max',
            IOS_median='median',
            IOS_count='count',
        )
        .reset_index()
    )

    # The sample std of a single-row group is NaN; treat it as 0.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place form is deprecated and does not
    # update `stats` once pandas copy-on-write is active.
    stats['IOS_std'] = stats['IOS_std'].fillna(0)

    return stats

def merge_statistics(data, stats):
    """Left-join the per-group statistics onto ``data``.

    Rows are matched on ``Item_Type`` and ``Outlet_Identifier``; rows of
    ``data`` with no matching group keep NaN in the statistic columns.
    Returns a new frame; ``data`` itself is not modified.
    """
    join_keys = ['Item_Type', 'Outlet_Identifier']
    return data.merge(stats, how='left', on=join_keys)

def preprocess_data(train_data, test_data, reference_year=2024):
    """Turn the raw train/test frames into aligned numeric feature matrices.

    Steps: attach per-(Item_Type, Outlet_Identifier) sales statistics computed
    on ``train_data``, recode ``Item_Fat_Content`` to 0/1, derive
    ``Age_of_Outlet``, one-hot encode the categorical columns, align the test
    columns to the training columns, fill missing values with 0, standardise
    the numerical columns and stack everything into NumPy arrays.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows, including the target ``Item_Outlet_Sales``.
    test_data : pandas.DataFrame
        Test rows (no target column required).
    reference_year : int, default 2024
        Year from which ``Outlet_Establishment_Year`` is subtracted to obtain
        ``Age_of_Outlet``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X_train`` and ``X_test``: scaled numerical columns first, followed
        by the integer-cast remaining columns, in the same column order.

    Notes
    -----
    NOTE(review): the ``IOS_*`` features are aggregated from the target of the
    same training rows they are merged back onto, so any validation split cut
    from the returned ``X_train`` has seen its own targets through these
    features.
    """
    # Per-group target statistics are computed on the training frame only and
    # joined onto both frames; the merges return new frames, so the caller's
    # DataFrames are left untouched.
    stats = compute_statistics(train_data)
    train_data = merge_statistics(train_data, stats)
    test_data = merge_statistics(test_data, stats)

    # Recode 'Item_Fat_Content' to 1 (regular) / 0 (low fat). Restricted to
    # that column so identical strings in any other column are never rewritten.
    fat_content_codes = {'Regular': 1, 'reg': 1, 'Low Fat': 0, 'LF': 0, 'low fat': 0}
    train_data['Item_Fat_Content'] = train_data['Item_Fat_Content'].replace(fat_content_codes)
    test_data['Item_Fat_Content'] = test_data['Item_Fat_Content'].replace(fat_content_codes)

    # Add new feature 'Age_of_Outlet'
    train_data['Age_of_Outlet'] = reference_year - train_data['Outlet_Establishment_Year']
    test_data['Age_of_Outlet'] = reference_year - test_data['Outlet_Establishment_Year']

    # Convert categorical variables to dummy variables
    categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
    train_data = pd.get_dummies(train_data, columns=categorical_cols)
    test_data = pd.get_dummies(test_data, columns=categorical_cols)

    # Give the test frame exactly the training columns, in the same order:
    # dummies missing from test are added as 0, test-only columns are dropped.
    test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

    # Missing values become 0. In the test frame this also covers
    # (Item_Type, Outlet_Identifier) pairs that have no training statistics.
    train_data = train_data.fillna(0)
    test_data = test_data.fillna(0)

    # Select numerical features including 'Age_of_Outlet'
    numerical_cols = ['Age_of_Outlet', 'Item_Weight', 'Item_Visibility', 'IOS_mean', 'IOS_std', 'IOS_min', 'IOS_max', 'IOS_median', 'IOS_count']
    X_train_numerical = train_data[numerical_cols]
    X_test_numerical = test_data[numerical_cols]

    # Scale numerical features; the scaler is fitted on the training rows only.
    scaler = StandardScaler()
    X_train_numerical_scaled = scaler.fit_transform(X_train_numerical)
    X_test_numerical_scaled = scaler.transform(X_test_numerical)

    # Everything that is neither numerical nor an identifier/target column is
    # treated as a categorical (dummy) feature.
    # NOTE(review): any other column present in the input that is not listed
    # here also lands in this block and is cast to int below - confirm the
    # input schema has none.
    exclude_cols = numerical_cols + ['Item_Outlet_Sales', 'id', 'Item_Identifier', 'Outlet_Establishment_Year']
    X_train_categorical = train_data.drop(columns=exclude_cols)
    X_test_categorical = test_data.drop(columns=exclude_cols)

    # Convert boolean values to int
    X_train_categorical = X_train_categorical.astype(int)
    X_test_categorical = X_test_categorical.astype(int)

    # Combine numerical and categorical features
    X_train = np.hstack([X_train_numerical_scaled, X_train_categorical.values])
    X_test = np.hstack([X_test_numerical_scaled, X_test_categorical.values])

    return X_train, X_test

In [None]:
# Pull out the regression target and build the model-ready feature matrices.
# NOTE: `test_data` is rebound here from the raw DataFrame to the processed
# NumPy array, so this cell cannot be re-run without reloading the CSVs first.
y = data['Item_Outlet_Sales']
X, test_data = preprocess_data(data, test_data)

In [None]:
# Split the data into training (80%) and validation (20%) subsets.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

In [None]:
class CustomLearningRateScheduler(Callback):
    """Reduce the learning rate when ``val_loss`` stops improving.

    After ``patience`` consecutive epochs without a new best ``val_loss`` the
    optimizer's learning rate is multiplied by ``factor`` (never going below
    ``min_lr``) and the model is rolled back to the weights of the best epoch
    seen so far.

    Parameters
    ----------
    patience : int
        Number of non-improving epochs to wait before reducing the rate.
    factor : float
        Multiplier applied to the current learning rate on each reduction.
    min_lr : float
        Lower bound for the learning rate; once it is reached no further
        reduction (and no rollback) happens.
    """

    def __init__(self, patience=10, factor=0.5, min_lr=1e-6):
        super().__init__()
        self.patience = patience
        self.factor = factor
        self.min_lr = min_lr
        self.best_weights = None
        self.wait = 0
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.best = np.inf

    def on_train_begin(self, logs=None):
        """Reset the tracking state so the callback can be reused across fits."""
        self.wait = 0
        self.best = np.inf
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        """Track the best ``val_loss`` and reduce the learning rate on a plateau."""
        current_loss = (logs or {}).get("val_loss")
        if current_loss is None:
            # No validation data was supplied for this fit; nothing to monitor.
            return

        if np.less(current_loss, self.best):
            self.best = current_loss
            self.wait = 0
            self.best_weights = self.model.get_weights()
            return

        self.wait += 1
        if self.wait < self.patience:
            return

        old_lr = float(K.get_value(self.model.optimizer.learning_rate))
        if old_lr <= self.min_lr:
            # Already at the floor: leave the rate, the weights and the wait
            # counter untouched.
            return

        new_lr = max(old_lr * self.factor, self.min_lr)
        self.model.optimizer.learning_rate.assign(new_lr)
        print(f"\nEpoch {epoch+1}: reducing learning rate to {new_lr}.")
        # best_weights is still None if no epoch has improved on the initial
        # infinity (e.g. val_loss was NaN from the start); skip the rollback
        # in that case instead of passing None to set_weights.
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)
        self.wait = 0


In [None]:
# Fully connected regressor: four ReLU hidden layers, dropout after the first
# two, and a single linear output unit.
model = Sequential([
    Dense(2048, input_dim=X.shape[1], activation='relu'),
    Dropout(0.25),
    Dense(1024, activation='relu'),
    Dropout(0.15),
    Dense(1024, activation='relu'),
    Dense(512, activation='relu'),
    Dense(1),
])

# Callbacks: stop after 10 epochs without val_loss improvement (restoring the
# best weights), and halve the learning rate after 5 such epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
clr_scheduler = CustomLearningRateScheduler(patience=5, factor=0.5, min_lr=1e-6)

model.compile(optimizer=Adam(learning_rate=0.005), loss='mean_squared_logarithmic_error')

# Fit on the training split only. Fitting on the full X would include the
# validation rows, so val_loss (which drives both callbacks) would be measured
# on data the model has already been trained on.
model.fit(X_train, y_train, epochs=150, batch_size=64,
          validation_data=(X_val, y_val), verbose=1,
          callbacks=[early_stopping, clr_scheduler])


In [None]:
# Predict on the preprocessed test matrix and write the submission file.
# The model's output layer is linear, so it can emit negative values; sales
# cannot be negative, so clip the predictions at zero before saving.
predictions = np.clip(model.predict(test_data).flatten(), 0, None)
submission['Item_Outlet_Sales'] = predictions
submission_file_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_file_path, index=False)
submission.head()

In [None]:
# Score the model on the validation split.
# The single-unit output comes back as a column, so flatten it to line up with
# y_val. Clip at zero as well: the linear output layer can produce negative
# values, which mean_squared_log_error refuses with a ValueError.
y_val_pred = np.clip(model.predict(X_val).flatten(), 0, None)
msle_val = mean_squared_log_error(y_val, y_val_pred)
print(f'Mean Squared Logarithmic Error (MSLE) on validation set: {msle_val}')