In [1]:
%pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
# Function to calculate RMSLE
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


In [3]:
N_JOBS = 1

In [4]:
training_data_path = '../input/ml-competition-2024-for-ukrainians/train.csv'
test_data_path = '../input/ml-competition-2024-for-ukrainians/test.csv'
train_df = pd.read_csv(training_data_path)
test_df = pd.read_csv(test_data_path)

print(train_df.shape, test_df.shape)

(378428, 13) (252286, 12)


In [5]:
def process_visibility(train_df, test_df):
    train_df["Item_Visibility"] = train_df["Item_Visibility"].replace(0, np.nan)
    test_df["Item_Visibility"] = test_df["Item_Visibility"].replace(0, np.nan)

    train_df["Item_Visibility"] = np.log1p(train_df["Item_Visibility"])
    test_df["Item_Visibility"] = np.log1p(test_df["Item_Visibility"])

    mean_visibility = train_df["Item_Visibility"].mean()
    print(f"mean_visibility: {mean_visibility}")

    train_df["Item_Visibility"] = train_df["Item_Visibility"].fillna(mean_visibility)
    test_df["Item_Visibility"] = test_df["Item_Visibility"].fillna(mean_visibility)

    return train_df, test_df

train_df, test_df = process_visibility(train_df, test_df)

mean_visibility: 0.06264653074731183


In [6]:
train_df["Item_Outlet_Sales"] = np.log1p(train_df["Item_Outlet_Sales"].values)

In [7]:
target_mean = train_df["Item_Outlet_Sales"].mean()
target_std = train_df["Item_Outlet_Sales"].std()
print(f"target_mean: {target_mean}, target_std: {target_std}")

target_mean: 7.360180293004388, target_std: 0.8535593658882088


In [8]:
def clean_Item_Fat_Content(x):
    if x == "LF" or x == "low fat":
        return "Low Fat"
    elif x == "reg":
        return "Regular"
    else:
        return x
    
def mrp2class(v):
    b0 = 70
    b1 = 135
    b2 = 204
    if v < b0:
        return 0
    elif v >= b0 and v < b1:
        return 1
    elif v >= b1 and v < b2:
        return 2
    else:
        return 3

def extract_features(df):
    df["Item_Fat_Content"] = df["Item_Fat_Content"].apply(lambda x: clean_Item_Fat_Content(x))
    df["Item_MRP_class"] = df["Item_MRP"].apply(lambda x: mrp2class(x))

In [9]:
extract_features(train_df)
extract_features(test_df)

In [25]:
def target_encode_cv(train_df, test_df, target_column, categorical_column, n_splits=5):
    """
    Encodes a categorical variable with the mean of the target variable using cross-validation to avoid target leakage.

    :param train_df: pandas DataFrame containing the training data
    :param test_df: pandas DataFrame containing the test data
    :param target_column: name of the target column in the training data
    :param categorical_column: name of the categorical column to be encoded
    :param n_splits: number of splits for cross-validation
    :return: Two DataFrames with the new encoded feature added to both train and test data
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    train_df[f'{categorical_column}_TE'] = np.nan
    
    # Create a temporary DataFrame for test encoding
    test_encoded = pd.DataFrame()
    
    for train_index, val_index in kf.split(train_df):
        X_train, X_val = train_df.iloc[train_index], train_df.iloc[val_index]
        
        # Compute the mean of the target for each category in the training fold
        category_target_mean = X_train.groupby(categorical_column)[target_column].mean()
        
        
        # Map the means to the validation fold
        train_df.loc[train_df.index[val_index], f'{categorical_column}_TE'] = \
            train_df.loc[train_df.index[val_index], categorical_column].map(category_target_mean)
        
        # Update test encoding by accumulating results from each fold
        fold_test_encoded = test_df[categorical_column].map(category_target_mean)
        test_encoded = pd.concat([test_encoded, fold_test_encoded], axis=1)
    
    # Compute the mean for test encoding over all folds
    test_df[f'{categorical_column}_TE'] = test_encoded.mean(axis=1)
    
    # Handle missing values in test set that were not present in training set
    test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
    category_target_mean
    
    return train_df, test_df, category_target_mean

In [26]:
te_cols = []
cols_for_te = [
    "Item_Identifier",
    "Item_Type",
    "Outlet_Identifier",
    "Item_MRP",
    "Item_Weight",
    "Item_Visibility",
]
for c in cols_for_te:
    train_df, test_df, category_target_mean= target_encode_cv(train_df, test_df, "Item_Outlet_Sales", c)
    te_cols.append(f"{c}_TE")
category_target_mean

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[f'{categorical_column}_TE'].fillna(train_df[target_column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace m

Item_Visibility
0.003505    8.438363
0.003512    7.541443
0.003518    7.056490
0.003567    7.336288
0.003568    7.414006
              ...   
0.272727    5.828877
0.273026    5.303317
0.278633    7.486451
0.280383    5.176427
0.283968    7.331634
Name: Item_Outlet_Sales, Length: 61624, dtype: float64