In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from scipy.stats import randint
import xgboost as xgb
from copy import deepcopy
from catboost import CatBoostRegressor, Pool
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ml-competition-2024-for-ukrainians/sample_submission.csv
/kaggle/input/ml-competition-2024-for-ukrainians/train.csv
/kaggle/input/ml-competition-2024-for-ukrainians/test.csv


In [2]:
train_df = pd.read_csv('/kaggle/input/ml-competition-2024-for-ukrainians/train.csv')
test_df = pd.read_csv('/kaggle/input/ml-competition-2024-for-ukrainians/test.csv')

In [3]:
categoric_features = ['Item_Type',
                      'Outlet_Identifier',
                      'Item_Fat_Content',
                      'Outlet_Size',
                      'Outlet_Location_Type',
                      'Outlet_Type']

cont_features = ['Item_Weight','Item_Visibility','Item_MRP']

In [4]:
def pre_processing_train(df):
    
    df_tr = df.copy()
    
    df_tr['Item_Fat_Content'] = df_tr['Item_Fat_Content'].str.lower().str.strip()
    df_tr['Item_Fat_Content'] = df_tr['Item_Fat_Content'].replace([r'(^| )reg( |$)', 'lf'], ['regular', 'low fat'], regex=True)
    
    
    cols_to_encode = categoric_features
    one_hot = OneHotEncoder(handle_unknown='ignore')
    OH_cols_train = one_hot.fit_transform(df_tr[cols_to_encode])
    category_names = one_hot.get_feature_names_out(categoric_features)
    OH_cols_train = pd.DataFrame(OH_cols_train.toarray(), columns=category_names, index=df_tr.index)
    df_tr = df_tr.drop(cols_to_encode, axis=1)
    df_tr = df_tr.drop(['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Establishment_Year'], axis=1)
    #df_tr = df_tr.drop(['Item_Fat_Content', 'Outlet_Size'], axis=1)
    df_tr = pd.concat([df_tr, OH_cols_train], axis=1)
    
    scale_cols = cont_features
    sc = StandardScaler()
    scaled_data = sc.fit_transform(df_tr[scale_cols])
    df_tr[scale_cols] = scaled_data
    
    df_tr.columns = df_tr.columns.astype(str)
    
    return df_tr, sc, one_hot

In [5]:
def pre_processing_test(df, sc: StandardScaler, one_hot: OneHotEncoder):
    df_test = df.copy()
    
    df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].str.lower().str.strip()
    df_test['Item_Fat_Content'] = df_test['Item_Fat_Content'].replace([r'(^| )reg( |$)', 'lf'], ['regular', 'low fat'], regex=True)
    
    cols_to_encode = categoric_features
    OH_cols_train = one_hot.transform(df_test[cols_to_encode])
    category_names = one_hot.get_feature_names_out(categoric_features)
    OH_cols_train = pd.DataFrame(OH_cols_train.toarray(), columns=category_names, index=df_test.index)
    df_test = df_test.drop(cols_to_encode, axis=1)
    df_test = df_test.drop(['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Establishment_Year'], axis=1)
    #df_test = df_test.drop(['Item_Fat_Content', 'Outlet_Size'], axis=1)
    df_test = pd.concat([df_test, OH_cols_train], axis=1)
    
    scale_cols = cont_features
    scaled_data = sc.transform(df_test[scale_cols])
    df_test[scale_cols] = scaled_data
    
    df_test.columns = df_test.columns.astype(str)
    
    return df_test

In [6]:
y = train_df['Item_Outlet_Sales']
X, sc, one_hot = pre_processing_train(train_df)
X

Unnamed: 0,id,Item_Weight,Item_Visibility,Item_MRP,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0,1.039133,-0.634585,1.530694,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,1.667063,1.017029,1.227692,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2,1.201529,0.803979,1.842414,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,3,-0.140943,-0.226181,0.730313,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,-0.498213,-0.246987,0.404395,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378423,378423,0.530293,1.055443,0.041880,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
378424,378424,0.844258,-0.760213,-0.807009,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
378425,378425,0.064759,-0.150663,1.292600,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
378426,378426,-0.845740,1.439135,-1.419530,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [7]:
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


In [8]:
#XGBRegressor  best params
xgb_params = {
    'objective': 'reg:gamma',  #  для регресії
    'n_estimators': 200,  # Кількість дерев
    'max_depth': 4,  # Максимальна глибина дерев
    'learning_rate': 0.05,  # Крок оновлення
    'gamma': 0,  # Параметр регуляризації
    'reg_lambda': 1,  # Параметр регуляризації
}
xgb_model = xgb.XGBRegressor(**xgb_params)  # для регресії
xgb_model.fit(X_train, y_train)

rf_params = {'max_depth': 30,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 132}

rf_model = RandomForestRegressor(**rf_params, random_state=42)

rf_model.fit(X_train, y_train)

In [9]:
# Прогнози з XGBRegressor
xgb_train_pred = xgb_model.predict(X_train)
xgb_valid_pred = xgb_model.predict(X_valid)

# Прогнози з RandomForestRegressor
rf_train_pred = rf_model.predict(X_train)
rf_valid_pred = rf_model.predict(X_valid)

In [10]:
# Створення нових фіч з прогнозів базових моделей
X_train_blend = np.column_stack((xgb_train_pred, rf_train_pred))
X_valid_blend = np.column_stack((xgb_valid_pred, rf_valid_pred))

In [11]:
# Нормалізація даних за допомогою StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_blend)
X_valid_scaled = scaler.transform(X_valid_blend)

In [12]:
# Навчання метамоделі Ridge на нормалізованих даних
meta_model_ridge = Ridge(positive=True, alpha=0.5)
meta_model_ridge.fit(X_train_scaled, y_train)

In [13]:
# Прогнози з базових моделей на тестових даних
xgb_test_pred = xgb_model.predict(X_test)
rf_test_pred = rf_model.predict(X_test)

# Створення нового набору даних для тестових прогнозів
X_test_blend = np.column_stack((xgb_test_pred, rf_test_pred))

# Нормалізація тестових даних для прогнозів
X_test_scaled = scaler.transform(X_test_blend)

# Прогноз з метамоделі Ridge на нормалізованих тестових даних
final_pred = meta_model_ridge.predict(X_test_scaled)


In [14]:
# boosted_model = GradientBoostingRegressor(loss='squared_error', alpha=0.95,
#                           n_estimators=200, max_depth=4,
#                           learning_rate=.1, min_samples_leaf=9,
#                           min_samples_split=9)

# # Навчання ансамблю на навчальних даних
# boosted_model.fit(X_train, y_train)

# # Прогноз на тестових даних
# final_pred = boosted_model.predict(X_valid)

In [15]:
final_pred_sorted = np.sort(final_pred)
final_pred_sorted

array([-525.52920061, -515.69306567, -486.74426764, ..., 7136.22761976,
       7968.88131199, 8359.91739397])

In [16]:
rmse_log = mean_squared_log_error(y_test, final_pred, squared=False)
rmse_log

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.