## Custom pipeline notebook
Created by Nikolay Pavlychev \
email: nikolaypavlychev@ya.ru

In [5]:
from datetime import datetime

import numpy as np 
import pandas as pd 
import seaborn as sns     

import joblib

from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
import optuna

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

In [7]:
sales = pd.read_csv('./data/sales_train.csv')
item_cat = pd.read_csv('./data/item_categories.csv')
items = pd.read_csv('./data/items.csv')
sample = pd.read_csv('./data/sample_submission.csv')
shop = pd.read_csv('./data/shops.csv')
test = pd.read_csv('./data/test.csv')

## Train model based on dataset

In [11]:
sales_agg_features_target = pd.read_csv('./data/sales_agg_features_target.csv',sep=';')

sales_agg_features_target.head(10)

Unnamed: 0,ID,item_category_id,date_block_num,item_price_sum,item_price_mean,item_cnt_day_sum,item_cnt_day_mean,target
0,0_1000,67,0,290.0,58.0,5.0,1.0,4.0
1,0_1000,67,1,232.0,58.0,4.0,1.0,
2,0_10004,40,1,64.0,64.0,1.0,1.0,
3,0_1001,67,0,116.0,58.0,2.0,1.0,
4,0_10012,40,0,76.0,76.0,1.0,1.0,2.0
5,0_10012,40,1,152.0,76.0,2.0,1.0,
6,0_1002,67,0,116.0,58.0,2.0,1.0,
7,0_1003,67,0,116.0,58.0,2.0,1.0,
8,0_10033,55,1,110.0,110.0,1.0,1.0,
9,0_10038,40,1,69.0,69.0,1.0,1.0,


In [12]:
sales_agg_features_target_train_test = sales_agg_features_target[~sales_agg_features_target['target'].isnull()]

In [13]:
sales_train = sales_agg_features_target_train_test[~sales_agg_features_target_train_test['date_block_num'].isin([25,26,27,28,29,30,31,32,33])]
sales_test = sales_agg_features_target_train_test[sales_agg_features_target_train_test['date_block_num'].isin([25,26,27,28,29,30,31,32,33])]
print(sales_test.shape[0]/sales_agg_features_target_train_test.shape[0], sales_train.shape[0]/sales_agg_features_target_train_test.shape[0])


0.16834284464593388 0.8316571553540661


In [22]:
sales_train['item_category_id'] = sales_train['item_category_id'].astype('category')
sales_test['item_category_id'] = sales_test['item_category_id'].astype('category')
sales_train['date_block_num'] = sales_train['date_block_num'].astype('category')
sales_test['date_block_num'] = sales_test['date_block_num'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_train['item_category_id'] = sales_train['item_category_id'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_test['item_category_id'] = sales_test['item_category_id'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sales_train['date_block_num'] = sales

In [23]:
sales_test.dtypes

ID                     object
item_category_id     category
date_block_num       category
item_price_sum        float64
item_price_mean       float64
item_cnt_day_sum      float64
item_cnt_day_mean     float64
target                float64
dtype: object

In [24]:
X_train = sales_train.drop(['ID','target'],axis=1)
y_train = sales_train['target']

X_test = sales_test.drop(['ID','target'],axis=1)
y_test = sales_test['target']

In [28]:
model = RandomForestRegressor(n_estimators=100,criterion='squared_error',random_state=42)
model_rf = model.fit(X_train, y_train)

In [30]:
predicted_train=model_rf.predict(X_train)
predicted_test=model_rf.predict(X_test)

print(f'OOF score: {sqrt(mean_squared_error(y_train, predicted_train))}')
print(f'HOLDOUT score: {sqrt(mean_squared_error(y_test, predicted_test))}')

OOF score: 2.6872724536725805
HOLDOUT score: 12.040495373656215


## Hyperparams tuning by Optuna

In [32]:
def objective(trial):
    '''
    The function initialize search space hyperparams, train model, predict values, evaluate by metrics.

            Parameters:
                    trial (object of Optuna class)
        
            Returns:
                    RMSE (int): evaluate model by RMSE
    '''
    criterion = trial.suggest_categorical('criterion', ['mse'])
    bootstrap = trial.suggest_categorical('bootstrap',['True','False'])
    max_depth = trial.suggest_int('max_depth', 10, 100)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 10, 100)
    n_estimators =  trial.suggest_int('n_estimators', 30, 300)
    
    regr = RandomForestRegressor(bootstrap = bootstrap, criterion = criterion,
                                 max_depth = max_depth, max_features = max_features,
                                 max_leaf_nodes = max_leaf_nodes,n_estimators = n_estimators,n_jobs=-1)
    
    
    regr.fit(X_train, y_train)
    predicted_test = regr.predict(X_test)

    return sqrt(mean_squared_error(y_test, predicted_test))
    
    

In [33]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

#Create an instance with tuned hyperparameters
optimised_rf = RandomForestRegressor(bootstrap = study.best_params['bootstrap'], criterion = study.best_params['criterion'],
                                     max_depth = study.best_params['max_depth'], max_features = study.best_params['max_features'],
                                     max_leaf_nodes = study.best_params['max_leaf_nodes'],n_estimators = study.best_params['n_estimators'],
                                     n_jobs=12)
#learn
optimised_rf.fit(X_train ,y_train)

predicted_train=optimised_rf.predict(X_train)
predicted_test=optimised_rf.predict(X_test)

print(f'OOF score: {sqrt(mean_squared_error(y_train, predicted_train))}')
print(f'HOLDOUT score: {sqrt(mean_squared_error(y_test, predicted_test))}')

[I 2023-12-26 05:21:06,324] A new study created in memory with name: no-name-bed403e3-8e3a-49b7-8f42-ae7b2548d819
  warn(
[I 2023-12-26 05:21:13,678] Trial 0 finished with value: 11.79742959262202 and parameters: {'criterion': 'mse', 'bootstrap': 'False', 'max_depth': 58, 'max_features': 'sqrt', 'max_leaf_nodes': 75, 'n_estimators': 218}. Best is trial 0 with value: 11.79742959262202.
  warn(
  warn(
[I 2023-12-26 05:21:17,370] Trial 1 finished with value: 11.874561650807804 and parameters: {'criterion': 'mse', 'bootstrap': 'False', 'max_depth': 18, 'max_features': 'auto', 'max_leaf_nodes': 21, 'n_estimators': 54}. Best is trial 0 with value: 11.79742959262202.
  warn(
[I 2023-12-26 05:21:23,867] Trial 2 finished with value: 11.741365089391046 and parameters: {'criterion': 'mse', 'bootstrap': 'False', 'max_depth': 69, 'max_features': 'log2', 'max_leaf_nodes': 70, 'n_estimators': 206}. Best is trial 2 with value: 11.741365089391046.
  warn(
[I 2023-12-26 05:21:26,070] Trial 3 finished w

OOF score: 4.606263868678547
HOLDOUT score: 11.625470394937363


In [37]:
optimised_rf

In [34]:
joblib.dump(optimised_rf,'optimised_rf.pkl')

['optimised_rf.pkl']