In [None]:
import os

from google.cloud import bigquery

# Service-account key used to authenticate. The path is taken from the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set, so the
# notebook is not tied to one machine; the original local path is the fallback.
DEFAULT_KEY_PATH = '/Users/servandodavidtorresgarcia/servando/relu/gcp/local-catalyst-398309-52645c242657.json'
credentials_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', DEFAULT_KEY_PATH)

# Initialize the BigQuery client
client = bigquery.Client.from_service_account_json(credentials_path)

# Define your table and destination URI
dataset_name = 'thelook_ecommerce'
table_name = 'distribution_centers'
bucket_name = 'ecommerce_looker_download'
destination_uri = f"gs://{bucket_name}/{table_name}.csv"

# Build the table reference with DatasetReference (Client.dataset() is deprecated).
dataset_ref = bigquery.DatasetReference('bigquery-public-data', dataset_name)
table_ref = dataset_ref.table(table_name)

# Create an extract job that writes the table to the bucket
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    location='US',  # Location must match that of the source table.
)
extract_job.result()  # Wait for job to complete

print(f"Exported {dataset_name}.{table_name} to {destination_uri}")


In [1]:

from src.data.dataset_reader import DatasetReader
from pathlib import Path
import pandas as pd
pd.set_option('display.max_columns', None)
%load_ext autoreload
%autoreload 2
# ignore warnings
import warnings
warnings.filterwarnings('ignore')
data_folder = Path("../data/raw").resolve()

products = DatasetReader(data_folder).get_data_csv('products.csv')
orders = DatasetReader(data_folder).get_data_csv('orders.csv')
order_items = DatasetReader(data_folder).get_data_csv('order_items.csv')
inventory_items = DatasetReader(data_folder).get_data_csv('inventory_items.csv')

In [None]:
# Spot-check: show the order_items row(s) whose id equals 27569.
order_items.query('id == 27569')

In [None]:
# Spot-check: show the orders row(s) whose order_id equals 18975.
orders.query("order_id == 18975")

In [None]:
# Spot-check: show the products row(s) whose id equals 19220.
products.query("id == 19220")

In [None]:
# Spot-check: show the inventory_items row(s) whose id equals 74342.
inventory_items.query("id == 74342")

In [2]:
# Keep only the columns needed downstream; the join keys
# ('order_id', 'product_id' and products' 'id') are carried along for the merges.
order_items_selected = order_items[[
    'product_id', 'order_id', 'status', 'created_at',
    'shipped_at', 'delivered_at', 'returned_at', 'sale_price',
]]
orders_selected = orders[['order_id', 'gender', 'num_of_item']]
products_selected = products[[
    'id', 'cost', 'category', 'name', 'brand', 'retail_price', 'department', 'sku',
]]

# Inner-join each order item to its order (on 'order_id') and then to its
# product (order_items.product_id == products.id).
merged_df = (
    order_items_selected
    .merge(orders_selected, on='order_id', how='inner')
    .merge(products_selected, left_on='product_id', right_on='id', how='inner')
)



In [3]:
# The join keys are no longer needed once the tables are merged.
merged_df = merged_df.drop(columns=['product_id', 'order_id', 'id'])

In [4]:
# Reorder the columns so that 'sku' comes first; all other columns keep their order.
cols = merged_df.columns.tolist()
sku_position = cols.index('sku')
cols = [cols[sku_position]] + cols[:sku_position] + cols[sku_position + 1:]
merged_df = merged_df.loc[:, cols]


In [5]:
# Check whether sale_price ever differs from retail_price before discarding
# retail_price. The result is reported explicitly: a bare expression that is
# not the last line of a cell is evaluated but never displayed.
# The guard makes the cell safe to re-run after the column has been dropped.
if 'retail_price' in merged_df.columns:
    price_mismatch = merged_df.query('sale_price != retail_price')
    print(f"Rows where sale_price != retail_price: {len(price_mismatch)}")
    merged_df = merged_df.drop(columns=['retail_price'])

# Last expression of the cell, so the preview is actually rendered.
merged_df.head()

In [6]:
# Plain-text dump of the first five rows, without the index column.
print(merged_df.head().to_string(index=False))

                             sku   status                     created_at                     shipped_at                   delivered_at returned_at  sale_price gender  num_of_item    cost    category                                    name            brand department
4ACBEDBE977480D19B7B682D4878CAE2  Shipped        2023-08-20 21:21:15 UTC        2023-08-21 07:58:00 UTC                            NaN         NaN         2.5      F            4  1.1075 Accessories Elegant PASHMINA SCARF WRAP SHAWL STOLE Scarf_tradinginc      Women
4ACBEDBE977480D19B7B682D4878CAE2 Complete 2023-09-08 00:38:23.869168 UTC 2023-09-07 19:56:53.869168 UTC 2023-09-08 14:07:53.869168 UTC         NaN         2.5      F            4  1.1075 Accessories Elegant PASHMINA SCARF WRAP SHAWL STOLE Scarf_tradinginc      Women
E0F19F64F086E393CEB0CF4A8C561B51  Shipped        2023-08-24 21:44:30 UTC        2023-08-21 07:58:00 UTC                            NaN         NaN        64.0      F            4 30.2080      Shorts 

In [7]:
# Show every row for one SKU; the recorded output has two rows (one 'Shipped', one 'Complete').
merged_df.query('sku == "4ACBEDBE977480D19B7B682D4878CAE2"')

Unnamed: 0,sku,status,created_at,shipped_at,delivered_at,returned_at,sale_price,gender,num_of_item,cost,category,name,brand,department
0,4ACBEDBE977480D19B7B682D4878CAE2,Shipped,2023-08-20 21:21:15 UTC,2023-08-21 07:58:00 UTC,,,2.5,F,4,1.1075,Accessories,Elegant PASHMINA SCARF WRAP SHAWL STOLE,Scarf_tradinginc,Women
1,4ACBEDBE977480D19B7B682D4878CAE2,Complete,2023-09-08 00:38:23.869168 UTC,2023-09-07 19:56:53.869168 UTC,2023-09-08 14:07:53.869168 UTC,,2.5,F,4,1.1075,Accessories,Elegant PASHMINA SCARF WRAP SHAWL STOLE,Scarf_tradinginc,Women


In [8]:
# List the distinct order-item statuses present in the merged frame.
merged_df.status.unique()

array(['Shipped', 'Complete', 'Processing', 'Returned', 'Cancelled'],
      dtype=object)

In [9]:
# Transform the four timestamp columns from strings to datetime.
for date_col in ('created_at', 'shipped_at', 'delivered_at', 'returned_at'):
    merged_df[date_col] = pd.to_datetime(merged_df[date_col])

In [10]:
# The next two expressions are not the last statement of the cell, so their
# results are computed but never displayed.
merged_df.head()
merged_df.groupby('sku').size().reset_index(name='sales')
# Create the 'sales' column: for every row, the sum of 'num_of_item' over all
# rows that share the row's SKU (the same value is repeated on each row of a SKU).
# NOTE(review): 'num_of_item' was selected from the orders table, not from
# order_items. If it is the item count of the whole order, summing it per SKU
# also counts items of other products in the same order, whereas the row count
# per SKU (the groupby(...).size() above) would not -- confirm which is intended.
merged_df['sales'] = merged_df.groupby('sku')['num_of_item'].transform('sum')


In [None]:
# Preview the first three rows.
merged_df.head(3)

In [None]:
# Re-inspect the rows of one SKU now that the 'sales' column exists.
merged_df.query('sku == "4ACBEDBE977480D19B7B682D4878CAE2"')
# TODO: review SKUs that appear on several rows together with their sales value and decide how to handle them

In [11]:
# Project EDA helper; missing_values_table reports, per column, the number and
# percentage of missing values (see the recorded output below the cell).
from src.data.eda import Eda
eda = Eda()
eda.missing_values_table(merged_df)

The selected dataframe has 15 columns and 5 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
returned_at,162234,89.8
delivered_at,117399,65.0
shipped_at,63212,35.0
brand,142,0.1
name,12,0.0


In [None]:
# Inspect the data for seasonality using the project's inspector helpers.
from src.data.seasonality import SeasonalityInspector, QuickSeasonalityInspector


# Instantiate both inspectors; only `quick_inspector` is used by the next cell.
inspector = SeasonalityInspector()
quick_inspector = QuickSeasonalityInspector()

# Template left from the helper's usage example (data is already loaded here):
# data = pd.read_csv('your_data.csv')

# Time-series plot, currently disabled:
#inspector.plot_time_series(merged_df, 'created_at', 'sales')



In [None]:
import numpy as np

# Estimate the dominant period of the 'sales' series via the project's FFT helper.
dominant_period = quick_inspector.fast_fourier_transform(merged_df, 'sales')
print(f"Dominant period: {dominant_period}")

# N falls back to 7 unless a finite dominant period is found AND the rolling
# mean/variance over that window are not both stable.
N = 7
if np.isfinite(dominant_period):
    is_mean_stable, is_var_stable = quick_inspector.rolling_stats(
        merged_df, 'sales', window=int(dominant_period)
    )
    if not is_mean_stable or not is_var_stable:
        N = int(dominant_period)

print(f"Optimal N: {N}")


In [12]:
# Project feature-engineering helper; `fep` is reused by the following cells.
from src.data.feature_engineering import FeatureEngineeringProcess
fep = FeatureEngineeringProcess()


In [13]:
# Add price/sales correlation features. The recorded missing-values report that
# follows lists six new 'f_corr_<a>_<b>' columns, one per (a, b) pair passed here.
# NOTE(review): the meaning of the positional 7 and of each pair is defined in
# FeatureEngineeringProcess and is not visible in this notebook -- confirm there.
merged_df_fe1 = fep.price_sales_correlation_features_updated(merged_df, 7, [(-5.0, 1.0), (-3.0, 1.0), (-2.0, 1.0), (1.0, 1.0), (-1.0, 0.5), (-1.0, 0.33)],
                                                             'sku', 'created_at', 'sale_price', 'sales')

In [14]:
# Import Eda and bind a fresh instance to `eda` for the missing-value reports that follow.
from src.data.eda import Eda
eda = Eda()

In [15]:
# Missing-value report after adding the correlation features; in the recorded
# run every f_corr_* column is 82.1% missing.
eda.missing_values_table(merged_df_fe1)

The selected dataframe has 21 columns and 11 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
returned_at,162234,89.8
f_corr_-5.0_1.0,148221,82.1
f_corr_-3.0_1.0,148221,82.1
f_corr_-2.0_1.0,148221,82.1
f_corr_1.0_1.0,148221,82.1
f_corr_-1.0_0.5,148221,82.1
f_corr_-1.0_0.33,148221,82.1
delivered_at,117399,65.0
shipped_at,63212,35.0
brand,142,0.1


In [16]:
# Missing price/sales correlation features (columns whose name contains
# 'f_corr_') are replaced with 0.
cols_to_fill_zero = list(filter(lambda name: 'f_corr_' in name, merged_df_fe1.columns))
merged_df_fe1[cols_to_fill_zero] = merged_df_fe1[cols_to_fill_zero].fillna(0)


In [17]:
# Drop the unnecessary date columns (only 'created_at' is kept); done in place.
merged_df_fe1.drop(columns=['shipped_at', 'delivered_at', 'returned_at'], inplace=True)

In [18]:
# Missing-value report after the zero-fill and date-column drop; only 'brand'
# and 'name' remain incomplete in the recorded run.
eda.missing_values_table(merged_df_fe1)

The selected dataframe has 18 columns and 2 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
brand,142,0.1
name,12,0.0


In [19]:
# Add normalized rolling features. The recorded report that follows shows
# 'normalized_log_avg_*' and 'normalized_std_*' columns for 5, 10 and 15 days.
# NOTE(review): the role of the positional 28 is defined in
# FeatureEngineeringProcess.normalize_features and is not visible here -- confirm.
merged_df_fe2 =  fep.normalize_features(merged_df_fe1, [5, 10, 15], 28, 'sku', 'created_at', 'sale_price', 'sales')

In [20]:
# Missing-value report after normalization; in the recorded run all six
# 'normalized_log_avg_*' columns are 100% missing.
eda.missing_values_table(merged_df_fe2)

The selected dataframe has 30 columns and 14 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
normalized_log_avg_price_5_days,180562,100.0
normalized_log_avg_sales_5_days,180562,100.0
normalized_log_avg_price_10_days,180562,100.0
normalized_log_avg_sales_10_days,180562,100.0
normalized_log_avg_price_15_days,180562,100.0
normalized_log_avg_sales_15_days,180562,100.0
normalized_std_price_15_days,180429,99.9
normalized_std_sales_15_days,180429,99.9
normalized_std_price_10_days,174313,96.5
normalized_std_sales_10_days,174313,96.5


In [21]:
# Drop, in place, every column whose values are all missing
# (the 'normalized_log_avg_*' columns in the recorded run).
merged_df_fe2.dropna(axis=1, how='all', inplace=True)


In [22]:
# Missing-value report after removing the all-NaN columns; the
# 'normalized_std_*' columns are still 61%-99.9% missing in the recorded run.
eda.missing_values_table(merged_df_fe2)

The selected dataframe has 24 columns and 8 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
normalized_std_price_15_days,180429,99.9
normalized_std_sales_15_days,180429,99.9
normalized_std_price_10_days,174313,96.5
normalized_std_sales_10_days,174313,96.5
normalized_std_price_5_days,110106,61.0
normalized_std_sales_5_days,110106,61.0
brand,142,0.1
name,12,0.0


In [23]:
# Fill gaps in the remaining 'normalized_*' features: forward-fill first, then
# back-fill whatever is still missing at the top of the frame.
# .ffill()/.bfill() are the direct equivalents of fillna(method='ffill'/'bfill'),
# whose `method` argument is deprecated in recent pandas releases.
# NOTE(review): the fill runs over the whole frame in its current row order,
# not per SKU, so a gap can receive a value taken from a different SKU's rows
# -- confirm that this is intended.
normalized_cols = [col for col in merged_df_fe2.columns if 'normalized_' in col]
merged_df_fe2[normalized_cols] = merged_df_fe2[normalized_cols].ffill().bfill()


In [24]:
# Missing-value report after the forward/backward fill; only 'brand' and 'name'
# remain incomplete in the recorded run.
eda.missing_values_table(merged_df_fe2)

The selected dataframe has 24 columns and 2 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
brand,142,0.1
name,12,0.0


In [25]:
# Filter the frame with the project's stability-period helper; it returns the
# filtered frame plus a second value bound to `insuficient_data`.
# NOTE(review): the meaning of the positional 7 and 0.04 and the content of the
# second return value are defined in FeatureEngineeringProcess -- confirm there.
merged_df_fe3, insuficient_data = fep.filter_stability_periods(merged_df_fe2, 7, 0.04, sku_column='sku', date_column='created_at', price_column='sale_price')

In [26]:
# Missing-value report after the stability filter; the recorded counts for
# 'brand' and 'name' drop slightly (142 -> 122 and 12 -> 11).
eda.missing_values_table(merged_df_fe3)

The selected dataframe has 24 columns and 2 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values
brand,122,0.1
name,11,0.0


In [27]:
# Preview the filtered, feature-engineered frame.
merged_df_fe3.head()

Unnamed: 0,sku,status,created_at,sale_price,gender,num_of_item,cost,category,name,brand,department,sales,f_corr_-5.0_1.0,f_corr_-3.0_1.0,f_corr_-2.0_1.0,f_corr_1.0_1.0,f_corr_-1.0_0.5,f_corr_-1.0_0.33,normalized_std_price_5_days,normalized_std_sales_5_days,normalized_std_price_10_days,normalized_std_sales_10_days,normalized_std_price_15_days,normalized_std_sales_15_days
169130,00003E3B9E5336685200AE85D21B4F5E,Shipped,2022-05-14 21:24:59+00:00,99.0,F,2,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,Women,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169131,00003E3B9E5336685200AE85D21B4F5E,Complete,2022-12-09 03:58:35+00:00,99.0,F,1,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,Women,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
169132,00003E3B9E5336685200AE85D21B4F5E,Complete,2023-05-12 23:08:53+00:00,99.0,F,1,56.232,Pants & Capris,Anne Klein Women's Crop Pant,Anne Klein,Women,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171824,0004D0B59E19461FF126E3A08A814C33,Processing,2020-08-31 08:42:06+00:00,79.949997,F,1,37.656449,Fashion Hoodies & Sweatshirts,The Bradford Exchange Breast Cancer Support Wo...,Bradford Exchange,Women,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171823,0004D0B59E19461FF126E3A08A814C33,Shipped,2022-10-19 10:38:15+00:00,79.949997,F,1,37.656449,Fashion Hoodies & Sweatshirts,The Bradford Exchange Breast Cancer Support Wo...,Bradford Exchange,Women,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
## embeddings
from src.data.embeddings import EmbeddingModel
import torch
from sklearn.preprocessing import LabelEncoder
#merged_df_embeddings = merged_df_fe3.copy()

# Categorical columns that are label-encoded (in place, on merged_df_fe3) and
# fed to the embedding model.
categorical_cols = ['category', 'name', 'brand', 'department']

# One fitted encoder per column, kept so the integer codes can be mapped back.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()  # bound at notebook level, so it stays available after the loop
    merged_df_fe3[col] = le.fit_transform(merged_df_fe3[col])
    label_encoders[col] = le

# One (number of codes, embedding width) pair per column, where the width is
# the integer part of the square root of the number of codes.
embedding_dims = []
for col in categorical_cols:
    n_codes = merged_df_fe3[col].max() + 1
    embedding_dims.append((n_codes, int(n_codes ** 0.5)))
model = EmbeddingModel(embedding_dims)

# Prepare data for embeddings: integer codes as a long tensor, one column per feature.
cat_data = merged_df_fe3[categorical_cols].values
cat_data_tensor = torch.tensor(cat_data, dtype=torch.long)


In [29]:
import torch.nn as nn
import torch.optim as optim

# Regression target: the 'sales' column as a float32 column vector of shape (N, 1).
sales_data = merged_df_fe3['sales'].values
sales_data_tensor = torch.tensor(sales_data, dtype=torch.float32).view(-1, 1)

# Mean squared error loss, Adam optimizer, 10 full-batch epochs.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 10

# NOTE(review): the later concat grows the frame from 24 to 247 columns, which
# suggests `outputs` is (N, 223) while the target is (N, 1); MSELoss would then
# broadcast the target across every output column -- confirm this is intended.
for epoch in range(epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # Forward pass over the whole dataset, then the loss against the sales target.
    outputs = model(cat_data_tensor)
    loss = criterion(outputs, sales_data_tensor)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')


Inside Forward
Epoch [1/10], Loss: 191.0462
Inside Forward
Epoch [2/10], Loss: 190.7918
Inside Forward
Epoch [3/10], Loss: 190.5376
Inside Forward
Epoch [4/10], Loss: 190.2836
Inside Forward
Epoch [5/10], Loss: 190.0298
Inside Forward
Epoch [6/10], Loss: 189.7762
Inside Forward
Epoch [7/10], Loss: 189.5228
Inside Forward
Epoch [8/10], Loss: 189.2696
Inside Forward
Epoch [9/10], Loss: 189.0167
Inside Forward
Epoch [10/10], Loss: 188.7640


In [30]:

# Run the trained model once more, without gradient tracking, and keep its
# output as a NumPy array. The module is called directly (as in the training
# loop) rather than through .forward(), so any registered hooks also run.
with torch.no_grad():
    embeddings = model(cat_data_tensor).numpy()


Inside Forward


In [31]:
# Missing-value report after label encoding; the recorded run shows no column
# with missing values any more.
eda.missing_values_table(merged_df_fe3)

The selected dataframe has 24 columns and 0 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values


In [39]:
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows by
# position rather than by the original row labels.
merged_df_fe3.reset_index(drop=True, inplace=True)
embeddings_df = pd.DataFrame(embeddings).reset_index(drop=True)
merged_df_embeddings = pd.concat([merged_df_fe3, embeddings_df], axis=1)


In [40]:
# Check lengths: both inputs of the concat must have the same number of rows.
# Note that this check runs after the concat has already been performed.
assert len(merged_df_fe3) == len(embeddings_df), "Dataframes have different lengths."


In [41]:
# Missing-value report for the combined frame (247 columns, none with missing
# values in the recorded run).
eda.missing_values_table(merged_df_embeddings)

The selected dataframe has 247 columns and 0 columns with missing values.


Unnamed: 0,Missing Values,% of Total Values


In [33]:
#merged_df_embeddings = pd.concat([merged_df_fe3, pd.DataFrame(embeddings)], axis=1)
#merged_df_embeddings.drop(['category', 'name', 'brand', 'department'], axis=1, inplace=True)


In [45]:
import numpy as np

# Create an absolute correlation matrix to check for multicollinearity.
# The numeric/bool columns are selected explicitly: the frame also holds string
# and datetime columns, which older pandas dropped silently inside corr() but
# recent pandas rejects with an error.
numeric_features = merged_df_embeddings.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_features.corr().abs()

# Keep only the strict upper triangle so each pair of features is looked at once.
# The mask uses the builtin `bool`; the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# A feature is flagged when its correlation with any earlier feature exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
print(f"Features to drop: {to_drop}")

Features to drop: ['cost', 'f_corr_-3.0_1.0', 'f_corr_-2.0_1.0', 'f_corr_1.0_1.0', 'f_corr_-1.0_0.5', 'f_corr_-1.0_0.33', 222]


In [48]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd

# VIF is only defined for numeric data, so restrict the frame to these dtypes.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
new_df = merged_df_embeddings.select_dtypes(include=numerics)

# One VIF per numeric column, computed against all the other numeric columns.
feature_matrix = new_df.values
vif_data = pd.DataFrame({
    "feature": new_df.columns,
    "VIF": [
        variance_inflation_factor(feature_matrix, position)
        for position in range(new_df.shape[1])
    ],
})

# Features whose VIF exceeds 10 are reported as candidates for removal.
high_vif_features = vif_data.loc[vif_data["VIF"] > 10, "feature"].tolist()

print(f"Features with high VIF should be removed or examined: {high_vif_features}")



Features with high VIF should be removed or examined: ['sale_price', 'cost', 'department', 'f_corr_-5.0_1.0', 'f_corr_-3.0_1.0', 'f_corr_-2.0_1.0', 'f_corr_1.0_1.0', 'f_corr_-1.0_0.5', 'f_corr_-1.0_0.33', 222]


In [51]:
# Show both candidate lists side by side: high pairwise correlation first, high VIF second.
print(to_drop)
print(high_vif_features)

['cost', 'f_corr_-3.0_1.0', 'f_corr_-2.0_1.0', 'f_corr_1.0_1.0', 'f_corr_-1.0_0.5', 'f_corr_-1.0_0.33', 222]
['sale_price', 'cost', 'department', 'f_corr_-5.0_1.0', 'f_corr_-3.0_1.0', 'f_corr_-2.0_1.0', 'f_corr_1.0_1.0', 'f_corr_-1.0_0.5', 'f_corr_-1.0_0.33', 222]


In [54]:
# Remove the redundant correlation features and embedding column 222.
# 'cost' was flagged by both checks above but is deliberately not in this list.
redundant_features = [
    'f_corr_-3.0_1.0', 'f_corr_-2.0_1.0', 'f_corr_1.0_1.0',
    'f_corr_-1.0_0.5', 'f_corr_-1.0_0.33', 222,
]
train_dataset = merged_df_embeddings.drop(columns=redundant_features)

In [56]:
# The label-encoded categorical columns fed the embedding model; remove them
# from the training frame.
encoded_categoricals = ['category', 'name', 'brand', 'department']
train_dataset = train_dataset.drop(columns=encoded_categoricals)

In [59]:
# Derive calendar features from 'created_at' with the project helper; the
# feature frames shown later contain a 'created_at_day_of_the_year' column.
train_dataset  =  fep.datetime_transform(train_dataset,'created_at', ['day_of_the_year'])


In [63]:
# Drop rows with status "Returned" or "Cancelled".
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments in later cells operate on a real frame rather than
# on a slice (which would trigger pandas' SettingWithCopy warning).
train_dataset = train_dataset.query('status != "Returned" and status != "Cancelled"').copy()

# Last expression of the cell, so the remaining statuses are actually displayed.
train_dataset.status.unique()


In [65]:
# Label-encode 'status' and 'gender'.
# A fresh LabelEncoder is fitted for each column and stored under the column
# name. Reusing the `le` left over from the embedding cell would refit the very
# object stored as label_encoders['department'] and overwrite its classes.
for encoded_col in ('status', 'gender'):
    column_encoder = LabelEncoder()
    train_dataset[encoded_col] = column_encoder.fit_transform(train_dataset[encoded_col])
    label_encoders[encoded_col] = column_encoder



In [68]:
# Order the rows chronologically by creation timestamp.
train_dataset = train_dataset.sort_values(by=['created_at'])

In [69]:
# Print the earliest and latest 'created_at' timestamps in the training frame.
print(train_dataset.created_at.min())
print(train_dataset.created_at.max())


2019-01-11 13:05:59+00:00
2023-09-11 18:25:47.230646+00:00


In [71]:
from typing import Dict, Union
import pandas as pd

def describe_time_series(
    df: pd.DataFrame,
    date_column: str,
    sales_column: str,
    copy: bool = False,
) -> Dict[str, Union[int, Dict[int, Dict[str, Union[int, Dict[int, float]]]]]]:
    """
    Summarise a sales time series per calendar year and month.

    Args:
    - df (pd.DataFrame): DataFrame containing the time series data.
    - date_column (str): Column holding the dates; it is parsed with
      ``pd.to_datetime``.
    - sales_column (str): Column whose values are averaged per month.
    - copy (bool): When False (the default, matching the historical behaviour)
      the function works directly on ``df``: ``date_column`` is replaced by its
      parsed version and two helper columns, ``'year'`` and ``'month'``, are
      added to the caller's frame. Pass True to leave ``df`` untouched.

    Returns:
    - Dict with two keys:
      ``"total_years"``: number of distinct years in the data;
      ``"years_detail"``: ``{year: {"unique_months": int,
      "avg_sales_per_month": {month: mean of sales_column}}}``, with years in
      order of first appearance. Year and month keys are plain Python ints.

    Raises:
    - KeyError: if ``date_column`` or ``sales_column`` is not a column of ``df``.
    """
    frame = df.copy() if copy else df

    # Convert date column to datetime if not already
    frame[date_column] = pd.to_datetime(frame[date_column])

    # Helper columns used for grouping (they stay on `df` unless copy=True)
    frame['year'] = frame[date_column].dt.year
    frame['month'] = frame[date_column].dt.month

    years_detail = {}
    # Rows without a date have no year; dropna() keeps them from producing a
    # NaN-keyed, empty entry.
    for year in frame['year'].dropna().unique():
        year_data = frame[frame['year'] == year]

        # Mean of the sales column for each month present in this year
        monthly_mean = year_data.groupby('month')[sales_column].mean()

        # int()/float() turn numpy scalars into plain Python values so the
        # summary prints cleanly and can be serialised
        years_detail[int(year)] = {
            "unique_months": int(year_data['month'].nunique()),
            "avg_sales_per_month": {
                int(month): float(value) for month, value in monthly_mean.items()
            },
        }

    return {
        "total_years": int(frame['year'].nunique()),
        "years_detail": years_detail,
    }

# Example usage (assuming 'merged_df' contains 'created_at' and 'sales' columns)
# describe_time_series(merged_df, 'created_at', 'sales')






In [72]:
# Summarise sales per year and month. Note that the call also adds 'year' and
# 'month' columns to train_dataset itself (see the function body).
describe_time_series(train_dataset, 'created_at', 'sales')

{'total_years': 5,
 'years_detail': {2019: {'unique_months': 12,
   'avg_sales_per_month': {1: 12.666666666666666,
    2: 13.74,
    3: 13.682926829268293,
    4: 13.419117647058824,
    5: 13.859903381642512,
    6: 13.095959595959595,
    7: 14.178807947019868,
    8: 13.647058823529411,
    9: 13.664021164021165,
    10: 13.160401002506266,
    11: 13.606490872210953,
    12: 13.599616858237548}},
  2020: {'unique_months': 12,
   'avg_sales_per_month': {1: 13.807692307692308,
    2: 13.877697841726619,
    3: 13.811475409836065,
    4: 13.593667546174142,
    5: 13.701594533029613,
    6: 13.743362831858407,
    7: 13.716026241799439,
    8: 13.991035856573705,
    9: 13.58987090367428,
    10: 13.746280991735537,
    11: 14.052842273819055,
    12: 13.537164750957855}},
  2021: {'unique_months': 12,
   'avg_sales_per_month': {1: 13.979452054794521,
    2: 13.53061224489796,
    3: 13.515333333333333,
    4: 13.603861517976032,
    5: 13.754950495049505,
    6: 13.868031854379977,
 

In [86]:
# Print the earliest and latest 'created_at' timestamps (still timezone-aware
# at this point, per the recorded output).
print(train_dataset.created_at.min())
print(train_dataset.created_at.max())

2019-01-11 13:05:59+00:00
2023-09-11 18:25:47.230646+00:00


In [94]:
# Parse 'created_at' as UTC-aware timestamps, then drop the timezone with
# tz_localize(None) so the column becomes timezone-naive.
train_dataset['created_at'] = pd.to_datetime(train_dataset['created_at'], utc=True).dt.tz_localize(None)

# Print the unique timestamps and the dtype to verify the conversion
# (datetime64[ns] in the recorded run).
print("Unique dates after conversion:", train_dataset['created_at'].unique())
print("Data type after conversion:", train_dataset['created_at'].dtype)



Unique dates after conversion: ['2019-01-11T13:05:59.000000000' '2019-01-17T07:00:33.000000000'
 '2019-01-18T13:39:38.000000000' ... '2023-09-11T17:30:37.049453000'
 '2023-09-11T18:23:00.836955000' '2023-09-11T18:25:47.230646000']
Data type after conversion: datetime64[ns]


In [110]:

from src.data.data_splitter import DataSplitter


# Split into train / validation / test feature and target sets with the project splitter.
# NOTE(review): the meaning of the positional 36, 6 and 4 is defined in
# DataSplitter and not visible here (presumably window lengths -- confirm).
# The recorded run put the train/val threshold at 2020-03-11 and the val/test
# threshold at 2023-08-14.
data_splitter = DataSplitter(train_dataset, 'sales','sku', 'created_at', 36, 6, 4)
X_train, y_train, X_val, y_val, X_test, y_test = data_splitter.split_data()

Debug: Data type of date column before processing: datetime64[ns]
Debug: Unique dates before processing: ['2019-01-11T13:05:59.000000000' '2019-01-17T07:00:33.000000000'
 '2019-01-18T13:39:38.000000000' ... '2023-09-11T17:30:37.049453000'
 '2023-09-11T18:23:00.836955000' '2023-09-11T18:25:47.230646000']
Number of SKUs in train: 4386
Number of SKUs in validation: 28576
Number of SKUs in test: 5317
Min date in dataset: 2019-01-11 13:05:59
Max date in dataset: 2023-09-11 18:25:47.230646
Train-Val threshold: 2020-03-11 18:25:47.230646
Val-Test threshold: 2023-08-14 18:25:47.230646
Training data covers from 2019-01-21 10:37:26 to 2020-03-11 15:46:38
Validation data covers from 2020-03-12 22:31:23 to 2023-08-14 07:48:06
Test data covers from 2023-08-14 18:56:53 to 2023-09-11 17:30:37.049453


In [111]:
# DataFrame.info() prints its report itself and returns None. Wrapping it in
# print() therefore emitted the report first and then the label followed by
# "None". Print the label, then call info() on its own line.
print('X_val info')
X_val.info()
print('X_train info')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1294 entries, 83409 to 78374
Columns: 238 entries, sku to month
dtypes: datetime64[ns](1), float32(221), float64(9), int64(6), object(1)
memory usage: 1.3+ MB
X_val info None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 48710 to 13669
Columns: 238 entries, sku to month
dtypes: datetime64[ns](1), float32(221), float64(9), int64(6), object(1)
memory usage: 548.1+ KB
X_train info None


In [114]:
# Preview the training features.
X_train.head()

Unnamed: 0,sku,status,sale_price,gender,num_of_item,cost,f_corr_-5.0_1.0,normalized_std_price_5_days,normalized_std_sales_5_days,normalized_std_price_10_days,normalized_std_sales_10_days,normalized_std_price_15_days,normalized_std_sales_15_days,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,created_at_day_of_the_year,year,month
48710,5421E013565F7F1AFA0CFE8AD87A99AB,1,46.990002,0,1,23.166071,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618097,0.21361,0.19446,-0.736318,-0.128905,1.207151,0.485925,-0.05088,-0.197749,-0.057977,0.582514,-0.033598,1.127005,0.732027,0.538784,0.508141,-0.818943,0.273989,-0.959761,-0.617215,-0.218731,-0.29149,0.037319,0.121658,1.289786,1.088777,0.154329,-0.983612,-1.703685,0.249646,1.382619,-1.325768,-1.374904,-2.372104,0.187922,1.496713,1970-01-01 00:00:00.000000000,1.201915,0.369966,-1.010207,0.301286,-0.050044,-0.06655,-0.162605,-0.553055,1.202927,0.839608,0.281543,-0.575626,-1.182218,-0.199428,2.012374,1.175463,-0.887381,0.863653,1.353978,0.94579,1.379825,-1.020125,-0.244657,0.716591,-0.214237,1.45572,0.539852,0.447123,-1.568236,-0.790396,-0.10847,0.330332,-0.019592,0.096214,-0.652981,0.612672,-1.184975,-0.038366,-0.363132,-0.445671,0.089539,-0.965762,-1.46511,0.821124,-0.233077,-0.980053,-1.907379,1.748868,-0.452622,0.306807,0.118778,0.293713,0.990599,-0.247371,0.651318,1.713531,0.526332,0.898758,0.943439,-0.714881,-1.414468,0.508031,2.038816,0.885909,2.014792,0.68026,-1.312942,1.203345,0.319201,-0.622913,-1.187147,-0.89109,1.049363,0.745267,-0.762985,-0.029543,0.247401,-0.996835,-0.300898,-0.65678,1.04354,-0.368252,-1.774425,-1.173386,-1.054245,-0.028838,0.482536,0.862234,0.385652,-2.345551,-3.109762,0.848012,-0.044475,0.160467,0.226374,0.891752,1.507178,1.777077,-0.616642,1.766401,-1.213559,-0.31952,1.01572,1.095911,0.915277,0.687072,0.608772,0.49681,-0.149331,-1.915155,0.136365,-0.785196,1.051046,-0.254127,-0.576281,0.420114,-1.041021,3.093266,-0.500465,-0.345949,0.397736,1.233427,-0.949307,1.26235,-1.515631,0.296013,0.038116,0.309216,-0.71766,0.394764,-0.322131,-0.197474,-0.178135,0.396525,1.689622,0.135936,2.463261,-0.288633,0.984552,0.557286,-1.179625,1.468262,0.125309,-0.693422,0.348455,0.969624,1.154254,-0.863502,0.782441,0.43693,1.423537,-0.21393,1.065327,0.028801,-0.215653,0.795972,-1.134716,0.506972,0.533417,1.566646,2.094918,-1.778114,-0.217691,1.015229,-0.84905
2,0.081957,-0.894382,-0.508788,-1.747872,0.571601,1.535555,0.923321,0.029534,0.515496,-0.366679,0.054093,0.237928,-0.630707,-1.18141,0.005756,-1.001396,-0.038317,0.241639,-0.263245,-1.197127,21,2019,1
111440,BF1B2F4B901C21A1D8645018EA9AEB05,1,21.99,0,1,11.74266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27828,0.016417,-0.460472,0.332568,-0.213493,-0.833412,-0.719302,-0.886362,0.262113,0.513147,0.610317,-0.405723,1.536282,-0.612963,1.484782,-0.877877,-0.880291,-0.69123,0.669534,1.423011,-1.32092,-0.596399,-0.10877,-1.363332,1.40003,0.5691,-0.610503,-0.393085,-0.249465,1.715142,-2.020521,-1.031594,-1.367871,-0.543092,-2.6776,-0.84599,1969-12-31 23:59:59.999999999,-0.471224,2.338659,0.060505,-0.694462,0.195182,-0.707157,0.360006,0.232534,-0.527404,1.643142,-1.105963,0.753707,-1.051327,1.549529,0.757793,1.244341,-0.697667,-1.268019,1.18442,-0.398877,-0.739393,-0.278463,1.003109,-1.019797,-0.535316,0.255902,-0.43859,-0.578406,0.377727,0.647866,-1.06476,1.917616,-0.009729,-0.234132,-0.314626,3.518685,-0.431947,-0.722694,-1.18071,0.225187,-0.751375,0.191631,-0.642732,1.257587,-0.487521,1.983962,0.382809,-0.256377,0.724278,0.828387,-0.215339,-0.150473,0.86448,0.654002,-1.364736,1.123864,-0.520771,0.319441,-0.321732,1.042233,-1.584668,1.116114,0.999161,0.350538,1.122899,0.503661,1.346976,1.301403,0.453051,0.454387,-0.270166,-0.5071,1.632334,-0.317886,0.259777,0.738327,-2.566289,-0.474713,0.214358,1.352113,0.962129,0.142632,1.601749,-0.129694,1.188415,0.686035,0.596572,0.031881,-0.129979,-0.95647,0.444692,0.077683,0.79462,0.024088,1.002792,0.007868,0.459993,-0.62738,0.960245,-1.846775,0.10726,-0.653045,-0.212616,0.938377,0.57295,0.39061,0.399934,-0.805998,-0.236079,-1.241394,-0.943666,0.023865,-1.390856,-1.704194,-0.223222,0.30761,0.623304,1.105473,0.017237,0.210728,-0.060681,-1.171243,0.835542,0.960193,-0.77378,1.293464,-2.451555,1.03541,0.459067,0.27205,-0.849619,0.184171,0.763115,-0.18342,-0.26387,1.433598,0.089215,1.399363,0.157339,1.710036,-0.74618,2.441929,0.592574,1.331167,0.283946,-0.349899,1.751136,-0.853384,-0.539677,0.408415,-2.613026,-0.101007,-1.3988,0.247971,0.534216,-0.440259,0.774072,0.356605,1.043615,-1.14437,-0.622975,0.627671,-0.893805,-1.3306,1.253306,-0.302694,-0.0
25382,-0.261907,-0.677118,-1.292501,0.124883,-0.291863,1.365641,0.525903,0.536398,-0.828403,1.984043,0.372194,-0.930642,-1.484335,1.717415,-0.14319,0.171449,-0.11768,0.902497,29,2019,1
33532,391F63B419F364635C25479CC36C4D0A,1,29.99,0,1,15.98467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115487,-0.196998,1.195093,0.787168,-0.479353,0.995957,-0.448979,0.048292,1.234985,1.054082,1.301146,0.396409,-0.164643,1.268481,1.452433,-1.205705,0.491717,0.074162,0.816737,0.281564,-0.630703,0.065228,0.160732,0.702531,-1.031765,0.087141,1.152605,0.086701,1.155406,1.499308,-0.323956,0.339716,0.807414,-0.532576,0.231565,0.39429,1970-01-01 00:00:00.000000000,-0.084035,0.517923,-0.063126,-0.43617,-0.333423,1.404258,-0.225058,0.375283,-0.225549,0.935865,0.67139,-0.292576,-1.366748,1.415452,0.130599,1.416581,-2.338975,-0.423598,-1.359923,0.346353,-0.083784,-1.009702,-0.028048,-0.513407,-0.902785,-1.323051,-0.009522,0.86383,-0.376907,0.625272,0.688566,1.851161,0.775809,-0.930668,-0.257458,-1.171042,0.01929,0.697102,-0.320253,0.757211,-0.094036,1.152987,0.191951,-1.563408,0.463738,0.594206,-1.095769,1.073238,-0.848058,1.265161,0.329678,-0.555542,-1.245393,4.179773,0.390507,0.835309,0.364547,2.009103,-0.302782,0.112025,1.209006,-0.230094,0.68257,-0.023561,-0.251192,0.837492,0.653247,1.902912,-1.276453,0.606224,0.583068,-1.154166,1.616972,0.512166,-0.264303,-0.643528,0.293913,0.1211,0.290837,-1.890865,0.421724,1.317313,1.611343,0.009761,0.918397,-0.853408,1.047667,-0.226151,-1.098457,0.316326,-1.102091,0.692975,-1.130916,-1.316697,0.109154,0.520942,0.751329,-0.351057,0.336582,0.636944,1.076955,-0.642002,1.829002,0.017759,-0.137356,1.962695,-0.460971,-0.330293,-0.093946,-0.792841,0.239334,-1.475164,-0.650715,0.470307,-0.727898,-1.161629,-0.306826,0.599024,0.669592,0.252707,-1.417552,-0.277424,-0.255455,0.120034,-0.104329,0.776494,0.95476,1.110929,0.673614,0.105717,-1.592442,-0.918858,-0.323032,1.087372,1.115114,0.895136,0.295184,-0.086501,-0.038536,0.704503,-1.38755,1.093072,0.28777,0.75312,0.777256,0.483339,-0.014662,1.147608,0.368412,2.075485,-0.651472,0.918263,1.081081,-1.081422,0.542116,-1.367349,0.832113,0.508947,0.637119,0.485371,-0.74435,-0.266686,1.340239,-0.339106,0.547474,-0.3
66271,2.187562,-0.19652,-0.146425,0.846854,0.112231,0.051312,-0.229535,-0.723631,-1.824379,-0.702298,-1.719063,0.506876,-0.289567,0.723699,-0.517629,-0.356421,-0.056799,0.998218,0.455227,32,2019,2
137635,ECC92A19F0DE821519B715D10CBF7C62,1,21.99,1,1,8.29023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.684866,-0.599629,0.307981,-1.133241,0.082706,2.024479,-0.104129,0.001773,1.288263,-0.546797,0.749705,3.109177,-0.11945,2.033079,0.796545,-0.468542,-1.284127,-0.677343,2.262892,0.426692,0.060821,-0.191495,0.441079,0.664492,-0.511169,1.021261,0.052835,-0.592934,0.293322,1.077823,-0.440262,0.780482,0.248446,0.901385,-0.615399,-0.24041,1970-01-01 00:00:00.000000000,0.285634,0.133858,1.503282,0.772424,-0.501677,0.100074,1.965985,0.726022,-0.110055,0.304375,0.310763,-0.609177,-1.03567,-1.459504,0.689618,1.306212,1.837016,-2.013314,1.309956,-0.24306,-0.19548,-0.89176,-0.456069,-1.002717,-2.057149,-0.39097,-0.059263,-1.608588,-0.845477,0.126162,1.022122,1.493693,-0.106236,1.240874,2.069761,-0.266097,-0.524218,-1.179879,0.630108,-0.878781,-0.817443,-0.602308,-1.020969,-0.740378,1.641141,-1.255095,0.998494,0.086988,-0.488031,0.376367,-1.016361,0.723184,-0.738649,0.08221,-0.551677,-1.252086,0.206378,-1.163242,-0.412774,0.086303,0.917899,-1.186939,0.109538,0.772333,-0.209432,0.110825,1.251495,1.308746,-0.336564,0.511048,-0.072089,-0.556419,-0.662616,-0.411477,-0.890905,0.898127,-0.538432,0.498596,-0.571522,0.987302,1.84154,-0.415655,-0.238813,-0.50483,-1.696186,0.399698,1.144054,-1.627343,-0.335626,1.346666,0.320182,-0.23311,1.174601,-0.454073,0.009131,1.857369,0.139499,-0.31115,-0.310939,-0.422021,1.081478,0.312661,-1.847169,-1.012452,-0.364614,1.459841,-0.607443,-0.589012,0.417543,1.23922,-0.072534,1.002056,0.24292,-1.803701,0.912695,-0.848448,-0.907741,0.590729,-1.287363,0.337908,1.186833,-1.432721,1.059924,-0.452611,0.381308,-0.423271,0.790957,0.675334,1.224747,-0.777164,-0.835481,-1.127076,0.544197,0.056778,0.638356,0.138404,-2.276854,-0.27593,0.872847,-0.856768,0.363716,1.059364,0.04725,-0.679431,-0.320738,0.117745,1.487585,0.764,0.056244,0.912553,-0.574233,1.236521,-0.845649,-0.901311,0.131005,0.058515,-0.48091,-0.302423,-1.058355,-0.295921,0.784509,0.320983,-0.066574,0.608834,-1.40
5478,1.38203,0.134458,-1.676155,-1.303766,0.677606,0.785338,1.27958,-0.434869,1.017412,-3.093305,-0.284652,2.128885,-1.152616,-0.747064,-0.586299,1.801888,0.126198,-0.876269,1.263922,0.728654,33,2019,2
71106,7A66484D6D916DCF62D300D65A8F003C,1,75.0,1,2,28.425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.684866,-0.599629,0.307981,-1.133241,0.082706,1.526399,2.570804,0.678041,1.826341,0.524813,-0.500121,-1.194333,0.684884,1.616221,-2.104103,1.213315,0.64509,0.280714,1.409696,-0.946691,-0.56099,-1.013067,1.04183,-0.134811,-0.259832,0.626856,-0.512287,0.543561,1.692819,0.408575,-0.994246,-0.659889,0.00368,0.366367,-0.642176,0.621417,1970-01-01 00:00:00.000000000,0.134962,-0.266057,1.434485,-1.016605,-1.31379,-0.016314,-0.260872,0.442775,-0.27934,-0.826949,-0.709058,0.612499,-0.835128,-0.203948,0.900066,2.192527,0.803569,-0.057678,1.97693,1.940316,1.19637,0.008292,-1.50121,1.01247,-0.949623,0.004332,-0.407474,0.744397,1.63567,0.140194,-0.441636,0.565077,-0.12388,-0.714263,-0.474701,-0.376177,-0.187988,-1.105978,0.648713,0.251898,-2.047055,-1.08451,1.503377,1.146151,1.14087,0.85232,-2.701654,-1.10802,0.723802,0.382818,0.116315,1.147028,-0.682986,-0.049551,1.206111,2.99285,-0.50788,0.86319,-0.599646,-0.532293,0.275722,0.429066,-0.293652,-0.214246,-0.246633,-1.119872,-1.03343,0.98067,0.705202,0.229692,1.621667,0.389174,-0.403523,-0.952038,0.25636,0.192101,-3.083372,0.077714,-1.899159,0.625287,0.001528,1.055301,-0.665623,-0.727471,0.38977,0.755636,1.149925,-0.154749,-0.13831,0.343314,1.765655,1.428099,1.149647,-1.222629,0.551292,0.278259,0.898442,-0.108853,1.936718,2.259066,0.054265,-0.191951,0.431942,-1.4784,1.803171,2.874109,-0.92433,2.407463,-0.202876,-0.036388,-0.012477,0.912659,-1.205831,1.620699,0.668946,0.568808,0.777292,0.903951,2.04143,-0.552687,0.867336,0.12853,0.641406,-0.696393,-0.244131,-0.630209,1.216925,-0.146843,0.2971,0.357595,0.90229,-0.851341,-0.006189,1.516459,0.048708,1.284819,-0.469956,-0.448388,-0.256854,-0.184573,-0.335541,-2.158028,-0.106914,1.093212,-1.409999,-0.290796,0.703981,0.898222,-1.179895,-0.22093,0.703445,0.71055,1.932158,0.518492,0.15642,-0.408324,1.170099,-0.195317,-1.440999,1.214041,0.659061,0.774127,0.080391,-0.062985,-0.324606,0.290562,0.113426,0.05
2004,-1.119221,-0.423102,0.221371,-0.711644,1.630591,-0.352747,0.893363,-1.156087,-1.369835,-0.705095,0.398076,0.479165,1.659986,-0.693061,0.434434,-0.351947,-1.247702,39,2019,2


In [115]:
# Drop 'sku', 'year' and 'month' from X_train and X_val, in place.
# 'year' and 'month' are the helper columns added by describe_time_series.
X_train.drop(columns=['sku', 'year', 'month'], inplace=True)
X_val.drop(columns=['sku', 'year', 'month'], inplace=True)

# X_test keeps 'sku': a later cell passes the column name 'sku' together with
# X_test to predict_demand_curve.
X_test.drop(columns=['year', 'month'], inplace=True)

In [121]:
# Drop the column labelled with the integer 36 from all three feature frames, in place.
# NOTE(review): the recorded X_train.head() shows a 1970-01-01 timestamp among
# the embedding values and X_train.info() lists one datetime64[ns] column;
# presumably that is column 36 -- confirm why it was coerced to datetime
# upstream instead of only dropping it here.
X_train.drop(columns=[36], inplace=True)
X_val.drop(columns=[36], inplace=True)
X_test.drop(columns=[36], inplace=True)


In [126]:
from src.train.quantile_training import DemandCurveTrainer

# Build the trainer from the train/validation splits with parameter tuning
# requested (tune_params=True), then fit and evaluate the models.
trainer = DemandCurveTrainer(
    X_train,
    y_train,
    X_val,
    y_val,
    tune_params=True,
)
trainer.train_models()
trainer.evaluate_models()
# Feature-importance plot intentionally left disabled:
#trainer.plot_feature_importance()

# Predict the demand curves on the held-out frame. 'sku' and 'sale_price' are
# presumably the names of the SKU and price columns in X_test -- confirm
# against DemandCurveTrainer.predict_demand_curve.
demand_curve_outputs = trainer.predict_demand_curve(X_test, 'sku', 'sale_price')
sku_demand_curves, demand_curve_pricing, price_points_per_sku = demand_curve_outputs


[I 2023-09-11 18:53:15,461] A new study created in memory with name: no-name-03ebe45f-6589-4bbd-82f5-6c0be9e54914




[I 2023-09-11 18:53:15,804] Trial 0 finished with value: 0.7046730446906897 and parameters: {'max_depth': 5, 'min_data_in_leaf': 9, 'subsample': 0.88547910669042, 'n_estimators': 58, 'learning_rate': 0.15395121530210276, 'colsample_bytree': 0.6470038017577991}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:16,223] Trial 1 finished with value: 9.258408050989638 and parameters: {'max_depth': 7, 'min_data_in_leaf': 14, 'subsample': 0.7738623467267365, 'n_estimators': 78, 'learning_rate': 0.012354244383720132, 'colsample_bytree': 0.6848661167316181}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:16,692] Trial 2 finished with value: 0.785043082517139 and parameters: {'max_depth': 8, 'min_data_in_leaf': 11, 'subsample': 0.8674942037276714, 'n_estimators': 94, 'learning_rate': 0.13292942984064707, 'colsample_bytree': 0.6230176978987356}. Best is trial 0 with value: 0.7046730446906897.
[I 2023-09-11 18:53:16,897] Trial 3 finished with value: 2.696783816464004 and parameters: {'max_depth': 3, 'min_data_in_leaf': 18, 'subsample': 0.8223375638190216, 'n_estimators': 84, 'learning_rate': 0.122296290774111, 'colsample_bytree': 0.983477814809281}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:17,627] Trial 4 finished with value: 0.8741384593706361 and parameters: {'max_depth': 8, 'min_data_in_leaf': 5, 'subsample': 0.7662942351371912, 'n_estimators': 80, 'learning_rate': 0.08874461178467698, 'colsample_bytree': 0.9845148100690033}. Best is trial 0 with value: 0.7046730446906897.
[I 2023-09-11 18:53:17,752] Trial 5 finished with value: 8.939002323352383 and parameters: {'max_depth': 3, 'min_data_in_leaf': 8, 'subsample': 0.9291870795324148, 'n_estimators': 56, 'learning_rate': 0.037688417481956636, 'colsample_bytree': 0.6799357235243382}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:18,138] Trial 6 finished with value: 1.2971347472853296 and parameters: {'max_depth': 10, 'min_data_in_leaf': 13, 'subsample': 0.815187667969307, 'n_estimators': 60, 'learning_rate': 0.170684168276661, 'colsample_bytree': 0.6112087590017148}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:18,836] Trial 7 finished with value: 1.5834506828184554 and parameters: {'max_depth': 10, 'min_data_in_leaf': 9, 'subsample': 0.7377932835117873, 'n_estimators': 74, 'learning_rate': 0.07007864514345166, 'colsample_bytree': 0.9854491656191469}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:19,335] Trial 8 finished with value: 3.7941045537663656 and parameters: {'max_depth': 6, 'min_data_in_leaf': 8, 'subsample': 0.6020176956177268, 'n_estimators': 86, 'learning_rate': 0.03305053353476536, 'colsample_bytree': 0.8452828249647417}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:19,886] Trial 9 finished with value: 0.7105744271955658 and parameters: {'max_depth': 9, 'min_data_in_leaf': 17, 'subsample': 0.7379385369377294, 'n_estimators': 94, 'learning_rate': 0.1593724608737781, 'colsample_bytree': 0.6332978890696518}. Best is trial 0 with value: 0.7046730446906897.




[I 2023-09-11 18:53:20,521] Trial 10 finished with value: 0.49826289292691256 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9884245548487779, 'n_estimators': 66, 'learning_rate': 0.19217059734625117, 'colsample_bytree': 0.7513265558508014}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:20,970] Trial 11 finished with value: 0.5531982460753712 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9653633919021373, 'n_estimators': 66, 'learning_rate': 0.1990206812357619, 'colsample_bytree': 0.751330702819809}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:21,423] Trial 12 finished with value: 0.5094471200321751 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9590163992412857, 'n_estimators': 67, 'learning_rate': 0.19678068522405381, 'colsample_bytree': 0.7544530712140735}. Best is trial 10 with value: 0.49826289292691256.
[I 2023-09-11 18:53:21,567] Trial 13 finished with value: 3.511264725380804 and parameters: {'max_depth': 2, 'min_data_in_leaf': 5, 'subsample': 0.9778923490785669, 'n_estimators': 68, 'learning_rate': 0.19650294814632785, 'colsample_bytree': 0.7698390558839016}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:21,993] Trial 14 finished with value: 0.7489429560319719 and parameters: {'max_depth': 5, 'min_data_in_leaf': 7, 'subsample': 0.989821970924507, 'n_estimators': 51, 'learning_rate': 0.1819477108751464, 'colsample_bytree': 0.8291370131411221}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:22,319] Trial 15 finished with value: 0.8717694476034636 and parameters: {'max_depth': 4, 'min_data_in_leaf': 11, 'subsample': 0.931300393159178, 'n_estimators': 69, 'learning_rate': 0.1464373008116906, 'colsample_bytree': 0.7305460672460949}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:22,686] Trial 16 finished with value: 1.2180562697444728 and parameters: {'max_depth': 6, 'min_data_in_leaf': 20, 'subsample': 0.9823492740767673, 'n_estimators': 63, 'learning_rate': 0.18109140682314323, 'colsample_bytree': 0.7980449726618166}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:23,024] Trial 17 finished with value: 0.711962499551221 and parameters: {'max_depth': 4, 'min_data_in_leaf': 7, 'subsample': 0.9326278736625546, 'n_estimators': 72, 'learning_rate': 0.1719739627749601, 'colsample_bytree': 0.7158695580102656}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:23,399] Trial 18 finished with value: 1.6981744007125956 and parameters: {'max_depth': 7, 'min_data_in_leaf': 15, 'subsample': 0.9966523527615493, 'n_estimators': 52, 'learning_rate': 0.11654936478181793, 'colsample_bytree': 0.7852004935127453}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:23,798] Trial 19 finished with value: 0.6501275970582985 and parameters: {'max_depth': 4, 'min_data_in_leaf': 11, 'subsample': 0.8986525173744554, 'n_estimators': 100, 'learning_rate': 0.13683017879578915, 'colsample_bytree': 0.8379549519420111}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:24,239] Trial 20 finished with value: 0.7747580840636995 and parameters: {'max_depth': 6, 'min_data_in_leaf': 6, 'subsample': 0.937222756255637, 'n_estimators': 63, 'learning_rate': 0.16301242456818282, 'colsample_bytree': 0.7118550090009972}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:24,787] Trial 21 finished with value: 0.5303439683471354 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9596562894539087, 'n_estimators': 65, 'learning_rate': 0.19249825405068136, 'colsample_bytree': 0.7496445931646334}. Best is trial 10 with value: 0.49826289292691256.




[I 2023-09-11 18:53:25,194] Trial 22 finished with value: 0.45746543091994435 and parameters: {'max_depth': 5, 'min_data_in_leaf': 6, 'subsample': 0.960147873663129, 'n_estimators': 71, 'learning_rate': 0.19964580797000797, 'colsample_bytree': 0.7471880672828386}. Best is trial 22 with value: 0.45746543091994435.
[I 2023-09-11 18:53:25,399] Trial 23 finished with value: 1.2647703285416743 and parameters: {'max_depth': 3, 'min_data_in_leaf': 7, 'subsample': 0.9998815689797779, 'n_estimators': 71, 'learning_rate': 0.1834661643573286, 'colsample_bytree': 0.7733960204679059}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:25,675] Trial 24 finished with value: 0.531492005708861 and parameters: {'max_depth': 4, 'min_data_in_leaf': 9, 'subsample': 0.9477715405996772, 'n_estimators': 80, 'learning_rate': 0.19688418193548884, 'colsample_bytree': 0.6866624514995406}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:26,205] Trial 25 finished with value: 0.7774635653223418 and parameters: {'max_depth': 7, 'min_data_in_leaf': 6, 'subsample': 0.9051896719850562, 'n_estimators': 74, 'learning_rate': 0.17224371749624684, 'colsample_bytree': 0.8009118323235992}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:26,546] Trial 26 finished with value: 0.7156465366765994 and parameters: {'max_depth': 5, 'min_data_in_leaf': 6, 'subsample': 0.9611791064360197, 'n_estimators': 60, 'learning_rate': 0.1999380258640745, 'colsample_bytree': 0.7322627835765662}. Best is trial 22 with value: 0.45746543091994435.
[I 2023-09-11 18:53:26,692] Trial 27 finished with value: 4.514593920676442 and parameters: {'max_depth': 2, 'min_data_in_leaf': 10, 'subsample': 0.8645396129704318, 'n_estimators': 77, 'learning_rate': 0.15188561406113654, 'colsample_bytree': 0.6643426426504895}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:27,063] Trial 28 finished with value: 0.7748954497313065 and parameters: {'max_depth': 6, 'min_data_in_leaf': 7, 'subsample': 0.9125767379353618, 'n_estimators': 55, 'learning_rate': 0.1831904598910945, 'colsample_bytree': 0.7040967409457116}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:27,394] Trial 29 finished with value: 0.5787765650846737 and parameters: {'max_depth': 5, 'min_data_in_leaf': 8, 'subsample': 0.9602680189348624, 'n_estimators': 69, 'learning_rate': 0.16139172344065175, 'colsample_bytree': 0.6451169608780121}. Best is trial 22 with value: 0.45746543091994435.
[I 2023-09-11 18:53:27,624] Trial 30 finished with value: 0.9432365031750198 and parameters: {'max_depth': 4, 'min_data_in_leaf': 6, 'subsample': 0.9993721523574318, 'n_estimators': 61, 'learning_rate': 0.1463593898869231, 'colsample_bytree': 0.6009001951643416}. Best is trial 22 with value: 0.45746543091994435.




[I 2023-09-11 18:53:28,018] Trial 31 finished with value: 0.4515444125065049 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9586914738810638, 'n_estimators': 65, 'learning_rate': 0.18756526866756335, 'colsample_bytree': 0.7523644325460804}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:28,515] Trial 32 finished with value: 1.0222563267221914 and parameters: {'max_depth': 7, 'min_data_in_leaf': 5, 'subsample': 0.9681088612048678, 'n_estimators': 65, 'learning_rate': 0.18670413217429024, 'colsample_bytree': 0.7501391487393292}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:29,106] Trial 33 finished with value: 0.7366346538067262 and parameters: {'max_depth': 6, 'min_data_in_leaf': 6, 'subsample': 0.8840438593199372, 'n_estimators': 71, 'learning_rate': 0.17639115547258327, 'colsample_bytree': 0.7352048813307699}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:29,403] Trial 34 finished with value: 0.48625423732995793 and parameters: {'max_depth': 5, 'min_data_in_leaf': 9, 'subsample': 0.9413687673563185, 'n_estimators': 57, 'learning_rate': 0.1887961775180651, 'colsample_bytree': 0.6969808992785202}. Best is trial 31 with value: 0.4515444125065049.
[I 2023-09-11 18:53:29,622] Trial 35 finished with value: 1.0836481996373077 and parameters: {'max_depth': 4, 'min_data_in_leaf': 10, 'subsample': 0.9210923195356399, 'n_estimators': 56, 'learning_rate': 0.16679628195491508, 'colsample_bytree': 0.6761426290106389}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:29,792] Trial 36 finished with value: 2.2139545682621353 and parameters: {'max_depth': 3, 'min_data_in_leaf': 9, 'subsample': 0.9372058383778454, 'n_estimators': 57, 'learning_rate': 0.18483824025301568, 'colsample_bytree': 0.7001608198808253}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:30,225] Trial 37 finished with value: 1.0595919389991668 and parameters: {'max_depth': 8, 'min_data_in_leaf': 8, 'subsample': 0.979744492520637, 'n_estimators': 53, 'learning_rate': 0.1557711360678977, 'colsample_bytree': 0.6470201066720577}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:30,546] Trial 38 finished with value: 0.9129340485966158 and parameters: {'max_depth': 6, 'min_data_in_leaf': 13, 'subsample': 0.8598909186968391, 'n_estimators': 59, 'learning_rate': 0.17411375142818677, 'colsample_bytree': 0.7140536891070497}. Best is trial 31 with value: 0.4515444125065049.
[I 2023-09-11 18:53:30,730] Trial 39 finished with value: 1.659871033878923 and parameters: {'max_depth': 3, 'min_data_in_leaf': 10, 'subsample': 0.9436148000491363, 'n_estimators': 62, 'learning_rate': 0.19057272626261507, 'colsample_bytree': 0.6908367288615702}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:31,122] Trial 40 finished with value: 0.6239278282816316 and parameters: {'max_depth': 5, 'min_data_in_leaf': 7, 'subsample': 0.8937338127803556, 'n_estimators': 84, 'learning_rate': 0.16348984673200273, 'colsample_bytree': 0.6684163111130043}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:31,533] Trial 41 finished with value: 0.4571554835739989 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.951016060759242, 'n_estimators': 68, 'learning_rate': 0.1900241795561501, 'colsample_bytree': 0.7622539037701317}. Best is trial 31 with value: 0.4515444125065049.




[I 2023-09-11 18:53:31,936] Trial 42 finished with value: 0.43660756312979615 and parameters: {'max_depth': 5, 'min_data_in_leaf': 6, 'subsample': 0.9176210318243924, 'n_estimators': 78, 'learning_rate': 0.19104558017236822, 'colsample_bytree': 0.7297524530628288}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:32,312] Trial 43 finished with value: 0.531652354369364 and parameters: {'max_depth': 5, 'min_data_in_leaf': 8, 'subsample': 0.9196492051045438, 'n_estimators': 76, 'learning_rate': 0.17511755330881004, 'colsample_bytree': 0.7253482808435062}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:32,949] Trial 44 finished with value: 0.5979821350883351 and parameters: {'max_depth': 6, 'min_data_in_leaf': 6, 'subsample': 0.9490912729922009, 'n_estimators': 79, 'learning_rate': 0.18820392026217012, 'colsample_bytree': 0.6978077626290186}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:33,421] Trial 45 finished with value: 1.0910756862357895 and parameters: {'max_depth': 7, 'min_data_in_leaf': 9, 'subsample': 0.9143929118509667, 'n_estimators': 82, 'learning_rate': 0.19979714712256255, 'colsample_bytree': 0.7768339968914397}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:33,694] Trial 46 finished with value: 0.589895365862395 and parameters: {'max_depth': 4, 'min_data_in_leaf': 7, 'subsample': 0.8796249633089881, 'n_estimators': 75, 'learning_rate': 0.1786948586106031, 'colsample_bytree': 0.7406350580779181}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:33,988] Trial 47 finished with value: 0.6931585097008709 and parameters: {'max_depth': 4, 'min_data_in_leaf': 12, 'subsample': 0.9695676858719591, 'n_estimators': 73, 'learning_rate': 0.16766485496960465, 'colsample_bytree': 0.7625282669315396}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:34,471] Trial 48 finished with value: 0.4932330997907338 and parameters: {'max_depth': 5, 'min_data_in_leaf': 5, 'subsample': 0.9447739946268753, 'n_estimators': 89, 'learning_rate': 0.18919308556045694, 'colsample_bytree': 0.7249842733097855}. Best is trial 42 with value: 0.43660756312979615.




[I 2023-09-11 18:53:34,889] Trial 49 finished with value: 0.6869216366887604 and parameters: {'max_depth': 6, 'min_data_in_leaf': 8, 'subsample': 0.8523345105518135, 'n_estimators': 70, 'learning_rate': 0.1911622130765715, 'colsample_bytree': 0.8111344472912768}. Best is trial 42 with value: 0.43660756312979615.


Fitting model for quantile 0.1
[1]	valid_0's quantile: 0.519981
[2]	valid_0's quantile: 0.509422
[3]	valid_0's quantile: 0.499093
[4]	valid_0's quantile: 0.492477
[5]	valid_0's quantile: 0.484207
[6]	valid_0's quantile: 0.480057
[7]	valid_0's quantile: 0.473872
[8]	valid_0's quantile: 0.467611
[9]	valid_0's quantile: 0.463298
[10]	valid_0's quantile: 0.458618
[11]	valid_0's quantile: 0.452573
[12]	valid_0's quantile: 0.449456
[13]	valid_0's quantile: 0.445496
[14]	valid_0's quantile: 0.442669
[15]	valid_0's quantile: 0.438586
[16]	valid_0's quantile: 0.436881
[17]	valid_0's quantile: 0.43393
[18]	valid_0's quantile: 0.431137
[19]	valid_0's quantile: 0.428422
[20]	valid_0's quantile: 0.425812
[21]	valid_0's quantile: 0.423952
[22]	valid_0's quantile: 0.422991
[23]	valid_0's quantile: 0.420801
[24]	valid_0's quantile: 0.419148
[25]	valid_0's quantile: 0.418232
[26]	valid_0's quantile: 0.417358
[27]	valid_0's quantile: 0.415713
[28]	valid_0's quantile: 0.414729
[29]	valid_0's quantile: 0.