## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import datetime
import itertools
from collections import defaultdict

import math
import numpy as np
import pandas as pd
from scipy.stats import shapiro
import scipy.stats as stats
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from statsmodels.graphics.tsaplots import plot_pacf

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit, KFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_log_error, make_scorer
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from sklearn.compose import make_column_selector as selector

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNetCV, ElasticNet
from xgboost import XGBRegressor
import catboost as cb
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objs as go
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Configurations

In [2]:
# Notebook-wide pandas display settings: never truncate the column list of
# wide frames, and render every float with two decimal places.
pd.options.display.max_columns = None
pd.set_option('display.float_format', '{:.2f}'.format)

## Load Data

In [3]:
dir_path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/'

# Read the three competition tables (articles, customers, transactions) from
# the Kaggle input directory, in that order, one frame per CSV file.
artcls_df, cust_df, trnsctns_df = (
    pd.read_csv(f'{dir_path}{csv_name}')
    for csv_name in ('articles.csv', 'customers.csv', 'transactions_train.csv')
)

In [4]:
# Preview the first rows of the articles table to see which columns it carries.
artcls_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [5]:
# Preview the first rows of the customers table; note the NaNs in FN / Active.
cust_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [6]:
# Preview the first rows of the transactions table (one row per purchased article).
trnsctns_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.05,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.03,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.02,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.02,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.02,2


## Data Pre-processing

In [7]:
# Report the first and last transaction dates. 't_dat' has not been passed
# through pd.to_datetime yet at this point in the notebook.
earliest = trnsctns_df["t_dat"].min()
latest = trnsctns_df["t_dat"].max()
print(f'Minimum date in transaction dataset: {earliest}')
print(f'Maximum date in transaction dataset: {latest}')

Minimum date in transaction dataset: 2018-09-20
Maximum date in transaction dataset: 2020-09-22


#### Handle Missing Values

In [8]:
# Replacement value for missing entries, per customer column.
# - FN / Active: only 1.0 or NaN appear in the preview above, so NaN is
#   mapped to 0 (presumably "flag not set" -- confirm against the data card).
# - club_member_status: missing status gets an explicit 'NOT A MEMBER' label.
# - fashion_news_frequency: missing values reuse the column's existing 'NONE'
#   level. The original cell filled 'Active' a second time here, which was a
#   no-op (it had already been filled with 0) and left this column's NaNs in
#   place.
# - age: missing ages are imputed with the median age.
fill_values = {
    'FN': 0,
    'Active': 0,
    'club_member_status': 'NOT A MEMBER',
    'fashion_news_frequency': 'NONE',
    'age': cust_df['age'].median(),
}

# Columns not listed in the dict (customer_id, postal_code) are left untouched.
cust_df = cust_df.fillna(value=fill_values)

#### Convert Data Types

In [9]:
# Cast sales_channel_id to pandas' category dtype. This runs before the merge
# in the Data Preparation section, which is built from trnsctns_df.
trnsctns_df['sales_channel_id'] = trnsctns_df['sales_channel_id'].astype('category')

## Data Preparation

In [10]:
# Build the master frame: every transaction row, enriched with the article
# attributes (minus the free-text description) and the customer attributes
# (minus the postal code). Left joins keep all transactions even if a key has
# no match on the right-hand side.
mergd_df = (
    trnsctns_df
    .merge(artcls_df.drop(columns='detail_desc'), on='article_id', how='left')
    .merge(cust_df.drop(columns='postal_code'), on='customer_id', how='left')
)

mergd_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,FN,Active,club_member_status,fashion_news_frequency,age
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.05,2,663713,Atlanta Push Body Harlow,283,Underwear body,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1338,Expressive Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",0.0,0.0,ACTIVE,NONE,24.0
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.03,2,541518,Rae Push (Melbourne) 2p,306,Bra,Underwear,1010016,Solid,51,Light Pink,1,Dusty Light,4,Pink,1334,Casual Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear",0.0,0.0,ACTIVE,NONE,24.0
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.02,2,505221,Inca Jumper,252,Sweater,Garment Upper body,1010010,Melange,52,Pink,2,Medium Dusty,4,Pink,5963,Tops Knitwear DS,D,Divided,2,Divided,58,Divided Selected,1003,Knitwear,1.0,1.0,ACTIVE,Regularly,32.0
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.02,2,685687,W YODA KNIT OL OFFER,252,Sweater,Garment Upper body,1010010,Melange,52,Pink,7,Medium,4,Pink,3090,Campaigns,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1023,Special Offers,1.0,1.0,ACTIVE,Regularly,32.0
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.02,2,685687,W YODA KNIT OL OFFER,252,Sweater,Garment Upper body,1010010,Melange,93,Dark Green,4,Dark,19,Green,3090,Campaigns,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1023,Special Offers,1.0,1.0,ACTIVE,Regularly,32.0


In [11]:
# Print (rows, columns) for each source table and for the merged frame; the
# merged frame should have exactly as many rows as the transactions table.
for label, frame in (
    ("articles", artcls_df),
    ("customers", cust_df),
    ("transactions", trnsctns_df),
    ("master dataset", mergd_df),
):
    print(f"{label}: {frame.shape}")

articles: (105542, 25)
customers: (1371980, 7)
transactions: (31788324, 5)
master dataset: (31788324, 33)


In [12]:
# Remove the name bound to the raw transactions frame; the cells below work
# from mergd_df instead. Re-running this cell (or any earlier cell that reads
# trnsctns_df) without reloading the data raises NameError.
del trnsctns_df

## More Data Pre-processing

#### Convert the 't_dat' column to datetime

In [13]:
# Parse the 'YYYY-MM-DD' date values into pandas datetimes so the column can be
# compared against pd.date_range(...) in the cells below.
mergd_df['t_dat'] = pd.to_datetime(mergd_df['t_dat'], format = "%Y-%m-%d")

#### Check if any date is missing in the merged dataset during the reporting period

In [14]:
# First and last day covered by the merged data (reused by later cells).
strt_dt, end_dt = mergd_df['t_dat'].min(), mergd_df['t_dat'].max()

# Every calendar day in that span, minus the days that actually occur in the
# data; an empty result means there is no gap in the daily series.
dt_rnge = pd.date_range(strt_dt, end_dt)
missing_dts = dt_rnge.difference(mergd_df['t_dat'])

print(f"Missing dates in merged dataset: {missing_dts}")

Missing dates in merged dataset: DatetimeIndex([], dtype='datetime64[ns]', freq=None)


#### Pick the top-5 articles that had the highest sales during the reporting period and forecast sales

In [20]:
# Per-article totals over the whole period, ordered by units sold:
#   quantity = number of transaction rows for the article
#   sales    = sum of 'price' over those rows
# Despite its name, this frame holds one row for EVERY article; only the
# .head() call below limits the display to the first five.
top5_article_sales = (
    mergd_df
    .groupby('article_id')
    .agg(quantity=('price', 'count'), sales=('price', 'sum'))
    .sort_values('quantity', ascending=False)
)

top5_article_sales.head()

Unnamed: 0_level_0,quantity,sales
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
706016001,50287,1631.73
706016002,35043,1136.32
372860001,31718,411.0
610776002,30199,244.1
759871002,26329,147.58


In [21]:
# Re-rank the same per-article totals by revenue (sum of price) instead of
# units sold. The result is only displayed; top5_article_sales itself keeps
# its quantity ordering.
top5_article_sales.sort_values(by='sales', ascending=False).head()

Unnamed: 0_level_0,quantity,sales
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1
706016001,50287,1631.73
706016002,35043,1136.32
568601006,19379,939.27
448509014,19216,781.48
706016003,21241,692.2


#### Check the metadata of top-5 articles that reported the highest sales

In [24]:
# Take the five highest-revenue article ids straight from the per-article
# totals (indexed by article_id) instead of hand-copying them from a previous
# output, so this cell stays correct if the data or the ranking changes.
top5_article_ids = (
    top5_article_sales
    .sort_values(by='sales', ascending=False)
    .head(5)
    .index
    .tolist()
)

# Show the catalogue metadata of those five articles.
artcls_df[artcls_df['article_id'].isin(top5_article_ids)]

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
3091,448509014,448509,Perrie Slim Mom Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,72,Blue,3,Light,2,Blue,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,"5-pocket, ankle-length jeans in washed, sturdy..."
16003,568601006,568601,Mariette Blazer,264,Blazer,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1212,Suit,A,Ladieswear,1,Ladieswear,11,Womens Tailoring,1008,Dressed,Fitted jacket in woven fabric with notch lapel...
53892,706016001,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,9,Black,4,Dark,5,Black,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
53893,706016002,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,71,Light Blue,3,Light,2,Blue,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
53894,706016003,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,73,Dark Blue,2,Medium Dusty,2,Blue,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...


#### Aggregate sales for those articles

In [31]:
# The five highest-revenue article ids, derived from the per-article totals
# (indexed by article_id) rather than hard-coded, so the selection follows the
# data instead of a list copied from an earlier output.
top5_article_ids = (
    top5_article_sales
    .sort_values(by='sales', ascending=False)
    .head(5)
    .index
    .tolist()
)

# Daily revenue per selected article: keep only their transactions, sum
# 'price' per (day, article) and expose the total as 'sales'. Days on which an
# article had no transaction produce no row here.
agg_sales_top5_articles = (
    mergd_df.loc[mergd_df['article_id'].isin(top5_article_ids)]
    .groupby(['t_dat', 'article_id'])['price']
    .sum()
    .reset_index()
    .rename(columns={'price': 'sales'})
)

agg_sales_top5_articles.head()

Unnamed: 0,t_dat,article_id,sales
0,2018-09-20,448509014,0.75
1,2018-09-20,568601006,2.34
2,2018-09-21,448509014,0.89
3,2018-09-21,568601006,2.29
4,2018-09-22,448509014,0.13


#### Reindex onto a complete (date, article) grid so that days without sales become explicit rows

In [32]:
# Build the full grid of (day, article) pairs: every calendar day of the
# reporting period crossed with every article present in the aggregate.
all_days = pd.date_range(strt_dt, end_dt)
article_ids = agg_sales_top5_articles['article_id'].unique()
multi_index = pd.MultiIndex.from_product(
    [all_days, article_ids],
    names=['t_dat', 'article_id'],
)

# Align the aggregate to that grid; pairs with no recorded sales show up as
# rows whose 'sales' value is NaN.
indexed_sales = agg_sales_top5_articles.set_index(['t_dat', 'article_id'])
agg_sales_top5_articles = indexed_sales.reindex(multi_index).reset_index()

agg_sales_top5_articles.head()

Unnamed: 0,t_dat,article_id,sales
0,2018-09-20,448509014,0.75
1,2018-09-20,568601006,2.34
2,2018-09-20,706016002,
3,2018-09-20,706016001,
4,2018-09-20,706016003,


#### Fill missing values with 0s

In [33]:
# A NaN in 'sales' marks a (day, article) pair added by the reindex step, i.e.
# a day without any transaction for that article, so record it as zero sales.
agg_sales_top5_articles['sales'] = agg_sales_top5_articles['sales'].fillna(0.00)
agg_sales_top5_articles.head()

Unnamed: 0,t_dat,article_id,sales
0,2018-09-20,448509014,0.75
1,2018-09-20,568601006,2.34
2,2018-09-20,706016002,0.0
3,2018-09-20,706016001,0.0
4,2018-09-20,706016003,0.0


## Feature Engineering