# Packages Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import os
import sys
sys.path.append(os.path.realpath('..')) #note to self: this works, only when notebook is alrdy saved in directory. So, first save notebook and then use this line of code.
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Data Loading

In [None]:
# Read the summer-products sales CSV into a pandas dataframe
sales = pd.read_csv(
    '../input/summer-products-and-sales-in-ecommerce-wish/summer-products-with-rating-and-performance_2020-08.csv'
)

In [None]:
# Dimensions first, then dtypes / non-null counts, then a preview of the first rows
print(sales.shape, end='\n\n')
sales.info()
print()
sales.head()

So...  
We have:  
* NA's  
* probably a single value in currency_buyer  
* probably some useless fields like: product_url, merchant_profile_picture and product_picture fields

# Data Cleaning

Let's start by removing the useless columns we saw previously:

In [None]:
# Work on a reduced copy: `sales` keeps every original column,
# `sales_clean` drops the ones judged not useful for the analysis.
sales_clean = sales.drop(
    [
        'product_url',
        'merchant_profile_picture',
        'product_picture',
        'currency_buyer',
        'theme',
        'crawl_month',
        'merchant_info_subtitle',
        'merchant_name',
        'merchant_title',
        'urgency_text',
    ],
    axis='columns',
)

sales_clean.head()

## Dealing With Missing Data and Duplicates

Let's check again how many uniques and nulls we got:

In [None]:
def null_unique(dataframe):
    """
    Summarise a pandas dataframe column by column.

    Returns a new dataframe indexed by the input's column names, with the columns
    'nr_observations' (row count of the input), 'nr_uniques' (distinct non-null
    values) and 'nr_nulls' (missing values).

    Row ordering of the result:
    - some nulls exist: by 'nr_nulls', descending;
    - no nulls, but not every column is fully unique: by 'nr_uniques', ascending;
    - otherwise: alphabetically by column name.
    """
    summary = pd.DataFrame({
        'nr_observations': dataframe.shape[0],
        'nr_uniques': dataframe.nunique(),
        'nr_nulls': dataframe.isnull().sum(),
    })

    # Guard clauses, from the most to the least informative ordering
    if summary['nr_nulls'].sum() > 0:
        return summary.sort_values(by=['nr_nulls'], ascending=False)

    if summary['nr_uniques'].sum() != summary['nr_observations'].sum():
        return summary.sort_values(by=['nr_uniques'], ascending=True)

    return summary.sort_index()

In [None]:
# Per-column uniques / nulls overview of the reduced dataset
null_unique(dataframe=sales_clean)

As we can see, has_urgency_banner is missing a lot of values.  
This is probably because those products simply have no urgency banner, so we'll fill the NA's with 0 in this case.

In [None]:
# A missing has_urgency_banner is treated as "no banner" (0).
# The result is assigned back to the column: calling fillna(inplace=True) on
# `sales_clean.has_urgency_banner` is a chained in-place call that, under pandas
# Copy-on-Write, only modifies a temporary Series and leaves sales_clean unchanged.
sales_clean['has_urgency_banner'] = sales_clean['has_urgency_banner'].fillna(0)

With the code below, we can easily check that the per-star rating count columns are always null when rating_count equals 0.  

In [None]:
# For each per-star rating column: among the rows where it is null,
# how many have rating_count == 0?
for number in ('one', 'two', 'three', 'four', 'five'):
    col = f'rating_{number}_count'
    null_rows = sales_clean[col].isnull()
    zero_count_nulls = sales_clean.loc[null_rows, 'rating_count'].value_counts()[0]
    print(f'{col} has {zero_count_nulls} null values when rating_count=0')

In these cases I guess it's safe to fill the NA's with 0's as well: this way, any algorithm we feed the data to will see that a product with rating_count=0 also has 0 counts in each of the other rating columns.

In [None]:
# Fill the per-star rating counts with 0 where they are missing.
# The filled columns are assigned back in one step; the previous
# `sales_clean[col].fillna(0, inplace=True)` pattern is a chained in-place call
# that does not update sales_clean under pandas Copy-on-Write.
rating_cols = ['rating_' + number + '_count' for number in ['one', 'two', 'three', 'four', 'five']]
sales_clean[rating_cols] = sales_clean[rating_cols].fillna(0)

Let's check our uniques and nulls grid again, this time only for the cases where we still have NA's:

In [None]:
# Keep only the columns that still contain missing values
grid_check = null_unique(sales_clean)
grid_check.loc[grid_check['nr_nulls'] > 0]

Let's inspect:  
Let's first check whether any duplicated products have a missing (NA) color in some rows and an assigned color in others.

In [None]:
# Count how many times each product_id occurs and keep the ids occurring more than once
mask = sales_clean.product_id.value_counts()
mask = mask[mask > 1]

# Select the rows whose product_id is one of the duplicated ids.
# (Indexing with `iloc[mask.to_list()]` would use the occurrence *counts* as row
# positions and return unrelated rows.)
sales_dup_prod = sales_clean[sales_clean.product_id.isin(mask.index)].sort_values(by=['product_id'])

# Total number of missing cells within the duplicated-products subset
sales_dup_prod.isnull().sum().sum()


This means we have no NA's within the duplicated products subset.  
So, there is no way we can recover the main color of a product from its duplicates.  
For this reason we can fill the NA's of product_color and origin_country with an 'unknown' category.

In [None]:
# Missing colors / origin countries become their own 'unknown' category.
# Assigned back explicitly: chained `fillna(inplace=True)` on an attribute-accessed
# column does not update sales_clean under pandas Copy-on-Write.
sales_clean['product_color'] = sales_clean['product_color'].fillna('unknown')
sales_clean['origin_country'] = sales_clean['origin_country'].fillna('unknown')

Finally, let's check product_variation_size_id for NA's:  
I suspect that this field is NA whenever product_variation_inventory = 0

In [None]:
# Rows with a missing size id, next to their inventory level
sales_clean.loc[
    sales_clean.product_variation_size_id.isnull(),
    ['product_id', 'product_variation_inventory', 'product_variation_size_id'],
]

Ooopss...! I guess I was wrong =)  
I guess we won't be able to know which size variations are available for these products, so we'd better fill the NA's with an 'unknown' category

In [None]:
# Missing size ids become an 'unknown' category.
# Assigned back explicitly: chained `fillna(inplace=True)` on an attribute-accessed
# column does not update sales_clean under pandas Copy-on-Write.
sales_clean['product_variation_size_id'] = sales_clean['product_variation_size_id'].fillna('unknown')

Let's check our unique and null grid again:

In [None]:
# Uniques / nulls overview after all the fills above
null_unique(dataframe=sales_clean)

This time we have no missing data anymore. Hurray to that! =)  
However, we still have about 200 duplicated product ID's.  
Let's get rid of those, as they won't help in the predictive task we have ahead.

In [None]:
# Keep the first row of every product_id and renumber the rows from 0
sales_clean = sales_clean.drop_duplicates(subset=['product_id'], keep='first', ignore_index=True)

null_unique(dataframe=sales_clean)

## Organizing our Dataset

Together, merchant_id and product_id now uniquely identify each row of the dataset.  
Let's use them as the index:  

In [None]:
# Two-level index (merchant, product), sorted by it
sales_clean = sales_clean.set_index(['merchant_id', 'product_id'])
sales_clean = sales_clean.sort_index()

sales_clean.info()

We can now build a couple of lists of dataset columns that might prove useful ahead.  
We can divide the columns into flags (binary features), numerical features and categorical features:

In [None]:
# Flag (binary) features: every column that takes exactly two distinct values
_b = null_unique(sales_clean)
flag_cols = _b.index[_b['nr_uniques'] == 2].tolist()
flag_cols

In [None]:
# Numeric features: everything that is neither a flag nor an object (string) column
num_cols = [
    col
    for col in sales_clean.columns
    if not (col in flag_cols or sales_clean[col].dtype == 'O')
]
num_cols

In [None]:
# Categorical features: all object (string) columns
cat_cols = list(filter(lambda name: sales_clean[name].dtype == 'O', sales_clean.columns))
cat_cols

In [None]:
# Sanity check: the three lists together must account for every column.
# Counts observed when this notebook was written:
#   len(sales_clean.columns) == 31
#   len(flag_cols) == 7
#   len(num_cols) == 17
#   len(cat_cols) == 7

sum(map(len, (flag_cols, num_cols, cat_cols))) == len(sales_clean.columns)

# Continuing With Data Cleaning Tasks

### Categorical Features  Cleaning

In [None]:
# Number of distinct values per categorical feature
for col in cat_cols:
    n_categories = len(sales_clean[col].unique())
    print(f'There are {n_categories} distinct categories for {col} feature.')

As we can see, our categorical features have a lot of categories (and I'm not referring to the title or tags features right now, we'll deal with those later on).  
We should try to reduce the number of categories in each of them, or the predictive model might be poorly fed.  
Below we'll strip any leading and/or trailing spaces from each feature and convert the strings to lower case:  

In [None]:
# Lower-case every categorical feature, then strip leading/trailing spaces
for col in cat_cols:
    sales_clean[col] = sales_clean[col].str.lower().str.strip()

sales_clean[cat_cols].head()

Ok!  
Now that we have uniformized the strings characters, let's check what we got using the help of some plotting:

In [None]:
# One horizontal bar chart of category frequencies per categorical feature;
# the title* and tags columns are skipped here.
# (A line holding only `;` is not valid Python, so the stray one was removed.)
for col in cat_cols:
    if 'title' in col or 'tags' in col:
        continue
    pd.DataFrame( sales_clean[col].value_counts(ascending=True) ).plot(kind='barh', figsize=(10,22), legend = False)
    plt.title( col + ' | Categories Distribution.' )

As we can see, although there are a lot of categories in each categorical feature, we can try to reduce them in some cases:  
For instance, in ``` shipping_option_name ``` we can see that the categories are expressed in several languages.  
One thing we could try is to use googletrans to detect the language and translate to English. However, there is a flag column   
that identifies whether the shipping is express or not (which is the main information a model would retain from shipping_option_name if we fed it in), so this column ends up being redundant and we can drop it.  
In any case I'll leave a cell below with a googletrans implementation (yeah! I tried the implementation, because I forgot about the flag column mentioned previously).

In [None]:
# from googletrans import Translator
# def translate_strings( pandas_series ):
#     """
#     Translate the unique values from a pandas series, in order to minimize the calls to google services.
#     Even thought google service is free it has a limit, therefore the use fo this function.
# 
#     This functions returns a dataframe with 2 columns:
#     real_cats = original pandas series categories
#     cats = translated cateories
#     """
#     translator = Translator()
#     _b = pd.DataFrame( pandas_series.unique(), columns = ['real_cats'] )
#     _b['cats'] = _b['real_cats'].apply(translator.translate, dest='en').apply(getattr, args=('text',)).str.lower()
# 
#     return _b
# 
# ship_options = translate_strings( sales_clean['shipping_option_name'] )
# 
# 
# replacing_cats = {
#     'standard delivery': 'standard shipping',
#     'standard post': 'standard shipping',
#     'normal delivery': 'standard shipping',
#     'express delivery': 'express shipping'
# }
# ship_options['cats'].replace(replacing_cats, inplace = True)
# ship_options.set_index('real_cats', inplace = True)
# 
# sales_clean['shipping_option_name'] = sales_clean.shipping_option_name.replace( ship_options.cats.to_dict() )
# sales_clean.shipping_option_name.unique()

In [None]:
# shipping_option_name is dropped; keep the categorical column list in sync
sales_clean = sales_clean.drop(columns=['shipping_option_name'])
cat_cols.remove('shipping_option_name')

In ```origin_country```, we can try to transform the feature to have 2 categories, since the majority of the records has 'cn' value:  
let's say 'cn' and 'others' will be the new categories:

In [None]:
# Two categories only: 'cn' stays as is, every other value becomes 'other country'
sales_clean['origin_country'] = sales_clean['origin_country'].mask(
    sales_clean['origin_country'] != 'cn',
    'other country',
)

In ```product_color```, we can try to reduce the feature to a handful of categories with a more balanced distribution:

In [None]:
# Keep the listed colors (plus 'unknown'); any other value is grouped under 'other'
sales_clean['product_color'] = sales_clean['product_color'].mask(
    ~sales_clean['product_color'].isin(
        ['black', 'white', 'pink', 'blue', 'yellow', 'red', 'green', 'grey', 'purple', 'unknown']
    ),
    'other',
)

In [None]:
# Resulting color distribution
sales_clean['product_color'].value_counts()

In ``` product_variation_size_id ``` we have some more data cleaning to deal with

In [None]:

# Normalise the size labels step by step.
# `regex` is passed explicitly on every call: since pandas 2.0 `str.replace` treats
# the pattern as a literal string by default, which would silently disable the two
# regular-expression replacements below.
sales_clean['product_variation_size_id'] = (
    sales_clean['product_variation_size_id']
    .str.replace(r'.*[^x*]xl', 'xl', regex=True)   # regex: collapse the matched '...xl' span to 'xl'
    .str.replace('size', '', regex=False)           # literal: remove the word 'size'
    .str.replace('--', '', regex=False)             # literal: remove double dashes
    .str.replace('-', '', regex=False)              # literal: remove remaining dashes
    .str.replace(r'\.', '', regex=True)             # regex: remove literal dots
    .str.strip()
)

sales_clean.product_variation_size_id.unique()

In [None]:
# Keep the standard letter sizes; any other value is grouped under 'other size'
sales_clean['product_variation_size_id'] = sales_clean['product_variation_size_id'].mask(
    ~sales_clean['product_variation_size_id'].isin(['xxs', 'xs', 's', 'm', 'l', 'xl', 'xxl']),
    'other size',
)
sales_clean['product_variation_size_id'].unique()

In [None]:
# Same category-frequency charts as before, now on the reduced categories;
# the title* and tags columns are skipped here.
# (A line holding only `;` is not valid Python, so the stray one was removed.)
for col in cat_cols:
    if 'title' in col or 'tags' in col:
        continue
    pd.DataFrame( sales_clean[col].value_counts(ascending=True) ).plot(kind='barh', legend = False)
    plt.title( col + ' | Categories Distribution.' )

The title columns can be used to build a new flag feature that tells an eventual predictive model whether a translation was made available for the product.  
Perhaps consumers buy more when the product has a translation into their own language, because this alone can give the consumer a better perception of what the product is.

In [None]:
# flag_has_transl = 1 when the displayed title differs from the original title,
# i.e. when a translation is assumed to have been provided; 0 when both are identical.
# (The comparison used to be `==`, which set the flag for the rows WITHOUT a translation.)
sales_clean['flag_has_transl'] = np.where( sales_clean.title != sales_clean.title_orig, 1, 0 )

# The raw title columns are no longer needed once the flag exists
sales_clean.drop(columns=['title','title_orig'], inplace=True)

# Keep the column bookkeeping lists in sync
cat_cols.remove('title')
cat_cols.remove('title_orig')
flag_cols.append('flag_has_transl')

### Flag Features  Cleaning

For this set of features we will only change the data type in order to optimize memory usage a little bit

In [None]:
# dtypes and memory footprint of the flag columns before the cast
sales_clean.loc[:, flag_cols].info()

In [None]:
# Store every flag column as an unsigned 8-bit integer
sales_clean[flag_cols] = sales_clean[flag_cols].astype('uint8')

sales_clean[flag_cols].info()

In [None]:
# One horizontal bar chart of value frequencies per flag column
for col in flag_cols:
    flag_counts = sales_clean[col].value_counts(ascending=True).to_frame()
    flag_counts.plot(kind='barh', legend=False)
    plt.title(col + ' | Categories Distribution.');

### Numeric Features  Cleaning

In [None]:
# dtypes and non-null counts of the numeric columns
sales_clean.loc[:, num_cols].info()

Nothing to clean in this subset of features

# Feature Engineering

Before diving deep into analytics, lets build a new numeric feature that will represent the user perception of the advantage that is buying the product via Wish platform when comparing with other retailers.  
For that we will simply calculate the difference between the retail price and "wish" price:

In [None]:
# user_discount = retail price minus the price on the platform
sales_clean['user_discount'] = sales_clean['retail_price'].sub(sales_clean['price'])
num_cols.append('user_discount')

sales_clean[['retail_price', 'price', 'user_discount']].head()

Please note:  
  
* If the ```user_discount``` < 0 => negative user perception: he will spend that extra money if buying product via Wish platform;  
* If the ```user_discount``` > 0 => positive user perception: he will save that amount of money if buying product via Wish platform;  

At this point, we can also build another numeric feature that basically counts the number of tags a product has

In [None]:
# Number of tags per product. Tags are comma-separated, so a string holding
# n tags contains n - 1 commas: add 1 to the comma count to get the tag count.
sales_clean['tags_nr'] = sales_clean.tags.str.count(',') + 1
num_cols.append('tags_nr')

# The raw tags text is no longer needed; keep the column bookkeeping list in sync
sales_clean.drop(columns = ['tags'], inplace =True)
cat_cols.remove('tags')

sales_clean[['retail_price','price','user_discount', 'tags_nr', 'units_sold']].head()

# Descriptive Analytics

## Numeric Features

Let's check numeric features distributions

In [None]:
def distplot_matrix(total_cols, dataframe):
    """
    Draw one seaborn distribution plot per column of `dataframe`, laid out on a
    grid of subplots, and show the figure.

    This function is meant for numeric features.

    Input elements:
    total_cols - number of subplot columns in the grid
    dataframe - pandas dataframe with the features to plot (one subplot per column)
    """
    # Subplots are organized in a Rows x Cols grid
    # https://stackoverflow.com/questions/12319796/dynamically-add-create-subplots-in-matplotlib
    Tot = len( dataframe.columns )
    Cols = total_cols

    # Ceiling division: the smallest number of rows that holds Tot plots.
    # (Tot // Cols + Tot % Cols over-allocates rows whenever the remainder exceeds 1.)
    Rows = (Tot + Cols - 1) // Cols

    # Create main figure
    fig = plt.figure(1, figsize=(30,35))
    # optimization needed: dynamically adjust figure size given the total columns the user wants to see in final matrix and the total charts to display = len(dataframe.columns)

    # Subplot positions are 1-based
    for position, col in enumerate( dataframe.columns, start=1 ):
        ax = fig.add_subplot(Rows, Cols, position)

        sns.distplot(
            dataframe[col],
            hist_kws={"histtype": "bar", "rwidth":0.7,  "alpha": 1, "color": "#f1c80f"},
            kde_kws={"color": "black", "lw": 2, "bw":0.6},
            ax=ax
            )
        # Removes frame but keep axis
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.set_title(col + ' Distribution', fontweight="bold")

    plt.show()

In [None]:
# Distribution of every numeric feature, four plots per row
distplot_matrix(total_cols=4, dataframe=sales_clean[num_cols])

From the above matrix of distribution plots we can retain some conclusions:  
* If we compare the price and retail_price plots, we can see that retail_price has a scale about 5 times larger than price. Maybe this is why consumers come to this platform to buy. Perhaps the price differences between Wish and other selling spots really are this big, and consumers realize that when shopping.  
  
* Another interesting observation is that the majority of the mean product ratings concentrate around 4 (on a scale of 1 to 5), which means that the Wish platform users'/consumers' perspective on the products is, overall, pretty decent. The same can be observed for the seller ratings. This leads us to conclude that Wish consumers' perception of products and sellers is relatively good and, probably, this fact alone generates more confidence amongst consumers to keep buying or, eventually, to buy more.  
  
* The rating count features all have very similar distributions, although with different frequencies.  
  
* Apparently products/sellers do not own many badges, since most observations concentrate around 0 units.  
  
* Most of the products are well stocked (perhaps too well stocked?), since most of the observations concentrate on 50 units. However, there is a quite relevant share of products that concentrate on 10 or fewer units. Perhaps those are the products that are selling well and need to be replenished more frequently?  
  
* The shipping prices are, in the majority of cases, very low (<4€). This fact alone is another sales booster, since consumers won't have to spend a lot more money on shipping costs.  
  
* There is a significant proportion of products where the user discount is null or negative, although for the majority of the products the users can actually get some decent discounts.
  
* A final observation about the countries a given product is shipped to: most products are shipped to around 40 distinct countries. This reinforces the conclusions we drew earlier about consumer perception, now on a global level.  
  
  

## Numeric Features Correlations

Let's plot Pearson correlations:

In [None]:
plt.figure(figsize=(15,9))

# Pearson correlations between the numeric features, rounded for annotation
corr_df = sales_clean[num_cols].corr().round(2)

# Boolean mask hiding the upper triangle (diagonal included), so that each pair
# of features is shown only once
mask_ut = np.triu( np.ones(corr_df.shape, dtype=bool) )

ax = sns.heatmap( corr_df, mask = mask_ut, annot=True )
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)
# Trailing `;` on the last statement suppresses the text repr in the cell output
# (a line holding only `;` is not valid Python).
ax.set_title('Numerical features Correlations', fontweight="bold");

We can see that:  
* rating_count is highly correlated with 'rating_five_count', 'rating_four_count', 'rating_three_count', 'rating_two_count' and 'rating_one_count'  
* all the previously mentioned features are correlated with units sold  
* shipping_option_price is highly correlated with price
* user_discount is highly correlated with retail_price

For this reason, it's safe to drop 'retail_price','shipping_option_price', 'rating_five_count', 'rating_four_count', 'rating_three_count', 'rating_two_count', 'rating_one_count' and 'rating_count' features

In [None]:
# Highly correlated numeric features that are removed from the dataset
num_cols_to_drop = [
    'retail_price',
    'shipping_option_price',
    'rating_five_count',
    'rating_four_count',
    'rating_three_count',
    'rating_two_count',
    'rating_one_count',
    'rating_count',
]

sales_clean = sales_clean.drop(columns=num_cols_to_drop)

# Keep the numeric column list in sync with the dataframe
num_cols = list(filter(lambda col: col not in num_cols_to_drop, num_cols))

## Categorical Features Distributions

In [None]:
def hist_matrix(total_cols, dataframe):
    """
    Draw one horizontal bar chart of value frequencies per column of `dataframe`,
    laid out on a grid of subplots, and show the figure.

    This function is meant for categorical features only.

    Input elements:
    total_cols - number of subplot columns in the grid
    dataframe - pandas dataframe with the features to plot (one subplot per column)
    """
    # Subplots are organized in a Rows x Cols grid
    # https://stackoverflow.com/questions/12319796/dynamically-add-create-subplots-in-matplotlib
    Tot = len( dataframe.columns )
    Cols = total_cols

    # Ceiling division: the smallest number of rows that holds Tot plots.
    # (Tot // Cols + Tot % Cols over-allocates rows whenever the remainder exceeds 1.)
    Rows = (Tot + Cols - 1) // Cols

    # Create main figure
    fig = plt.figure(1, figsize=(25,5))
    # optimization needed: dynamically adjust figure size given the total columns the user wants to see in final matrix and the total charts to display = len(dataframe.columns)

    #sns.set_palette(sns.cubehelix_palette(8))
    sns.set_palette(sns.color_palette("BrBG", 7))

    # Subplot positions are 1-based
    for position, col in enumerate( dataframe.columns, start=1 ):
        ax = fig.add_subplot(Rows, Cols, position)

        # Name the counts and the categories explicitly instead of renaming the
        # value_counts() column: its default name depends on the pandas version
        # (the feature name before 2.0, 'count' afterwards).
        _data = (
            dataframe[col].value_counts()
            .rename('freq')
            .rename_axis('category')
            .reset_index()
        )

        sns.barplot(
            data = _data,
            y = 'category',
            x = 'freq',
            ax=ax
            )
        # The categories speak for themselves: no y-axis label
        ax.set_ylabel('')
        # Removes frame but keep axis
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.set_title(col + ' Distribution', fontweight="bold")

    plt.show()



In [None]:
# Category frequencies of the remaining categorical features, three plots per row
hist_matrix(total_cols=3, dataframe=sales_clean[cat_cols])

With those plots we can conclude that:  
* Most products in the dataset have colors, or combinations of colors, other than black, white, pink, etc. However, there is a big proportion of products whose color is black or white;  
* Most products in the dataset have size s or xs;  
* Most products in the dataset originate from China (which probably explains the lower price ranges of the Wish platform when compared with the prices of other retailers).

## Binary features

In [None]:
def flag_matrix(total_cols, dataframe):
    """
    Draw one horizontal bar chart of value frequencies per column of `dataframe`,
    laid out on a grid of subplots, and show the figure.

    This function is meant for flag (binary) features: a two-colour palette is used.

    Input elements:
    total_cols - number of subplot columns in the grid
    dataframe - pandas dataframe with the features to plot (one subplot per column)
    """
    # Subplots are organized in a Rows x Cols grid
    # https://stackoverflow.com/questions/12319796/dynamically-add-create-subplots-in-matplotlib
    Tot = len( dataframe.columns )
    Cols = total_cols

    # Ceiling division: the smallest number of rows that holds Tot plots.
    # (Tot // Cols + Tot % Cols over-allocates rows whenever the remainder exceeds 1.)
    Rows = (Tot + Cols - 1) // Cols

    # Create main figure
    fig = plt.figure(1, figsize=(25,22))
    # optimization needed: dynamically adjust figure size given the total columns the user wants to see in final matrix and the total charts to display = len(dataframe.columns)

    # Subplot positions are 1-based
    for position, col in enumerate( dataframe.columns, start=1 ):
        ax = fig.add_subplot(Rows, Cols, position)

        # Name the counts and the categories explicitly instead of renaming the
        # value_counts() column: its default name depends on the pandas version
        # (the feature name before 2.0, 'count' afterwards).
        _data = (
            dataframe[col].value_counts()
            .rename('freq')
            .rename_axis('category')
            .reset_index()
        )

        sns.barplot(
            data = _data,
            y = 'category',
            x = 'freq',
            orient = 'h',
            palette = ['#4341ab', '#85738f'],
            ax=ax
            )
        # The flag values speak for themselves: no y-axis label
        ax.set_ylabel('')
        # Removes frame but keep axis
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.set_title(col + ' Distribution', fontweight="bold")

    # Layout and display belong to the function body, so they run on every call
    # rather than once when the function is defined
    plt.tight_layout()
    plt.show()

Lets make some plots

In [None]:
# Value frequencies of every flag feature, three plots per row
flag_matrix(total_cols=3, dataframe=sales_clean[flag_cols])

From the above mosaic we can conclude:  
* Most products do not own a quality badge;  
  
* Most products are not classified with an urgency banner (maybe because, as we have seen previously, most products are well stocked?);  
  
* Most products are not shipped in express mode;  
  
* Most products do not own a fast shipping badge, meaning that most products do not get shipped rapidly (or at least, users are not reporting them as so);  
  
* Most products are not produced locally;  
  
* Most merchants do not own a picture (I don't know how much this feature, ```merchant_has_profile_picture```, can be a decent input for a machine learning model. However, I, as a consumer, would feel a lot more comfortable buying from a known/trusted merchant and, in that sense, I believe a picture would help me gain some trust in the merchant if I didn't know them already);  
 
* In our dataset, there is a balance between products whose merchants use advertisement boosts and products whose merchants do not;  
  
* In most cases, merchants do not offer a translation for the products

## Joint Features Analytics | Outlier Detection

Let's start by building a function that will allow us to do some massive box plotting

In [None]:
def boxplot_matrix(total_cols, num_cols, cat_col, dataframe):
    """
    Draw one horizontal seaborn box plot per numeric column, each split by the
    categories of `cat_col`, laid out on a grid of subplots, and show the figure.

    Within each plot the categories are ordered by the mean of the numeric
    column, from highest to lowest.

    Input elements:
    total_cols - number of subplot columns in the grid
    num_cols - list with the numeric columns of the dataframe to plot (one subplot each)
    cat_col - name of the categorical column used to split every box plot
    dataframe - pandas dataframe holding both the numeric and the categorical columns
    """
    # Subplots are organized in a Rows x Cols grid
    # https://stackoverflow.com/questions/12319796/dynamically-add-create-subplots-in-matplotlib

    # One plot per numeric column: the grid is sized from num_cols, not from
    # every column of the dataframe.
    Tot = len( num_cols )
    Cols = total_cols

    # Ceiling division: the smallest number of rows that holds Tot plots.
    # (Tot // Cols + Tot % Cols over-allocates rows whenever the remainder exceeds 1.)
    Rows = (Tot + Cols - 1) // Cols

    # Create main figure
    fig = plt.figure(1, figsize=(23,70))
    # optimization needed: dynamically adjust figure size given the total columns the user wants to see in final matrix and the total charts to display = len(num_cols)

    sns.set_palette(sns.color_palette("BrBG", 7))

    # Subplot positions are 1-based
    for position, num_col in enumerate( dataframe[num_cols].columns, start=1 ):
        ax = fig.add_subplot(Rows, Cols, position)

        # Categories sorted by their mean of num_col, highest first
        category_order = (
            dataframe.groupby(cat_col)[num_col].mean()
            .sort_values(ascending = False)
            .index
        )

        sns.boxplot(
            data = dataframe,
            x = num_col,
            y = cat_col,
            order = category_order,
            linewidth= 1,
            orient = 'h',
            ax=ax
            )
        # Removes frame but keep axis
        ax.spines['right'].set_color('none')
        ax.spines['top'].set_color('none')
        ax.set_title( cat_col +  ' Vs. ' + num_col, fontweight="bold" )

    # Layout and display belong to the function body, so they run on every call
    # rather than once when the function is defined
    plt.tight_layout()
    plt.show()



In [None]:
# cat_cols == ['product_color', 'product_variation_size_id', 'origin_country']
# <-- pick any of these as the cat_col argument below to switch the splitting variable
boxplot_matrix(total_cols=2, num_cols=num_cols, cat_col='product_color', dataframe=sales_clean);

From the above mosaic of boxplots we can see that there are a lot of outliers.  
So, the next step is to scale the data. Scaling will be beneficial not only to reduce the influence of outliers, but also to bring all features to a common scale.


# Preprocessing

## Splitting Data

In [None]:
# Target variable: units sold
Y = sales_clean.loc[:, 'units_sold']

In [None]:
# Input features: every column except the target
X = sales_clean.drop('units_sold', axis='columns')

In [None]:
# Fixed seed so the 70/30 split is reproducible
rand_seed = 123
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=rand_seed,
)

## Getting Dummies

In [None]:
# One-hot encode the categorical features with the SAME category set on both splits.
# The categories are learned from the training split only; encoding each split
# independently can yield different dummy columns (or a different dropped
# reference category) whenever a category is missing from one of the splits.
# A test value never seen in training gets 0 in every dummy column of its feature.
cat_dtypes = {
    col: pd.CategoricalDtype(categories=sorted(x_train[col].unique()))
    for col in cat_cols
}

x_train = pd.get_dummies( x_train.astype(cat_dtypes), columns = cat_cols, prefix_sep = '==' , drop_first = True)
x_test = pd.get_dummies( x_test.astype(cat_dtypes), columns = cat_cols, prefix_sep = '==' , drop_first = True)


## Feature Scaling

In [None]:

# perform a robust scaler transform of the dataset
trans = RobustScaler()

# Only the input features are scaled, never the target: build the list of numeric
# input features without mutating num_cols.
feature_num_cols = [col for col in num_cols if col != 'units_sold']

# Fit the scaler on the training split only, then apply that same fitted transform
# to the test split. Both splits must go through the scaler: a model trained on
# scaled features cannot be evaluated on unscaled ones.
sales_num_scaled = pd.DataFrame(
    trans.fit_transform(x_train[feature_num_cols]),
    columns = feature_num_cols,
    index = x_train.index
)
sales_num_scaled_test = pd.DataFrame(
    trans.transform(x_test[feature_num_cols]),
    columns = feature_num_cols,
    index = x_test.index
)

# Swap the raw numeric columns for their scaled versions
x_train = pd.concat( [x_train.drop(columns = feature_num_cols), sales_num_scaled], axis = 1 )
x_test = pd.concat( [x_test.drop(columns = feature_num_cols), sales_num_scaled_test], axis = 1 )

# keep the test set with the same column order as the train set
x_train = x_train.sort_index(axis=1)
x_test = x_test.sort_index(axis=1)

# Predicting Units sold

In [None]:
# Names of the scorers accepted by GridSearchCV's `scoring` argument.
# `metrics.get_scorer_names()` is the supported accessor; the `metrics.SCORERS`
# dict it replaces is no longer available in recent scikit-learn releases, so it
# is only used as a fallback on older versions.
if hasattr(metrics, 'get_scorer_names'):
    scorer_names = list(metrics.get_scorer_names())
else:
    scorer_names = list(metrics.SCORERS.keys())

scoring_gridsearchcv = pd.DataFrame( scorer_names, columns = ['metrics_name_gridsearchcv'] )
#scoring_gridsearchcv
scoring_gridsearchcv[scoring_gridsearchcv.metrics_name_gridsearchcv.str.contains('error')]

## Finding baseline predictions MAE

In [None]:
# Naive baseline: predict the training-set mean of the target for every test row
baseline_predictions = np.full(y_test.shape, y_train.mean())
mae_baseline = mean_absolute_error(y_test, baseline_predictions)

print(f'Baseline MAE is {mae_baseline:.2f}')

## Finding more advanced solutions: XGBoost

In [None]:
# Hyper-parameter grid explored for the XGBoost regressor
param_grid = dict(
    nthread=[2],  # when use hyperthread, xgboost may become slower
    learning_rate=[0.001, 0.005, 0.01],  # so called `eta` value
    max_depth=[3, 4],
    # importance_type=['weight', 'gain', 'cover'],
    # min_child_weight=[1],
    # gamma=[0.0],
    colsample_bytree=[0.2],
    verbosity=[0],
    n_estimators=[800, 900, 1000],  # number of trees
    seed=[rand_seed],
)

# Grid-search criterion (negated MAE, so that higher is better)
scoring_func = 'neg_mean_absolute_error'

# NOTE(review): a *stratified* splitter is used on a regression target here —
# confirm this is intended.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=rand_seed)

optimal_xgb_model = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=param_grid,
    scoring=scoring_func,
    cv=cv,
    error_score='raise',
    verbose=1,
)
optimal_xgb_model.fit(x_train, y_train)

In [None]:
# Best estimator found by the grid search, followed by its cross-validated score
print('Optimal xgb model configuration found:', end='\n\n')
print(optimal_xgb_model.best_estimator_, end='\n\n\n')
print('Optimal ' + scoring_func + ' :')
print(int(optimal_xgb_model.best_score_.round()))

In [None]:
# Test-set predictions, rounded to whole units, indexed like x_test
y_pred = pd.Series(
    optimal_xgb_model.predict(x_test).round().astype('int'),
    index=x_test.index,
)

# Side-by-side comparison of predicted and actual units sold
df_final_comp = y_pred.to_frame('units_sold_pred_xgb').join(y_test)

df_final_comp.head(10)

## Finding more advanced solutions: AdaBoost

https://machinelearningmastery.com/adaboost-ensemble-in-python/

In [None]:
# Hyper-parameter grid explored for the AdaBoost regressor
param_grid = {
    'n_estimators':[5, 10, 20, 50, 100],
    'learning_rate':[0.00001, 0.00002, 0.0001]
    }

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=rand_seed)

# The estimator is seeded as well, so the search returns the same result on every
# run (AdaBoost resamples the training data at random when fitting).
optimal_ada_model = GridSearchCV(
    estimator= AdaBoostRegressor(random_state=rand_seed),
    param_grid = param_grid,
    scoring = scoring_func,
    cv= cv,
    error_score='raise',
    verbose = 1
)
optimal_ada_model.fit(x_train, y_train)

In [None]:
# Best estimator found by the grid search, followed by its cross-validated score
print('Optimal AdaBoost model configuration found:', end='\n\n')
print(optimal_ada_model.best_estimator_, end='\n\n\n')
print('Optimal ' + scoring_func + ' :')
print(int(optimal_ada_model.best_score_.round()))

In [None]:
# Test-set predictions, rounded to whole units, indexed like x_test
y_pred_ada = pd.Series(
    optimal_ada_model.predict(x_test).round().astype('int'),
    index=x_test.index,
)

# Add the AdaBoost predictions next to the XGBoost ones and the actual values
df_final_comp = df_final_comp.assign(units_sold_pred_ada=y_pred_ada)
df_final_comp = df_final_comp.loc[:, ['units_sold_pred_xgb', 'units_sold_pred_ada', 'units_sold']]

df_final_comp.head(10)

### A couple comments:  
Overall I'm not happy with either of the models found.  
They can definitely be improved.  
I'm not sure if I tuned the models correctly.  
It's definitely possible to build more predictive features out of the dataset, but so far I have no idea how to do it.