# Segmentation analysis

For the segmentation analysis I will revert back to using the data that has entries only for sales (no zeros). The reason for this is that what is interesting here is to better understand products with respect to when they sell (instead of when they don't sell). Investigating these product wrt when they don't sell if left for future work.

With respect to sales it's interesting to uncover insights that may better inform future sales. As such, we include the following features in the segmentation analysis:
* Average Price of product
* Average amount of items sold per month
* Total number of months in which product was sold

With the data that is available, these are the only features I can think to include

__Unfortunately this notebook is incomplete as my time available to spend on this assessment has run out.
Below I have created the features listed above, but I did not have time complete the actual analysis. This is dedicated to future work.__

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler

In [2]:
#Import data
def create_main_df(sales_path: str, items_path: str) -> pd.DataFrame:
    """
    Create the main dataframe containing the raw sales data.
    :param sales_path: String pointing to sales file
    :param items_path: Sting pointing to items file
    :return: pd.DataFrame containg sales data merged with item category data
    """
    #Load csv with sales data
    df_sales = pd.read_csv(sales_path)
    #Load cvs with product category data
    df_items = pd.read_csv(items_path)
    #Merge
    df = pd.merge(df_sales, df_items, on='item_id')
    
    return df

#Create main df
sales_path = 'historic_sales_data.csv'
items_path = 'items.csv'
df = create_main_df(sales_path, items_path)
    
    

In [3]:
#Aggregate data
def construct_monthly_sales_df(df_func: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate sales data to monthly.
    :param df_func: pd.DataFrame containing the sales data current on a daily level
    :return: pd.DataFrame containing aggregated sales data
    """
    #Create column on which to group data
    df_func['year_month'] = df_func.apply(lambda row:  str(row['year']) + str(row['month']), axis=1)
    #Construct monthly dataframe
    df_func = df_func.groupby(by=['year_month', 'shop_id', 'item_id', 'item_category_id']).agg({'item_cnt_day': 'sum', 'item_price': 'mean'}).reset_index()
    #Rename columns
    df_func.rename(columns={'item_cnt_day': 'item_count_month'}, inplace=True)
    #Round item price
    df_func['item_price'] = df_func['item_price'].round(1)
    #Recreate year and month columns
    df_func['year'] = df_func['year_month'].apply(lambda x:  int(x[:4]))
    df_func['month'] = df_func['year_month'].apply(lambda x:  int(x[4:]))
    #Assign day equal to 1 as placeholder
    df_func['day'] = 1
    #Generate datetime
    df_func['date'] = pd.to_datetime(df_func[['year', 'month', 'day']])
    
    return df_func

#Aggregate sales to monthly
df_monthed = construct_monthly_sales_df(df.copy())

In [4]:
#limit the data set to products that sold in 2020
def twenty20_products(df_func: pd.DataFrame, date_limit: datetime) -> pd.DataFrame:
    """
    Limit data to product that were sold in 2020.
    :param df_func: pd.DataFrame containing sales data
    :param date_limit: datetime object indicating relevant cutoff date
    return: pd.DataFrame containing filtered products
    """
    #Identify products
    prods_sold_2020 = df_func[df_func['date']>= date_limit]['item_id'].unique()
    #Limit dataframe
    df_func = df_func.drop(df_func[~df_func['item_id'].isin(prods_sold_2020)].index)
    
    return df_func

#Limit data to 2020 products
date_limit = datetime(2020, 1, 1)
df_monthed = twenty20_products(df_monthed, date_limit)

In [5]:
#limit the data set to the past 15 months
def limit_data_by_date(df_func: pd.DataFrame, date_limit: datetime) -> pd.DataFrame:
    """
    Limit data to sales after date_limit.
    :param df_func: pd.DataFrame containing sales data
    :param date_limit: datetime object indicating relevant cutoff date
    return: pd.DataFrame containing filtered products
    """
    df_func = df_func.drop(df_func[df_func['date']< date_limit].index)
    
    return df_func

#Limit data by date
date_limit = datetime(2019, 3, 1)
df_monthed = limit_data_by_date(df_monthed, date_limit)

In [6]:
#Cap price outliers 
def cap_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cap outliers wrt item price in order to reduce their effect during training.
    #NOTE: Based on investigation anything > 30000 was determined an outlier
    #TODO: Consider making outlier cap shop specific
    :param df_train_func: pd.DataFrame containg training data
    :param shop: Integer identifyin shop
    :return: pd.DataFrame updated with capped prices
    """
    df["item_price"] = df["item_price"].apply(lambda x: 30000 if x > 30000 else x)
    
    return df

df_monthed = cap_outliers(df_monthed)

In [7]:
#Create features for segmentation
def limit_data_by_date(df_func: pd.DataFrame, date_limit: datetime) -> pd.DataFrame:
    """
    Create the features of interest.
    :param df_func: pd.DataFrame containing sales data
    :param date_limit: datetime object indicating relevant cutoff date
    return: pd.DataFrame containing filtered products
    """
    df_func = df_func.drop(df_func[df_func['date']< date_limit].index)
    
    return df_func

#Limit data by date
date_limit = datetime(2019, 3, 1)
df_monthed = limit_data_by_date(df_monthed, date_limit)

In [9]:
#Aggregate data
def create_features(df_func: pd.DataFrame) -> pd.DataFrame:
    """
    Create the features for segmentation. That is:
        - Average Price of product
        - Average amount of items sold per month
        - Total number of months in which product was sold
        - Number of shops at which item is sold

    :param df_func: pd.DataFrame containing the sales data
    :return: pd.DataFrame containing created freatured for segmentation
    """
    #Average Price of product
    df_price= df_monthed.groupby(by=['item_id']).agg({'item_price': 'mean'}).round()
    # Average amount of items sold per month
    df_sold_per_month = df_monthed.groupby(by=['item_id']).agg({'item_count_month': 'sum'})/15
    df_sold_per_month['item_count_month'] = df_sold_per_month['item_count_month'].round()
    #Total number of months in which product was sold
    df_month_in_solf = df_monthed.groupby(by=['item_id']).agg({'month': 'nunique'})
    #Number of shops at which item is sold
    df_shops_sold_in = df_monthed.groupby(by=['item_id']).agg({'shop_id': 'nunique'})

    df_segmentation = pd.concat([df_price, df_sold_per_month, df_month_in_solf, df_shops_sold_in], axis=1)
    return df_segmentation, df_segmentation.columns.to_list()

#Create features for segmentation
df_segmentation, column_names = create_features(df_monthed)

In [10]:
#standardise data
def standardise_data(df_segmentation: pd.DataFrame) -> pd.DataFrame:
    """
    Standardise data so as to improve clustering quality.
    :param df_segmentation: pd.DataFrame containing data to be clustered
    """
    # Create an instance of the StandardScaler
    scaler = StandardScaler()

    # Fit the scaler to your data
    scaler.fit(df_segmentation)

    # Transform the data using the scaler
    df_segmentation_scaled = scaler.transform(df_segmentation)
    
    return df_segmentation_scaled

df_segmentation_scaled = standardise_data(df_segmentation)

In [11]:
df_segmentation_scaled = pd.DataFrame(data=df_segmentation_scaled, columns=column_names)

In [17]:
df_segmentation_scaled

Unnamed: 0,item_price,item_count_month,month
0,-0.473519,-0.151604,-1.743055
1,-0.473519,-0.151604,-1.743055
2,-0.473519,-0.151604,-1.743055
3,-0.433175,-0.151604,-1.743055
4,-0.392832,0.036629,1.140187
...,...,...,...
9224,-0.259121,-0.137125,-1.743055
9225,-0.271800,0.442056,-0.694603
9226,-0.073539,0.268302,-0.956716
9227,-0.403206,0.022150,1.140187
