### Setup

In [1]:
# Load helpers and custom dataset class
from __init__ import (PricingWizardDataset, 
                      regression_accuracy, 
                      threshold_accuracy, 
                      ridge_regression_pipeline, 
                      base_regression_pipeline,
                      drop_helpers, 
                      save_model, 
                      train,
                      train_step,
                      test,
                      validation,
                      condition_encoding,
                      filter_rare_categories
                      )

# Data manipulation 
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import matplotlib as mpl
import datetime
import joblib
import os
try:
    from kmodes.kprototypes import KPrototypes
except:
    !pip install kmodes
    from kmodes.kprototypes import KPrototypes
import session_info
from sklearn.preprocessing import StandardScaler

In [2]:
# Instantiate the project dataset wrapper on the preprocessed CSV
data = PricingWizardDataset(filename='post_preprocessing_without_dummies.csv')

Dataset Loaded: post_preprocessing_without_dummies
	Number of Rows: 283055
	Number of Columns: 22
	Outlier Removal: True
	Train Size: 0.8
	Test Size: 0.2
	Random State: 42


## K-Prototypes

In [114]:
# Reset dataset, used during modelling and overwrites any previous changes
# NOTE(review): reset_dataset() is a project helper whose exact effect is not
# visible in this notebook - confirm it restores the originally loaded frame.
data.reset_dataset()

# Extract the dataframe as a copy, so the edits made in the following cells
# are applied to `df` and not to the frame held by `data`
df = data.df.copy()

In [115]:
# Two preprocessing steps applied back to back via project helpers:
#   1. filter_rare_categories on 'subsubsubcategory_name' / 'subsubcategory_name'
#   2. condition_encoding on the filtered frame
# (presumably: handle rare sub-sub-sub categories, then ordinally encode the
# condition column - see the helpers' own documentation)
df = condition_encoding(
    filter_rare_categories(df, 'subsubsubcategory_name', 'subsubcategory_name')
)

In [116]:
# Group brands with fewer than MIN_BRAND_LISTINGS listings into a single
# 'Other' bucket, so rare brands do not each become their own category.
MIN_BRAND_LISTINGS = 30

brand_counts = df['brand_name'].value_counts()
# `brands` ends up as the index of rare brand names (same as before)
brands = brand_counts[brand_counts < MIN_BRAND_LISTINGS].index

# Select rows with a boolean mask directly: no intermediate filtered frame is
# built, and the right rows are hit even if the index has duplicate labels.
df.loc[df['brand_name'].isin(brands), 'brand_name'] = 'Other'

In [117]:
# Extracting relevant columns for clustering.
# Column order matters: the clustering cell passes positions 0 and 1
# (brand_name, subsubsubcategory_name) as the categorical columns.
# .copy() makes this an independent frame, so adding the 'cluster' column to it
# later does not write into a slice of `df` (SettingWithCopyWarning).
cluster_data = df[['brand_name', 'subsubsubcategory_name', 'listing_price', 'viewed_count', 'condition_name']].copy()

In [118]:
# Numeric columns: select integer and float dtypes in one call, rather than
# adding two DataFrames just to read the union of their column labels.
num_col = cluster_data.select_dtypes(include=['integer', 'float']).columns.to_list()

cluster_data_norm = cluster_data.copy()

# Standardise the numeric columns (alt. MinMaxScaler, normalize).
# Whole-column assignment replaces the columns, so integer columns become float
# cleanly; `.loc[:, cols] = ...` would try to write floats into the existing
# integer columns in place.
cluster_data_norm[num_col] = StandardScaler().fit_transform(cluster_data[num_col])

In [109]:
# Initialize the model.
# random_state fixes the seed so cluster assignments are reproducible when the
# notebook is re-run (42 matches the random state reported by the dataset loader).
model = KPrototypes(n_clusters=4, init='Cao', n_jobs=-1, random_state=42)

# Fit and predict clusters for data.
# `categorical` takes column positions in the array: 0 = brand_name,
# 1 = subsubsubcategory_name; the remaining columns are treated as numeric.
clusters = model.fit_predict(cluster_data_norm.to_numpy(), categorical=[0, 1])

In [110]:
# Add the cluster labels to both frames. `clusters` is aligned by position with
# the rows that were fed to the model.
df['cluster'] = clusters
# assign() returns a new frame instead of writing into what may be a slice of
# `df`, which avoids pandas' SettingWithCopyWarning.
cluster_data = cluster_data.assign(cluster=clusters)

In [113]:
# Per-cluster profile: average price and views, the five most frequent
# brands / categories at each level, and the number of listings.
def _five_most_common(values):
    """Return the labels of the five most frequent entries in `values`."""
    return values.value_counts().index[0:5]


df.groupby('cluster').agg(
    mean_listing_price=('listing_price', 'mean'),
    mean_views=('viewed_count', 'mean'),
    top_5_brands=('brand_name', _five_most_common),
    sub_categories=('subsubcategory_name', _five_most_common),
    subsub_cateogries=('subsubsubcategory_name', _five_most_common),
    count=('classified_id', 'count'),
)

Unnamed: 0_level_0,mean_listing_price,mean_views,top_5_brands,sub_categories,subsub_cateogries,count
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,675.608636,263.05556,"[Stone Island, Carhartt, Stussy, Planet Nusa, ...","[Clothes, Women, Men, Accessories, Sportswear]","[Jackets, Knitwear, Trousers, Sneakers, T-shirts]",25360
1,296.584705,45.149059,"[Carhartt, Nike, Zara, Ganni, One Vintage]","[Clothes, Women, Men, Home, Accessories]","[Trousers, Sneakers, Jeans, Jackets, T-shirts]",171254
2,2377.58947,99.121328,"[Polo Ralph Lauren, Moncler, Louis Vuitton, Ai...","[Clothes, Women, Men, Electronics, Smartphones...","[Jackets, Sneakers, Shirts, Shoulder bags, Cro...",10484
3,442.001672,44.158577,"[Adidas, Zara, UGG, Stussy, Nike]","[Clothes, Women, Men, Accessories, Home]","[Sneakers, T-shirts, Trousers, Dresses, Tops]",75957
