In [1]:
import pandas as pd
import numpy as np

# Reading the data

In [2]:
# Load the raw transactions: the first 52,924 data rows of sheet 'in'.
filepath = '../Data/shopping_dataset.xlsx'
df = pd.read_excel(filepath, nrows=52924, sheet_name='in')
# Discard the sheet's first column, then keep only the fields used below.
df = df.iloc[:, 1:]
df = df[['CustomerID', 'Transaction_Date', 'Product_SKU', 'Product_Description']]

# Lookup table keyed by product description; it supplies the category columns.
df_ref = pd.read_csv('../Data/prod_categories.csv')

# Left join so every transaction is kept even when its description has no match
# in the lookup (such rows end up with a missing Predicted_Category).
# NOTE(review): if df_ref holds more than one row per Product_Description this
# join duplicates transactions -- confirm the lookup is unique on that key.
df = pd.merge(df, df_ref, how='left', on='Product_Description')
df = df.drop(columns=['Product_Category'])

# Turn each category label into a single space-free token: strip spaces and
# spell out '&' as 'and'. regex=False makes both replacements literal,
# independent of the pandas version's default for `regex`.
df['Predicted_Category'] = (
    df['Predicted_Category']
    .str.replace(' ', '', regex=False)
    .str.replace('&', 'and', regex=False)
)
df.head()

Unnamed: 0,CustomerID,Transaction_Date,Product_SKU,Product_Description,Predicted_Category
0,17850,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,homeandkitchen
1,17850,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,homeandkitchen
2,17850,2019-01-01,GGOENEBQ078999,Nest Cam Outdoor Security Camera - USA,homeandkitchen
3,17850,2019-01-01,GGOENEBQ079099,Nest Protect Smoke + CO White Battery Alarm-USA,homeandkitchen
4,17850,2019-01-01,GGOENEBJ079499,Nest Learning Thermostat 3rd Gen-USA - Stainle...,homeandkitchen


In [3]:
# Distinct category labels present after normalisation.
category_labels = df['Predicted_Category'].unique()
category_labels

array(['homeandkitchen', 'office', 'Tees', 'sportsandfitness',
       'HoodiesandJackets', 'KidsApparel', 'CapsandHats', 'onesie',
       'bagsandluggage', 'Drinkware', 'petsupplies', 'beautyandhealth',
       'carandmotorbike', 'GiftCards'], dtype=object)

# We remove categories that are very rarely bought, since they would act as noise for the clustering algorithm

In [4]:
# Category labels whose transactions are dropped before building features.
# ('petsupplies' was also a candidate but is currently kept.)
excluded_categories = [
    'GiftCards',
    'carandmotorbike',
    'onesie',
    'homeandkitchen',
    'Tees',
    'office',
]

# Keep only the rows whose label is not in the exclusion list.
is_excluded = df['Predicted_Category'].isin(excluded_categories)
df = df[~is_excluded]
df.head()

Unnamed: 0,CustomerID,Transaction_Date,Product_SKU,Product_Description,Predicted_Category
2172,17850,2019-01-01,GGOEGAPB058615,Google Women's Yoga Jacket Black,sportsandfitness
2174,17850,2019-01-01,GGOEYAFB073115,YouTube Men's Fleece Hoodie Black,HoodiesandJackets
2179,12583,2019-01-01,GGOEGATB060415,Google Women's Quilted Insulated Vest Black,HoodiesandJackets
2185,12583,2019-01-01,GGOEGAFJ036214,Google Men's Pullover Hoodie Grey,HoodiesandJackets
2186,12583,2019-01-01,GGOEGAYC068324,Google Youth Short Sleeve T-shirt Royal Blue,KidsApparel


# Combining the categories bought by each customer

In [5]:
# Build one "document" per customer: the space-separated list of category
# tokens across all of that customer's transactions (repeats are kept, so a
# category bought more often appears more often).
# Rows with a missing Predicted_Category are dropped first; otherwise
# ' '.join would raise TypeError on the non-string NaN value.
df_combined_categories = (
    df
    .dropna(subset=['Predicted_Category'])
    .groupby('CustomerID')
    ['Predicted_Category']
    .apply(' '.join)
    .reset_index()
)
df_combined_categories.head()

Unnamed: 0,CustomerID,Predicted_Category
0,12347,KidsApparel HoodiesandJackets bagsandluggage b...
1,12348,KidsApparel KidsApparel sportsandfitness Drink...
2,12350,CapsandHats CapsandHats sportsandfitness Capsa...
3,12356,sportsandfitness KidsApparel KidsApparel Hoodi...
4,12359,CapsandHats


# TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer created with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the per-customer category strings; the result has one row per customer.
tfidf_matrix = tfidf_vectorizer.fit_transform(df_combined_categories['Predicted_Category'])

# Densify into a labelled frame for readability:
# rows are keyed by CustomerID, columns are the vectorizer's feature names.
feature_names = tfidf_vectorizer.get_feature_names_out()
customer_ids = df_combined_categories['CustomerID']
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=customer_ids,
    columns=feature_names,
)
tfidf_df.head()

Unnamed: 0_level_0,bagsandluggage,beautyandhealth,capsandhats,drinkware,hoodiesandjackets,kidsapparel,petsupplies,sportsandfitness
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12347,0.568189,0.0,0.388631,0.463974,0.366188,0.215148,0.0,0.36121
12348,0.0,0.0,0.0,0.636778,0.0,0.590556,0.0,0.49574
12350,0.0,0.0,0.974051,0.0,0.0,0.0,0.0,0.226331
12356,0.203496,0.436919,0.0,0.498515,0.39345,0.46233,0.0,0.388101
12359,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Export the features

In [7]:
# Persist the TF-IDF feature frame (CustomerID index included) as gzip-compressed Parquet.
features_path = '../Data/features_tfidf_removed_categories.parquet.gz'
tfidf_df.to_parquet(features_path, compression='gzip')