In [16]:
import os
import pandas as pd
import polars as pl
import numpy as np
from pathlib import Path

In [17]:
# Resolve project paths from the directory the notebook kernel was started in.
projectdir = Path.cwd()
datadir = projectdir.parent / 'Dataset'
# NOTE(review): the CSV is read relative to the working directory, not from
# `datadir` -- confirm which location is intended.
test_data = pd.read_csv('test_data.csv', sep=';')

In [18]:
# Preview the first three rows of the loaded frame.
test_data.head(3)

Unnamed: 0,year,month,day,order,country,session ID,page 1 (main category),page 2 (clothing model),colour,location,model photography,price,price 2,page
0,2008,8,13,3,29,24023,1,A15,14,5,2,33,2,1
1,2008,8,13,4,29,24023,1,A16,1,6,1,33,2,1
2,2008,8,13,5,29,24023,4,P11,4,4,2,38,1,1


In [19]:
# Summarise each column's dtype and non-null count.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   year                     10 non-null     int64 
 1   month                    10 non-null     int64 
 2   day                      10 non-null     int64 
 3   order                    10 non-null     int64 
 4   country                  10 non-null     int64 
 5   session ID               10 non-null     int64 
 6   page 1 (main category)   10 non-null     int64 
 7   page 2 (clothing model)  10 non-null     object
 8   colour                   10 non-null     int64 
 9   location                 10 non-null     int64 
 10  model photography        10 non-null     int64 
 11  price                    10 non-null     int64 
 12  price 2                  10 non-null     int64 
 13  page                     10 non-null     int64 
dtypes: int64(13), object(1)
memory usage: 1.2+ KB

In [20]:
# Give the raw columns shorter, hyphenated names.
# The result is rebound to `test_data` instead of using `inplace=True`, so the
# rename is an explicit assignment rather than a hidden mutation of the frame.
# Keys that are not present are ignored by `rename`, so re-running this cell
# after the columns were already renamed is harmless.
test_data = test_data.rename(columns={
    'session ID': 'session-id',
    'price 2': 'price-higher-than-category',
    'page 1 (main category)': 'category',
    'page 2 (clothing model)': 'product-code',
    'model photography': 'model-photography',
    'page': 'page-no'
})

In [21]:
# Recode the flag to 0/1: a value of 1 stays 1, every other value
# (2 in the sample rows shown earlier) becomes 0.
test_data['price-higher-than-category'] = (
    test_data['price-higher-than-category'].eq(1).astype('int64')
)

In [22]:
class LabelEncoder:
    """Map hashable labels to consecutive integer codes and back.

    Codes are handed out in order of first appearance, starting at 0.
    Calling ``fit`` again with additional labels extends the existing
    mapping; codes that were already assigned never change.
    """

    def __init__(self) -> None:
        # Forward (label -> code) and reverse (code -> label) lookups;
        # ``fit`` keeps the two dictionaries in sync.
        self.str2idx = {}
        self.idx2str = {}

    def __str__(self) -> str:
        return f'Number of encoded strings: {len(self.str2idx)}'

    def __repr__(self) -> str:
        return self.__str__()

    def fit(self, col: np.ndarray) -> None:
        """Register every label in ``col`` that has not been seen before.

        Args:
            col: Any iterable of hashable labels (array, Series, list, ...).
        """
        for strItem in col:
            if strItem not in self.str2idx:
                # Number from the current mapping size rather than from 0 so a
                # second fit() cannot hand out a code that is already taken.
                code = len(self.str2idx)
                self.str2idx[strItem] = code
                self.idx2str[code] = strItem

    def transform(self, col: np.ndarray) -> np.ndarray:
        """Return the integer code of every label in ``col``.

        Args:
            col: Iterable of labels that were all registered by ``fit``.

        Returns:
            Integer array with one code per input label (empty input gives
            an empty integer array).

        Raises:
            KeyError: If a label was never passed to ``fit``.
        """
        try:
            # dtype=int matches what numpy infers for non-empty input and keeps
            # the empty case integer-typed instead of defaulting to float64.
            return np.array([self.str2idx[ele] for ele in col], dtype=int)
        except KeyError as missing:
            raise KeyError(
                f'label {missing.args[0]!r} was not seen during fit'
            ) from None

    def inverse_transform(self, col: np.ndarray) -> np.ndarray:
        """Return the original label for every integer code in ``col``.

        Raises:
            KeyError: If a code was never assigned by ``fit``.
        """
        return np.array([self.idx2str[ele] for ele in col])

    def fit_transform(self, col: np.ndarray) -> np.ndarray:
        """Register the labels in ``col`` and return their integer codes."""
        self.fit(col)
        return self.transform(col)

In [23]:
# Display the full frame after the column renames and the 0/1 recoding above.
test_data

Unnamed: 0,year,month,day,order,country,session-id,category,product-code,colour,location,model-photography,price,price-higher-than-category,page-no
0,2008,8,13,3,29,24023,1,A15,14,5,2,33,0,1
1,2008,8,13,4,29,24023,1,A16,1,6,1,33,0,1
2,2008,8,13,5,29,24023,4,P11,4,4,2,38,1,1
3,2008,8,13,6,29,24023,4,P18,2,6,1,28,0,1
4,2008,8,13,7,29,24023,4,P13,4,5,1,38,1,1
5,2008,8,13,1,29,24024,2,B10,2,4,1,67,1,1
6,2008,8,13,1,9,24025,1,A11,3,4,1,62,1,1
7,2008,8,13,1,34,24026,1,A2,3,1,1,43,0,1
8,2008,8,13,2,34,24026,3,C2,12,1,1,43,1,1
9,2008,8,13,3,34,24026,2,B2,3,1,2,57,1,1


In [24]:
# Integer-encode the product codes; the fitted encoder is kept so the codes
# can be mapped back to the original strings later.
product_encoder = LabelEncoder()
product_encoder.fit(test_data['product-code'])
test_data['product-code-en'] = product_encoder.transform(test_data['product-code'])

In [25]:
def price_bucketing(price: float):
    """Assign a price to one of four named buckets.

    Buckets are contiguous and non-overlapping:

    * ``'budget'``  -- 0 <= price < 26
    * ``'value'``   -- 26 <= price < 36
    * ``'average'`` -- 36 <= price < 66
    * ``'premium'`` -- 66 <= price <= 100

    Integer prices get the same bucket as the earlier ``range``-based
    version; non-integral prices (e.g. 33.5) are now bucketed as well
    instead of silently falling through to ``None``.

    Args:
        price: The price to classify.

    Returns:
        The bucket name, or ``None`` when the price is outside 0..100,
        is NaN, or is not comparable with numbers.
    """
    try:
        # NaN fails every comparison, so it is rejected here too.
        if not 0 <= price <= 100:
            return None
    except TypeError:
        # Non-numeric input (e.g. None or a string) has no bucket; returning
        # None mirrors what the range-membership test used to yield.
        return None

    if price < 26:
        return 'budget'
    if price < 36:
        return 'value'
    if price < 66:
        return 'average'
    return 'premium'

# Label every row with the bucket that price_bucketing assigns to its price.
test_data['price-category'] = test_data['price'].apply(price_bucketing)

In [26]:
# price_category_encoder = LabelEncoder()
# test_data['price-category_en'] = price_category_encoder.fit_transform(test_data['price-category'])

In [27]:
# Display the frame with the added 'product-code-en' and 'price-category' columns.
test_data

Unnamed: 0,year,month,day,order,country,session-id,category,product-code,colour,location,model-photography,price,price-higher-than-category,page-no,product-code-en,price-category
0,2008,8,13,3,29,24023,1,A15,14,5,2,33,0,1,0,value
1,2008,8,13,4,29,24023,1,A16,1,6,1,33,0,1,1,value
2,2008,8,13,5,29,24023,4,P11,4,4,2,38,1,1,2,average
3,2008,8,13,6,29,24023,4,P18,2,6,1,28,0,1,3,value
4,2008,8,13,7,29,24023,4,P13,4,5,1,38,1,1,4,average
5,2008,8,13,1,29,24024,2,B10,2,4,1,67,1,1,5,premium
6,2008,8,13,1,9,24025,1,A11,3,4,1,62,1,1,6,average
7,2008,8,13,1,34,24026,1,A2,3,1,1,43,0,1,7,average
8,2008,8,13,2,34,24026,3,C2,12,1,1,43,1,1,8,average
9,2008,8,13,3,34,24026,2,B2,3,1,2,57,1,1,9,average


In [32]:
# Build the cleaned frame by dropping the listed columns; `test_data` itself
# is left unchanged because `drop` returns a new frame.
cleaned_test_data = test_data.drop(
    columns=[
        'year',
        'product-code',
        'price-higher-than-category',
        'session-id',
        'day',
        'order',
        'country',
        'page-no',
    ]
)

In [33]:
# Display the cleaned frame that remains after the column drop.
cleaned_test_data

Unnamed: 0,month,category,colour,location,model-photography,price,product-code-en,price-category
0,8,1,14,5,2,33,0,value
1,8,1,1,6,1,33,1,value
2,8,4,4,4,2,38,2,average
3,8,4,2,6,1,28,3,value
4,8,4,4,5,1,38,4,average
5,8,2,2,4,1,67,5,premium
6,8,1,3,4,1,62,6,average
7,8,1,3,1,1,43,7,average
8,8,3,12,1,1,43,8,average
9,8,2,3,1,2,57,9,average
