### `Data Object`
This notebook is used for authoring a dataset class that can be used for loading, partitioning and returning data useful for inference.

# Setup

In [1]:
import numpy as np
import pandas as pd
import os

#  Class

In [56]:
# Check that OutlierDetection.ipynb is present in the current working directory
# (os.listdir() without an argument lists the current directory).
'OutlierDetection.ipynb' in os.listdir()

True

In [119]:
class PricingWizard:
    """
    Data extraction and partitioning class.

    Class for extracting Trendsales data for modeling in the Pricing Wizards exam project.
    The class should simplify the process of passing data to models.

    Attributes:
        data_folder (str): Directory the csv file is read from.
        filename (str): Name of the data csv file to be extracted, always ending in ``.csv``.
        seed (int): Seed for reproducibility.
        train_size (float): Portion of the dataset placed in the train split.
        test_size (float): Portion of the dataset placed in the test split.
        df (pd.DataFrame): The loaded dataset.

    Methods:
        __call__: Calling the object after initialization returns the partitioned dataset
            in X_train, X_test, y_train, y_test format.
    """

    # Name of the project directory that is expected to contain the ``data`` subfolder
    _PROJECT_DIR = 'pricing_wizards'

    def __init__(self,
                 *,
                 filename: str = "post_preprocessing_without_dummies.csv",
                 alternative_data_path: str = None,
                 train_size: float = .8,
                 test_size: float = .2,
                 ramdom_state: int = 42,
                 random_state: int = None) -> None:
        """
        Locate the data file, validate the split configuration and load the data.

        Args:
            filename (str, optional): Filename from the data folder to load; the ``.csv`` suffix is
                appended when missing. Defaults to post_preprocessing_without_dummies.csv.
            alternative_data_path (str, optional): Directory to read from instead of
                ``<...>/pricing_wizards/data/``. Defaults to None.
            train_size (float, optional): Portion of dataset to include in train split. When None it
                is taken as the complement of ``test_size``. Defaults to .8 or 80% of data.
            test_size (float, optional): Portion of dataset to include in test split. When None it
                is taken as the complement of ``train_size``. Defaults to .2 or 20% of data.
            ramdom_state (int, optional): Seed controlling the shuffling applied before the split.
                Misspelled keyword kept so existing callers keep working. Defaults to 42.
            random_state (int, optional): Correctly spelled alias for ``ramdom_state``; takes
                precedence when given. Defaults to None.

        Raises:
            AssertionError: If no alternative path is given and the working directory is not inside
                the pricing_wizards directory, if the file is not in the data folder, or if the
                split sizes are invalid.
        """

        # Data storage details
        if alternative_data_path:
            self.data_folder = alternative_data_path
        else:
            self.data_folder = self._default_data_folder()

        # Name of data file (suffix test, so names merely containing '.csv' still get the extension)
        self.filename = filename if filename.endswith('.csv') else filename + '.csv'

        # Asserting file exists in data folder
        assert self.filename in os.listdir(self.data_folder), f'File, {self.filename}, does not appear in data folder, {self.data_folder}. Please make sure the correct filename and data folder is specified. {os.listdir(self.data_folder)}'

        # Seed for reproducibility; the global NumPy seed is still set as before
        self.seed = ramdom_state if random_state is None else random_state
        np.random.seed(self.seed)

        # Splitting details: a missing size is inferred from the other one
        if train_size is None and test_size is None:
            train_size, test_size = .8, .2
        elif train_size is None:
            train_size = 1 - test_size
        elif test_size is None:
            test_size = 1 - train_size

        assert 0 <= train_size <= 1 and 0 <= test_size <= 1, "Split sizes must each lie between 0 and 1."
        # Tolerance instead of ==, since e.g. .7 + .3 is not exactly 1 in floating point
        assert abs(train_size + test_size - 1) < 1e-9, "Sum of split sizes must equal 1. Ensure passed size arguments is equal to 1"

        self.train_size = train_size
        self.test_size = test_size

        # Loading data
        self.__load__()

    @classmethod
    def _default_data_folder(cls) -> str:
        """Return ``<project root>/data/`` derived from the current working directory."""
        cwd = os.getcwd()
        parts = cwd.split(os.sep)

        # Control the working directory is correct
        assert cls._PROJECT_DIR in parts, f"This program can only be executed inside the pricing_wizards directory if no alternative data path is specified. You're currently in {os.getcwd()}. Please change directory into pricing wizards before you calling class or specify an alternative data path."

        # Keep everything up to and including the project directory. Matching on path
        # components also works when the working directory is the project root itself.
        project_root = os.sep.join(parts[:parts.index(cls._PROJECT_DIR) + 1])

        # Trailing empty component keeps the trailing separator the attribute has always had
        return os.path.join(project_root, 'data', '')

    def __load__(self):
        """Function for loading data from data folder into ``self.df``"""

        # os.path.join tolerates a data folder given with or without a trailing separator
        self.df = pd.read_csv(os.path.join(self.data_folder, self.filename))

    def __call__(self, target: str = 'listing_price'):
        """
        Function for returning partitioned dataset.

        Rows are shuffled with a generator seeded by ``self.seed``, so repeated calls return the
        same partition. The first ``train_size`` share of the shuffled rows forms the train split
        and the remainder forms the test split.

        Args:
            target (str, optional): Column used as the label; every other column is a feature.
                NOTE(review): the default is an assumption (the exploration cells inspect
                listing_price) - confirm which column the models should predict.

        Returns:
            tuple: X_train, X_test, y_train, y_test.

        Raises:
            KeyError: If ``target`` is not a column of the loaded data.
        """
        if target not in self.df.columns:
            raise KeyError(f'Target column, {target}, does not appear in the loaded data.')

        n_rows = len(self.df)

        # Dedicated generator: the split does not depend on the global RNG state at call time
        shuffled = np.random.default_rng(self.seed).permutation(n_rows)
        n_train = int(round(self.train_size * n_rows))
        train_rows, test_rows = shuffled[:n_train], shuffled[n_train:]

        features = self.df.drop(columns=[target])
        labels = self.df[target]

        return (features.iloc[train_rows],
                features.iloc[test_rows],
                labels.iloc[train_rows],
                labels.iloc[test_rows])

In [134]:
# Instantiate with the default arguments: reads post_preprocessing_without_dummies.csv
# into data.df. Without an alternative data path the class asserts that the working
# directory is inside the pricing_wizards directory.
data = PricingWizard()

In [129]:
# Summary statistics of the listing price, with extra upper-tail percentiles
upper_tail_percentiles = [0.25, 0.50, 0.75, 0.85, 0.90, 0.95, 0.99]
data.df["listing_price"].describe(percentiles=upper_tail_percentiles)

count    286570.000000
mean        453.937872
std         612.100921
min           1.000000
25%         150.000000
50%         299.000000
75%         539.750000
85%         800.000000
90%        1000.000000
95%        1395.000000
99%        2600.000000
max      100000.000000
Name: listing_price, dtype: float64

In [168]:
# Number of rows per value of subsubsubcategory_name, most frequent first
data.df["subsubsubcategory_name"].value_counts()

Sneakers                21841
Trousers                18887
Jackets                 18759
T-shirts                16537
Knitwear                13809
                        ...  
Racing bicycles             1
Programs & softwares        1
Garbage cans                1
Single beds                 1
Sun care                    1
Name: subsubsubcategory_name, Length: 390, dtype: int64

In [130]:
# Keep rows priced strictly below 2600, the 99th percentile reported by describe() above
is_below_p99 = data.df["listing_price"] < 2600
data.df.loc[is_below_p99]

Unnamed: 0,classified_id,listed_at_date,user_id,classified_price,listing_price,favourites,viewed_count,brand_name,condition_name,color_name,category_name,subcategory_name,subsubcategory_name,subsubsubcategory_name,classified_price_standardized,viewed_count_standardized,favourites_standardized,classified_price_normalized,viewed_count_normalized,favourites_normalized
0,30343099,2023-09-06,2425635,900,1299,10,145,Air Jordan,Almost as new,Black,Men,Men,Men,Sneakers,0.933785,0.706555,0.183100,0.118236,0.013349,0.032573
1,30346312,2023-09-06,144602,225,350,12,119,Ganni,Almost as new,Multi,Women,Women,Clothes,Skirts,-0.370245,0.459288,0.362075,0.028056,0.010956,0.039088
2,30364278,2023-09-07,2028837,120,120,38,209,One Vintage,Good but used,Multi,Women,Women,Women,Women,-0.573094,1.315213,2.688753,0.014028,0.019241,0.123779
3,30406315,2023-09-10,1953400,450,450,5,41,Ralph Lauren,"New, still with price",Navy,Men,Men,Clothes,T-shirts,0.064432,-0.282514,-0.264338,0.058116,0.003775,0.016287
4,30420441,2023-09-11,2202926,500,600,14,208,Air Jordan,Never used,Beige,Men,Men,Men,Sneakers,0.161027,1.305703,0.541050,0.064796,0.019149,0.045603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286565,31844957,2023-11-22,1290926,95,190,4,23,Yves Saint Laurent,Never used,White,Women,Women,Beauty & care,Other,-0.621392,-0.453699,-0.353826,0.010688,0.002117,0.013029
286566,31829356,2023-11-21,92133,720,800,3,25,Dr. Martens,Never used,Black,Women,Women,Women,Boots,0.586044,-0.434678,-0.443314,0.094188,0.002302,0.009772
286567,31876456,2023-11-24,2613917,270,270,10,89,Bitte Kai Rand,Almost as new,Black,Women,Women,Clothes,Dresses,-0.283310,0.173980,0.183100,0.034068,0.008194,0.032573
286568,31890162,2023-11-25,1805786,250,250,4,44,Vintage,Good but used,Red,Women,Women,Clothes,Sweatshirts,-0.321948,-0.253983,-0.353826,0.031396,0.004051,0.013029
