In [10]:
import os # type: ignore
import pandas as pd # type: ignore
import logging # type: ignore
from pathlib import Path # type: ignore
from datetime import datetime # type: ignore
import numpy as np # type: ignore
import random # type: ignore

In [11]:
#########################################################################################
# Set any display options and default values                                            #
#########################################################################################
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
BASEDIR = 'C:\\Users\\simon\\Documents\\py_projects\\retailnlp'

In [12]:
#########################################################################################
# Linked set of Helper functions to scramble copied data to mimic different data in     #
# 'to be matched' system                                                                #
#########################################################################################
#---------------------------------------------------------------------------------------#
# Parse each col to be scrambled, joining the result back into the original col         #
#---------------------------------------------------------------------------------------#
def helper_scramble_all_letters(df, change_cols, first_n_letters):
    for col in change_cols:
        df[col] = [' '.join(helper_scramble_words(words, first_n_letters)) for words in df[col].str.split()]
        
    #-----------------------------------------------------------------------------------#
    # Return df                                                                         #
    #-----------------------------------------------------------------------------------#
    return df

#---------------------------------------------------------------------------------------#
# Split each col into it's individual words and pass to scrambler                       #
#---------------------------------------------------------------------------------------#
def helper_scramble_words(words, first_n_letters):
    # pass each word to the scrambler
    words = [helper_scramble_word_letters(word, first_n_letters) for word in words]
    return words
#---------------------------------------------------------------------------------------#
# Scramble the first few letters of the word                                            #
#---------------------------------------------------------------------------------------#
def helper_scramble_word_letters(word, first_n_letters):
    # scramble the first n letters of the word
    foo = list(word[:first_n_letters])
    random.shuffle(foo)
    return ''.join(foo) + word[first_n_letters:]

In [13]:
# Creates full path and name for an output file - default assumes a log file and will auto create target folder if it doesn't exist
class Filename:
    def __init__(self, module_name='retailnlp', typeofile='log', suffix='.log', folder_name='logs', sep='_', term=''):
        self.module_name = module_name
        self.typeofile = typeofile
        self.suffix = suffix
        self.folder_name = folder_name
        self.sep = sep
        self.term = term
        self.basedir = BASEDIR
        self.filepath = os.path.join(BASEDIR, self.folder_name)
        self.filename = self.module_name + self.sep + self.typeofile + self.sep + self.term + self.sep + self.get_timestamp()
        self.filepathandname = Path(self.filepath, self.filename).with_suffix(self.suffix)
        self.make_dir()
    
    def __str__(self):
        return f'filepathandname:{str(self.filepathandname)}'
    
    def make_dir(self):
        this_filepath = os.path.join(BASEDIR, self.filepath)
        try:
            os.mkdir(this_filepath)
        except FileExistsError:
            pass

    def get_timestamp(self):
        today = datetime.today().strftime("%Y-%m-%d")
        now = datetime.now().strftime("%H-%M-%S")
        return today + '_' + now

# Creates and configures a log
class Log:
    def __init__(self, module_name='retailnlp', term='', level='INFO'):
        self.module_name = module_name
        self.log = logging.getLogger(module_name)
        self.term = term
        self.level = level
        self.log.setLevel(self.level)
        self.log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        self.date_format = '%Y-%m-%d %H:%M:%S'
        self.formatter = logging.Formatter(self.log_format, self.date_format)
        self.log_write_mode = 'w+'
        
    @property
    def log_filepathandname(self):
        f = Filename(self.module_name, term=self.term)
        return f.filepathandname
    
    def log_addfh(self):
        self.file_handler = logging.FileHandler(self.log_filepathandname, self.log_write_mode)
        self.file_handler.setFormatter(self.formatter)
        self.log.addHandler(self.file_handler)
    
    def log_addch(self):
        self.console_handler = logging.StreamHandler()
        self.console_handler.setFormatter(self.formatter)
        self.log.addHandler(self.console_handler)

    def log_remove_handlers(self):
        self.log.info('log_remove_handlers: Removing all existing log handlers')
        # get all loggers
        loggers = [logging.getLogger(name) if 'retail' in name else None for name in logging.root.manager.loggerDict]
        # for each valid logger remove all handlers
        for log in loggers:
            if log != None:
                while bool(len(log.handlers)):
                    for handler in log.handlers:
                        print('removing handler!')
                        log.removeHandler(handler)

#Load data from a csv file - create a df
class LoadCSV:
    def __init__(self, projectname, filename, seperator):
        self.project_name = projectname
        self.filename = filename
        self.seperator = seperator
    
    @property
    def filepath(self):
        return os.path.join(BASEDIR, self.project_name, 'data', self.filename)
    
    @property
    def df(self):
        df = pd.read_csv(self.filepath, self.seperator, header=0, encoding='latin1', engine='python')
        df.columns = df.columns.str.strip().str.upper().str.replace(r'\s+', '_').str.replace('-', '_').str.replace('(', '').str.replace(')', '')
        return df

In [14]:
class Invoice:
    def __init__(self, description, scrambled_description, unit_price):
        self.description = description
        self.scrambled_description = scrambled_description
        self.unit_price = unit_price
    
    def __str__(self):
        return f'description={self.description}:scrambled_description={self.scrambled_description}:unit_price={self.unit_price}'
    
    def to_dict(self):
        return {
            'DESCRIPTION': self.description
            ,'SCRAMBLED_DESCRIPTION': self.scrambled_description
            ,'UNIT_PRICE': self.unit_price
        }
    
    def to_df(self, list_of_items):
        return pd.DataFrame.from_records([i.to_dict for i in list_of_items])

class Product:
    def __init__(self, description, web_selling_price, cost_price):
        self.description = description
        self.web_selling_price = web_selling_price
        self.cost_price = cost_price

    def __str__(self):
        return f'description{self.description}:web_selling_price{self.web_selling_price}:cost_price{self.cost_price}'

    def to_dict(self):
        return {
            'DESCRIPTION': self.cost_price
            ,'WEB_SELLING_PRICE': self.web_selling_price
            ,'UNIT_PRICE': self.unit_price
        }

def to_df(list_of_items):
    return pd.DataFrame.from_records([i.to_dict for i in list_of_items])

def create_invoices(df_products, net_from_perc=0.85, net_to_perc=0.95):
    # Start with the products dataframe and delete all range priced products
    df_non_range_products = df_products.drop(df_products[df_products['FROM_PRICE'] != df_products['TO_PRICE']].index)

     # Select required columns from product data
    df_invoice_data = df_non_range_products[['DESCRIPTION']].copy()
    df_invoice_data['SCRAMBLED_DESCRIPTION'] = df_invoice_data['DESCRIPTION']

    # Create scrambled description column
    df_invoice_data = helper_scramble_all_letters(df_invoice_data, ['SCRAMBLED_DESCRIPTION'], 5)

    # Create / mimic invoice price by discounting selling price
    df_invoice_data['UNIT_PRICE'] = round(df_non_range_products['FROM_PRICE'] * np.random.randint(85, 95, len(df_non_range_products))/100, 2)
    list_of_invoices = [Invoice(row['DESCRIPTION'], row['SCRAMBLED_DESCRIPTION'], row['UNIT_PRICE']) for index, row in df_invoice_data.iterrows()]
    return list_of_invoices
    


In [15]:
# sub_log = Log(module_name=main_log.module_name + '.sub', term='main.sub')
main_log = Log(term='main')
main_log.log_addfh()
main_log.log_addch()
main_log.log.info('################ starting ################')
# Load Product data
product_csv = LoadCSV('retailnlp', 'products.csv', '\t')
df_products = product_csv.df
# Generte Invoice data
list_of_invoices = create_invoices(df_products)
# df_invoices = pd.DataFrame.from_records([i.to_dict() for i in list_of_invoices])
df_invoices = Invoice().to_df
# Close
main_log.log_remove_handlers()

2020-10-29 10:27:55 - retailnlp - INFO - ################ starting ################


TypeError: __init__() missing 3 required positional arguments: 'description', 'scrambled_description', and 'unit_price'

In [None]:
x = (np.random.randint(85,95, len(df_invoices))/100)
print(x, df_invoices.UNIT_PRICE, df_invoices.UNIT_PRICE * x)