In [1]:
# Example input table
# Stand-in for the table that KNIME passes to a Python node as `input_table_1`.
# NOTE(review): the cells below read columns such as 'local_full_path_recent',
# 'price_in_pln' or 'local_file_size_recent', which this placeholder does not contain,
# so a table with the expected columns has to be supplied before running them.
import pandas as pd
input_table_1 = pd.DataFrame({"Example": ["Example"]})

## Check if 2 files are the same using their checksum

In [None]:
"""     
Script used to check whether the workflow which loads files to the database is about to load the same file again.
It compares the checksum of the new file which is going to be loaded to the database with the checksum of the previous one.
If the checksums are the same, the file which should be loaded to the database is the same as the previous one,
which is already in the database, therefore it shouldn't be loaded.
"""

import pandas as pd
import hashlib

# Files are hashed piece by piece so that a large file is never held in memory as a whole
CHUNK_SIZE = 1024 * 1024  # bytes read per iteration


def file_md5(path, chunk_size=CHUNK_SIZE):
    """Return the MD5 checksum of the file at `path` as a hexadecimal string.

    The file is opened in binary mode and fed to the hash in chunks of
    `chunk_size` bytes; the digest is the same as hashing the whole content at once.
    """
    digest = hashlib.md5()  # md5 algorithm, used here only to detect identical content
    with open(path, "rb") as f:
        # f.read() returns b"" at end of file, which stops the iteration
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()  # output returned in hexadecimal digits


# input_table_1 - table passed in the workflow to the Python node (KNIME)
# Saving paths from table to variables; the first row is taken by position,
# so the lookup does not depend on how the rows are labelled
file_path_recent = input_table_1['local_full_path_recent'].iloc[0]
file_path_previous = input_table_1['local_full_path_previous'].iloc[0]
path_list = [file_path_recent, file_path_previous]

# Getting checksum for every file from the list
checksum_list = [file_md5(path) for path in path_list]

# Kept as a table of path/checksum pairs for inspection
df_checksum_check = pd.DataFrame({"Path": path_list, "Checksum": checksum_list})

# True when the two files differ, False when their content is identical
checksum_check = [checksum_list[0] != checksum_list[1]]

# Output needed to be returned in DataFrame -> script was used in Knime which is based on tabular data
output_table_1 = pd.DataFrame({"Different_checksum": checksum_check})


## Text encoding conversion

In [None]:
"""
Script used to correct the broken encoding of German letters in manufacturers' names.
Correction from windows-1250 encoding to standard utf-8.
Data was taken from an Excel file with a list of broken manufacturer names.
"""

import pandas as pd

excel_path = ""
cnx = "Database connection"


def fix_windows_1250_mojibake(name):
    """Return `name` re-encoded to windows-1250 bytes and decoded as UTF-8.

    Characters that windows-1250 cannot represent are dropped before decoding.
    Values that are not strings (e.g. empty Excel cells) and names whose bytes
    are not valid UTF-8 after re-encoding are returned unchanged instead of
    raising, so one bad row does not abort the whole correction.
    """
    if not isinstance(name, str):
        return name
    try:
        return name.encode("windows-1250", errors='ignore').decode('utf-8')
    except UnicodeDecodeError:
        return name


def sql_string_literal(value):
    """Return `value` upper-cased and wrapped in single quotes for use in a SQL query.

    Single quotes inside the value are doubled so that a name containing an
    apostrophe cannot terminate the literal early.
    """
    return "'" + str(value).upper().replace("'", "''") + "'"


df_german_1250 = pd.read_excel(excel_path, sheet_name="german encoding")

# Correction of broken encoding of manufacturers names - from windows-1250 to utf-8
df_german_1250['name_in_utf8'] = df_german_1250['name'].map(fix_windows_1250_mojibake)

# Missing names are left out of the list so they do not end up in the query as text
list_of_german_manuf = df_german_1250['name_in_utf8'].dropna().to_list()

# german manufacturers list for sql query; NULL keeps the IN clause valid (and matching nothing) when the list is empty
list_of_german_manuf_query = ", ".join(sql_string_literal(x) for x in list_of_german_manuf) or "NULL"

man_query = f""" select id,
                name,
                full_name
                from postgres_database.manufacturer
                where upper(full_name) in ({list_of_german_manuf_query})
"""
df_manufacturers_ger = pd.read_sql_query(man_query, cnx)

# Upper-cased keys on both sides make the join case-insensitive; .str.upper() leaves missing values missing
df_german_1250['name_in_utf8_upper'] = df_german_1250['name_in_utf8'].str.upper()
df_manufacturers_ger['full_name_upper'] = df_manufacturers_ger['full_name'].str.upper()

# Left join keeps every name from the Excel list, matched or not
df_ger_man_final = pd.merge(df_german_1250, df_manufacturers_ger, left_on='name_in_utf8_upper', right_on='full_name_upper', how='left')

# Test string showing how decoding works in following script
test_broken_decoding_ger = "DĂ–RR"
print(test_broken_decoding_ger.encode("windows-1250").decode('utf-8'))

# Appending DataFrame with result to excel file
with pd.ExcelWriter(excel_path, engine="openpyxl", mode="a") as writer:
    df_ger_man_final.to_excel(writer, index=False, sheet_name='german_check')

## Price check in loaded file

In [None]:
"""
Script used to check prices in a file which is going to be loaded to the database.
It checks whether each price in the currently loaded file is higher/lower than the max/min price for the product.
The max/min price for each product is calculated in a different workflow and stored in the database.
If the price of a product in the currently loaded file is more than 30% above the max price or
more than 30% below the min price, the product is suspicious. If 30% or more of the prices in the file
are "suspicious", the file should not be loaded and should be checked manually.
"""

import pandas as pd

# Copy input to output -> KNIME Python node 
# input_table_1 - table passed in the workflow to the Python node (KNIME)
# Note: output_table_1 is reassigned further down with the result of suspicious_prices_check.
output_table_1 = input_table_1.copy()

# If the input table is empty, the table with max/min prices for products has no information for the given data source.
# Therefore the checked file cannot be compared with the max/min price table -> it receives status: pass

def suspicious_prices_check(input_table_1: pd.DataFrame,
                            price_tolerance: float = 0.3,
                            max_suspicious_percent: float = 30) -> pd.DataFrame:
    """Decide whether a file passes the price check against stored min/max prices.

    A row is suspicious when its 'price_in_pln' is below 'min_price' reduced by
    `price_tolerance`, or above 'max_price' increased by `price_tolerance`.
    The file passes when the share of suspicious rows is lower than
    `max_suspicious_percent`. The input table is not modified.

    Args:
        input_table_1: table with the columns 'price_in_pln', 'min_price' and
            'max_price', one row per checked price.
        price_tolerance: allowed relative deviation from min/max price (0.3 = 30%).
        max_suspicious_percent: percentage of suspicious rows at which the file fails.

    Returns:
        One-row DataFrame with the columns 'file_passed_price',
        'percentage_of_suspicious_products', 'number_of_suspicious_products'
        and 'number_of_all_products'. An empty input yields a passing row with zeros.
    """
    # Nothing to compare against -> the file receives status: pass
    if input_table_1.empty:
        return pd.DataFrame({'file_passed_price': [True],
                            'percentage_of_suspicious_products': [0],
                            'number_of_suspicious_products': [0],
                            'number_of_all_products': [0]})

    price = input_table_1['price_in_pln']
    min_price = input_table_1['min_price']
    max_price = input_table_1['max_price']

    # Column-wise comparisons; rows with a missing price or bound compare as False (not suspicious)
    too_low = price < min_price - (min_price * price_tolerance)
    too_high = price > max_price + (max_price * price_tolerance)
    suspicious = too_low | too_high

    false_qty = int(suspicious.sum())
    number_of_prices = input_table_1.shape[0]

    # Check if ratio of false (suspicious) quantities is lower than the allowed share
    suspicious_percent = false_qty / number_of_prices * 100
    passed = suspicious_percent < max_suspicious_percent
    false_percent = round(suspicious_percent, 2)

    return pd.DataFrame({'file_passed_price': [passed],
                        'percentage_of_suspicious_products': [false_percent],
                        'number_of_suspicious_products': [false_qty],
                        'number_of_all_products': [number_of_prices]})

# Result table handed back to the workflow; replaces the copy of the input assigned earlier
output_table_1 = suspicious_prices_check(input_table_1)

## File size checker

In [None]:
"""
Simple script which checks if the recently loaded file is bigger or smaller (beyond an allowed percentage) than, or exactly the same size as, the previously loaded file.
"""
import pandas as pd

# Copy input to output -> KNIME Python node 
# input_table_1 - table passed in the workflow to the Python node (KNIME)
output_table_1 = input_table_1.copy()

# Values are taken from the first row by position, so the lookup does not depend on the row labels
recent_file_size = int(input_table_1['local_file_size_recent'].iloc[0])
previous_file_size = int(input_table_1['local_file_size_previous'].iloc[0])
# Plain cell value of the first row; rendering the whole column to text could
# add formatting (or further rows) that would never match the expected source types
source_type = str(input_table_1['source_type'].iloc[0])


def file_size_checker(recent_file_size: int, previous_file_size: int, source_type: str,
                      inhouse_tolerance: float = 0.2, default_tolerance: float = 0.4) -> pd.DataFrame:
    """Compare the size of the recent file with the size of the previous one.

    The file fails when it is larger or smaller than the previous file by more
    than the tolerance, or when both sizes are exactly equal; otherwise it passes.

    Args:
        recent_file_size: size of the recently loaded file.
        previous_file_size: size of the previously loaded file.
        source_type: "Inhouse supplier" and "Inhouse manufacturer" use
            `inhouse_tolerance`, any other value uses `default_tolerance`.
            Whitespace around the value is ignored.
        inhouse_tolerance: allowed relative size change for inhouse sources (0.2 = 20%).
        default_tolerance: allowed relative size change for all other sources.

    Returns:
        One-row DataFrame with the columns 'file_passed_size', 'size_check_reason',
        'recent_file_size', 'previous_file_size' and 'percentage_recent_to_previous'.
        The percentage is NaN when `previous_file_size` is 0, as the ratio is undefined.
    """
    # Different percentage ratio for inhouse suppliers/manufacturers
    if str(source_type).strip() in ("Inhouse supplier", "Inhouse manufacturer"):
        percentage = inhouse_tolerance
    else:
        percentage = default_tolerance

    allowed_change = previous_file_size * percentage

    if recent_file_size > previous_file_size + allowed_change:
        file_pass = False
        reason = "File bigger than previous one"
    elif recent_file_size < previous_file_size - allowed_change:
        file_pass = False
        reason = "File smaller than previous one"
    elif recent_file_size == previous_file_size:
        # An identical size is treated as a failure as well
        file_pass = False
        reason = "File size is the same as previous one"
    else:
        file_pass = True
        reason = "File is ok"

    # Guard against division by zero when the previous file was empty
    if previous_file_size == 0:
        file_percentage = float("nan")
    else:
        file_percentage = round(recent_file_size / previous_file_size * 100, 2)

    return pd.DataFrame({"file_passed_size": [file_pass],
                         "size_check_reason": [reason],
                         'recent_file_size': [recent_file_size],
                         'previous_file_size': [previous_file_size],
                         'percentage_recent_to_previous': [file_percentage]})

# Result table handed back to the workflow; replaces the copy of the input assigned earlier
output_table_1 = file_size_checker(recent_file_size, previous_file_size, source_type)