Made by Pieter de Jong

In [None]:
import pandas as pd
from collections import Counter
import re
from difflib import ndiff, get_close_matches
import textdistance
import string
import json
import io
import os
import panel as pn
# Initialise Panel for this notebook session; presumably needed before the
# CheckBoxGroup widget used further down can render - confirm against the Panel docs.
pn.extension()

In [None]:
def startupCheck(path="claim_matching.json"):
    '''
    Makes sure the claim matching file exists so it can be loaded later on.

    Prints "File found" when the file is present and readable. Otherwise
    prints "Creating file" and writes an empty JSON object to the file.

    Arguments:
    path      (str): location of the JSON file, defaults to
                     claim_matching.json in the working directory
    Returns:       : None
    '''
    if os.path.isfile(path) and os.access(path, os.R_OK):
        print("File found")
        return

    print("Creating file")
    # NOTE: a file that exists but is not readable also ends up here and is
    # overwritten, exactly as before.
    with open(path, "w", encoding="utf-8") as db_file:
        json.dump({}, db_file)

In [None]:
# Make sure claim_matching.json exists before a later cell opens it for reading.
startupCheck()

In [None]:
def load_file(path):
    '''
    Reads every sheet of one .xls workbook into a single dataframe.

    Only the columns Event Date, Product Name and Claims/Features are read.
    The rows of all sheets are stacked and the index is renumbered.

    Arguments:
    path      (str): path to .xls file
    Returns:       : Dataframe

    Author(s):
    Pieter de Jong
    '''
    wanted_columns = ["Event Date", "Product Name", "Claims/Features"]
    # sheet_name=None asks read_excel for all sheets instead of only the first one
    sheets = pd.read_excel(path, usecols=wanted_columns, sheet_name=None)
    return pd.concat(sheets, ignore_index=True)

In [None]:
# Load one workbook into df. The path is hard-coded to a folder on a local
# Windows machine and has to be changed when running elsewhere.
filename = "C:\\Users\\piete\\AppData\\Roaming\\MobaXterm\\slash\\RemoteFiles\\396834_2_0\\2020-2022_BAK Cakes&Sweet_Goods_WE1 (8300).xls"
df = load_file(filename)

In [None]:
#path = "/commons/dsls/fine_bakery/Data/"

In [None]:
def load_files(path):
    '''
    Reads in all .xls files inside the folder selected using the path
    Creates dataframe from the columns Event Date, Product Name, and
    Claims/Features of all files and all sheets within them

    Arguments:
    path      (str): path to folder containing .xls files, with or without
                     a trailing path separator
    Returns:       : Dataframe
    Raises:
    ValueError     : when the folder contains no .xls files

    Author(s):
    Pieter de Jong
    '''
    columns = ["Event Date", "Product Name", "Claims/Features"]
    # os.listdir gives no ordering guarantee; sorting keeps the row order reproducible.
    # ".xls" (with the dot) avoids picking up names that merely end in the letters xls.
    files_xls = sorted(name for name in os.listdir(path) if name.endswith(".xls"))
    if not files_xls:
        raise ValueError(f"No .xls files found in {path!r}")

    frames = []
    for excelfile in files_xls:
        # os.path.join works whether or not path ends with a separator
        sheets = pd.read_excel(os.path.join(path, excelfile), usecols=columns, sheet_name=None)
        frames.extend(sheets.values())

    return pd.concat(frames, ignore_index=True)

In [None]:
#path = "C:\\Users\\piete\\Desktop\\fine_bakery\\mokup\\"
#df = load_files(path)

In [None]:
def cleaning(df):
    '''
    Cleans the dataframe by removing rows with no claims.
    Adds the column claims_proccesed, which holds for every row the list of
    claims taken from Claims/Features: lowercased, commas turned into full
    stops, newlines turned into spaces, 's removed, and split on ". ".
    The dataframe passed in is left unchanged.

    Arguments:
    df             : Dataframe with a Claims/Features column
    Returns:       : Dataframe

    Author(s):
    Pieter de Jong
    '''
    # .copy() so the new column is written to an independent frame and pandas
    # has no reason to warn about assigning to a slice of the input.
    df = df.dropna(subset=["Claims/Features"]).copy()

    claims = df["Claims/Features"].str.lower()
    # regex=False: these are plain text replacements, independent of the
    # pandas version's default for the regex argument.
    claims = (
        claims.str.replace(",", ".", regex=False)
        .str.replace("\n", " ", regex=False)
        .str.replace("'s", "", regex=False)
    )
    # Raw string: a regex for a literal full stop followed by a space.
    df["claims_proccesed"] = claims.str.rstrip(".").str.split(r"\. ")

    return df

In [None]:
# Drop rows without claims and add the claims_proccesed column (a list of claims per row).
df = cleaning(df)

In [None]:
def find_pattern(pattern, string):
    '''
    Tells whether the regular expression pattern occurs anywhere in string

    Arguments:
    pattern        : String, regular expression to look for
    string         : String that is searched
    Returns:       : Bool, True when the pattern is found

    Author(s):
    Pieter de Jong
    '''
    match = re.search(pattern, string)
    return match is not None

In [None]:
def clean_nonclaims(df):
    '''
    Removes claims that contain patterns marking them non claims
    (nutrition values such as kcal, kj or gram amounts, and .org web
    addresses). Leading whitespace is stripped from the claims that are kept.
    The claims_proccesed column of df is overwritten with the filtered lists.

    Arguments:
    df             : Dataframe
    Returns:       : Dataframe

    Author(s):
    Pieter de Jong
    '''
    # The dot in "\.org" is escaped on purpose: an unescaped dot matches any
    # character, which threw away ordinary claims such as "certified organic".
    # Compiled once here instead of once per claim.
    non_claim = re.compile(r": \d|kcal|kj|\dg|\d g|\.org")

    all_prod_claims = []
    for claims in df["claims_proccesed"]:
        stripped = (claim.lstrip() for claim in claims)
        all_prod_claims.append([claim for claim in stripped if not non_claim.search(claim)])
    df["claims_proccesed"] = all_prod_claims
    return df

In [None]:
# Filter nutrition values and similar non-claims out of every product's claim list.
df = clean_nonclaims(df)

In [None]:
# Display the dataframe to inspect the result of the steps so far.
df

In [None]:
def advanced_space_split(df):
    '''
    Splits claims where the space after a full stop is missing, including
    when the next sentence starts with a number.
    Does not split abbreviations like h.u.v and ignores no.1
    A claim that is split is replaced, in place, by all of its non-empty
    parts. The lists that were in the column are not modified; the
    claims_proccesed column of df is overwritten with new lists.

    Arguments:
    df             : Dataframe
    Returns:       : Dataframe

    Author(s):
    Pieter de Jong
    '''
    missing_space = re.compile(r"\D\.\D")    # e.g. "gluten free.vegan"
    abbreviation = re.compile(r"\D\.\D\.")   # e.g. "h.u.v"
    before_number = re.compile(r"\D\.\d")    # e.g. "high in fibre.5 a day"
    number_one = re.compile(r"no\.1")        # "no.1" has to stay in one piece

    claims_cleaned = []
    for claims in df["claims_proccesed"]:
        # Build a new list rather than removing from and appending to the list
        # that is being iterated, which skipped claims and could raise
        # ValueError when both rules applied to the same claim.
        split_claims = []
        for claim in claims:
            glued_text = missing_space.search(claim) and not abbreviation.search(claim)
            glued_number = before_number.search(claim) and not number_one.search(claim)
            if glued_text or glued_number:
                # keep every part, not only the first two
                split_claims.extend(part for part in claim.split(".") if part)
            else:
                split_claims.append(claim)
        claims_cleaned.append(split_claims)
    df["claims_proccesed"] = claims_cleaned
    return df

In [None]:
# Split claims that were glued together because the space after a full stop was missing.
df = advanced_space_split(df)

In [None]:
def claim_counter(df):
    '''
    Create a list of all claims and a list of all unique claims.
    Both lists keep the order in which the claims are first encountered
    in the claims_proccesed column.

    Arguments:
    df             : Dataframe
    Returns:       : List of claims, List of unique claims

    Author(s):
    Pieter de Jong
    '''
    all_cleaned_claims = [claim for claims in df["claims_proccesed"] for claim in claims]
    # dict keys are unique and keep insertion order, which avoids searching a
    # growing list for every single claim.
    all_cleaned_unique_claims = list(dict.fromkeys(all_cleaned_claims))
    return all_cleaned_claims, all_cleaned_unique_claims

In [None]:
# Flatten the per-product claim lists into one list of all claims and one list of unique claims.
all_cleaned_claims, all_cleaned_unique_claims = claim_counter(df)

In [None]:
# Count how often each claim occurs and show the claims from most to least frequent.
Claim_ammount = Counter(all_cleaned_claims)
Claim_ammount.most_common()

In [None]:

# Find all unique claims that contain `pattern`. The claims left ticked in the
# checkbox widget are stored in claim_dict with the pattern as key (done in a later cell).
claim_dict = {}
pattern = "green dot certified"
def get_matches(pattern):
    '''
    Collects the claims from all_cleaned_unique_claims that contain pattern

    Arguments:
    pattern        : String, regular expression passed on to find_pattern
    Returns:       : List of matching claims, without duplicates, in the
                     order of all_cleaned_unique_claims
    '''
    matches = []
    for candidate in all_cleaned_unique_claims:
        if not find_pattern(pattern, candidate):
            continue
        if candidate not in matches:
            matches.append(candidate)
    return matches


# The commented-out lines sketch a free-text search box that is not in use.
#text_input = pn.widgets.TextInput(name="Claim search", placeholder="Enter claim here")
# Every match is both an option and part of the initial value; the widget's value is
# what a later cell stores, so it is read after the user has adjusted the selection.
checkbox_group = pn.widgets.CheckBoxGroup(name="Checkbox Group", value=get_matches(pattern), options=get_matches(pattern))
#column = pn.Column(text_input, checkbox_group)
#column
checkbox_group

In [None]:
# Remember the claims currently selected in the widget under the searched pattern.
claim_dict[pattern] = checkbox_group.value

filename = "claim_matching.json"

# Read the matches stored so far ...
with open(filename, "r+") as jsonfile:
    dic = json.load(jsonfile)

# ... add or overwrite the entries collected in this session ...
for key, selected_claims in claim_dict.items():
    dic[key] = selected_claims

# ... and write the merged result back to the same file.
with open(filename, "w") as jsonfile:
    json.dump(dic, jsonfile)


In [None]:
# Show which patterns currently have stored claim matches.
dic.keys()

In [None]:
# For every product, print whether at least one of the claims stored under "vegan" is present.
# NOTE: requires a "vegan" entry in claim_matching.json, otherwise the lookup raises KeyError.
for claims in df["claims_proccesed"]:
    print(any(known_claim in claims for known_claim in dic["vegan"]))
    

In [None]:
#get_close_matches("recyclable", [claim for claim in claims_no_ingredients], 10, 0.7)