In [179]:
import pandas as pd
import os
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from fuzzywuzzy import fuzz

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Function to check and download necessary NLTK resources
def download_nltk_resources():
    """Make sure the NLTK data packages used by this notebook are installed.

    Each resource is looked up with ``nltk.data.find``; when the lookup
    raises ``LookupError`` the matching package is fetched with
    ``nltk.download``.
    """
    # (lookup path inside the NLTK data directory, package name to download)
    required = (
        ('tokenizers/punkt', 'punkt'),
        ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
        ('corpora/stopwords', 'stopwords'),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

# Fetch any missing NLTK resources before the text-processing helpers run
download_nltk_resources()

# Directory the CSV inputs are read from: "<current working directory>/data"
PATH = os.getcwd() + '/data'

In [180]:

def load_data(file):
    """Read a CSV file from the data directory.

    Args:
        file: File name, joined onto the module-level ``PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(os.path.join(PATH, file))

# Clean the foreign keys 
def clean_str(df,cols):
    """Strip and upper-case string values of the given column(s), in place.

    Args:
        df: DataFrame to modify; the named columns are overwritten.
        cols: Either a list of column names or a single column name.
            * list: non-string values are converted with ``str(x)``.
            * str: non-string values are left unchanged.

    Returns:
        None. ``df`` is mutated.

    Raises:
        KeyError: If ``cols`` is neither a list nor a str.
    """
    if isinstance(cols,list):
        for col in cols:
            df[col] = df[col].map(lambda x: x.strip().upper() if isinstance(x, str) else str(x))
    elif isinstance(cols,str):
        # A single name is the column itself; index with ``cols`` directly
        # (there is no loop variable in this branch).
        df[cols] = df[cols].map(lambda x: x.strip().upper() if isinstance(x, str) else x)
    else:
        raise KeyError("cols must be a column name or a list of column names")


def create_dataframe(path1= 'categories.csv', path2 = 'brand_category.csv', path3 = 'offer_retailer.csv'):
    """Load the three input CSVs and drop the columns the pipeline ignores.

    Args:
        path1: Categories file; its ``CATEGORY_ID`` column is dropped.
        path2: Brand/category file; its ``RECEIPTS`` column is dropped.
        path3: Offer/retailer file; loaded unchanged.

    Returns:
        A list ``[product_cat, brand_cat, retailers]`` of DataFrames.
    """
    categories = load_data(path1).drop('CATEGORY_ID', axis=1)
    brand_categories = load_data(path2).drop('RECEIPTS', axis=1)
    offers = load_data(path3)

    return [categories, brand_categories, offers]


# set(stopwords.words('english'))
# Remove the brand, retailer and stop words from the offer string and clean things up
def clean_string(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    Args:
        text: Input string.

    Returns:
        A string of the same length in which digits, punctuation and other
        non-letter, non-whitespace characters have each become one space.
    """
    non_letter = re.compile(r'[^a-zA-Z\s]')
    return non_letter.sub(' ', text)

def text_prep(txt,stop_words):
    """Reduce an offer string to its nouns.

    The text is passed through ``clean_string``, tokenized, stripped of stop
    words and non-alphabetic tokens, POS-tagged, and only tokens tagged as
    nouns are kept.

    Args:
        txt: Raw text.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The surviving noun tokens joined by single spaces.
    """
    noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']

    kept = []
    for token in word_tokenize(clean_string(txt)):
        # Stop-word matching is case-insensitive; the token keeps its case
        if token.lower() in stop_words:
            continue
        if not token.isalpha():
            continue
        kept.append(token)

    return ' '.join(token for token, tag in pos_tag(kept) if tag in noun_tags)


def reshape_to_2d(series):
    """Turn a sequence of values into a single-row 2D NumPy array.

    Args:
        series: Anything ``np.array`` accepts.

    Returns:
        An array reshaped with ``(1, -1)``: one row, as many columns as needed.
    """
    return np.array(series).reshape(1, -1)


# Checkpoint used for both the tokenizer and the model
_BERT_CHECKPOINT = 'bert-base-uncased'
# Holds the loaded tokenizer/model so they are only loaded once per kernel
_bert_components = {}


def _load_bert():
    """Return the ``(tokenizer, model)`` pair, loading it on first use only."""
    if not _bert_components:
        _bert_components['tokenizer'] = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
        _bert_components['model'] = BertModel.from_pretrained(_BERT_CHECKPOINT)
    return _bert_components['tokenizer'], _bert_components['model']


def get_embedding(text):
    """Embed ``text`` with BERT by mean-pooling the last hidden state.

    The tokenizer and model are loaded once and reused across calls; this
    function is applied row by row, so reloading on each call would dominate
    the runtime.

    Args:
        text: String to embed.

    Returns:
        A NumPy array obtained by averaging ``last_hidden_state`` over the
        token dimension and squeezing out singleton dimensions.
    """
    tokenizer, model = _load_bert()

    # Truncate to 512 tokens so over-long inputs do not exceed the model's
    # position-embedding limit.
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**encoded_input)

    # Aggregate the embeddings - mean across all tokens
    embeddings = outputs.last_hidden_state.mean(dim=1)

    return embeddings.squeeze().numpy()


def input_prep_with_embedding(txt,stop_words):
    """Extract the nouns from ``txt`` and return their BERT embedding as one row.

    The noun extraction is delegated to ``text_prep`` so both code paths
    share one cleaning/tokenizing/tagging implementation.

    Args:
        txt: Raw text.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The result of ``get_embedding`` on the space-joined nouns, reshaped
        by ``reshape_to_2d`` into a single-row 2D array.
    """
    nouns = text_prep(txt, stop_words)
    return reshape_to_2d(get_embedding(nouns))


def dup_cat_clean_bert(row, sw):
    """Score how well a row's category matches its cleaned offer text.

    Args:
        row: Mapping/Series providing ``BRAND_BELONGS_TO_CATEGORY`` and
            ``OFFER_clean`` as strings.
        sw: Stop words forwarded to ``input_prep_with_embedding``.

    Returns:
        The cosine similarity between the two embeddings (element ``[0][0]``
        of the ``cosine_similarity`` result).
    """
    # Lower-case, turn '&' into a space, and collapse runs of whitespace
    category_text = ' '.join(row['BRAND_BELONGS_TO_CATEGORY'].lower().replace('&', " ").split())
    category_vec = input_prep_with_embedding(category_text, sw)

    offer_text = row['OFFER_clean'].lower()
    offer_vec = input_prep_with_embedding(offer_text, sw)

    similarity = cosine_similarity(category_vec, offer_vec)
    return similarity[0][0]


def join_str(row):
    """Build the space-separated training string for one row.

    Fields, in order: ``OFFER_clean``, ``BRAND``, ``RETAILER``,
    ``PRODUCT_CATEGORY``, ``IS_CHILD_CATEGORY_TO_compli``. Every field except
    ``BRAND`` becomes an empty string when it is missing or equals the
    literal ``'nan'``. ``BRAND`` is always stringified, with ``&`` replaced
    by a space and commas removed.

    Args:
        row: Mapping/Series providing the five keys above.

    Returns:
        The five fields joined by single spaces; empty fields keep their
        separators.
    """
    def _text_or_blank(value):
        # Missing values and the literal string 'nan' contribute an empty field
        if pd.notna(value) and value != 'nan':
            return str(value)
        return ''

    fields = [
        _text_or_blank(row['OFFER_clean']),
        str(row['BRAND']).replace('&', " ").replace(",", ""),
        _text_or_blank(row['RETAILER']),
        _text_or_blank(row['PRODUCT_CATEGORY']),
        _text_or_blank(row['IS_CHILD_CATEGORY_TO_compli']),
    ]
    return ' '.join(fields)

In [181]:
def refine_cat(product_cat):
    """Attach each category's grandparent category.

    ``product_cat`` is first normalized in place with ``clean_str``. It is
    then left-joined to its own ``PRODUCT_CATEGORY``/``IS_CHILD_CATEGORY_TO``
    columns, matching a row's parent against the other side's category. The
    parent's own parent therefore lands in ``IS_CHILD_CATEGORY_TO_compli``.

    Args:
        product_cat: Category DataFrame (mutated by the cleaning step).

    Returns:
        The merged DataFrame without the redundant
        ``PRODUCT_CATEGORY_compli`` column.
    """
    clean_str(product_cat, ['PRODUCT_CATEGORY', 'IS_CHILD_CATEGORY_TO'])

    parent_lookup = product_cat[['PRODUCT_CATEGORY', 'IS_CHILD_CATEGORY_TO']]
    with_grandparent = product_cat.merge(
        parent_lookup,
        left_on='IS_CHILD_CATEGORY_TO',
        right_on='PRODUCT_CATEGORY',
        how='left',
        suffixes=('', '_compli'),
    )

    return with_grandparent.drop("PRODUCT_CATEGORY_compli", axis=1)


def seperate_offers(product_cat, brand_cat, retailers):
    """Split offers by how many categories their brand maps to.

    All three input frames are normalized in place with ``clean_str`` first.

    Args:
        product_cat: Category DataFrame (only cleaned here).
        brand_cat: DataFrame with ``BRAND`` and ``BRAND_BELONGS_TO_CATEGORY``.
        retailers: Offer DataFrame with ``RETAILER`` and ``BRAND``.

    Returns:
        A list ``[combined_unique_brand, combined_dup_brand, not_coexist_offer]``:
        offers inner-joined to brands with fewer than two category rows,
        offers inner-joined to brands with two or more category rows, and an
        independent copy of the offers whose brand matched neither join.
    """
    clean_str(product_cat, ['PRODUCT_CATEGORY', 'IS_CHILD_CATEGORY_TO'])
    clean_str(brand_cat, ['BRAND','BRAND_BELONGS_TO_CATEGORY'])
    clean_str(retailers,['RETAILER','BRAND'])

    # Separate brands with a single category row from brands with several
    cat_check = brand_cat.groupby("BRAND")["BRAND_BELONGS_TO_CATEGORY"].count()
    single_cat_brands = cat_check[cat_check < 2].index
    multi_cat_brands = cat_check[cat_check >= 2].index
    unique_brand_cat = brand_cat[brand_cat['BRAND'].isin(single_cat_brands)]
    dup_brand_cat = brand_cat[brand_cat['BRAND'].isin(multi_cat_brands)]

    combined_unique_brand = retailers.merge(unique_brand_cat, how='inner', on='BRAND')
    combined_dup_brand = retailers.merge(dup_brand_cat, how='inner', on='BRAND')

    # Offers whose brand did not appear in either join. ``isin`` against a set
    # replaces a per-row linear scan; ``.copy()`` makes the result an
    # independent frame so that columns can be added to it later.
    matched_brands = set(combined_dup_brand['BRAND']) | set(combined_unique_brand['BRAND'])
    not_coexist_offer = retailers[~retailers['BRAND'].isin(matched_brands)].copy()

    return [combined_unique_brand, combined_dup_brand, not_coexist_offer]

def generate_stopword():
    """Build the stop-word set used when cleaning offer text.

    Starts from NLTK's English stop words, adds a few offer-specific words,
    and removes ``'any'`` so that word is kept during filtering.

    Returns:
        A set of lower-case stop words.
    """
    extra_words = {"buy", "spend", "select", 'varieties', 'sizes', 'ounce', 'count', 'liter'}
    sw = set(stopwords.words('english')) | extra_words
    sw.remove('any')
    return sw

def create_clean_offer(combined_unique_brand, combined_dup_brand, not_coexist_offer, sw):
    """Add an ``OFFER_clean`` column to each frame and pool the unambiguous offers.

    ``OFFER_clean`` is ``text_prep`` applied to ``OFFER``; it is written
    onto all three input frames in place.

    Args:
        combined_unique_brand: Offers whose brand has a single category row.
        combined_dup_brand: Offers whose brand has several category rows.
        not_coexist_offer: Offers whose brand matched no category row.
        sw: Stop words forwarded to ``text_prep``.

    Returns:
        A list ``[unique_brand_offer, combined_dup_brand]``. The first item
        stacks ``combined_unique_brand``, the rows of ``combined_dup_brand``
        whose cleaned offer contains "reward", "club" or "member", and
        ``not_coexist_offer``. The second is ``combined_dup_brand`` with its
        new column.
    """
    # Map over the OFFER column rather than applying row-wise over the frame:
    # a row-wise apply on an empty frame yields a DataFrame, which cannot be
    # assigned to a single column.
    for offers in (combined_dup_brand, combined_unique_brand, not_coexist_offer):
        offers['OFFER_clean'] = offers['OFFER'].map(lambda offer: text_prep(offer, sw))

    is_general = combined_dup_brand['OFFER_clean'].str.lower().str.contains("reward|club|member")
    general_offer = combined_dup_brand[is_general]
    unique_brand_offer = pd.concat([combined_unique_brand, general_offer, not_coexist_offer], axis=0)

    return [unique_brand_offer, combined_dup_brand]


def generate_training_str(unique_brand_offer, combined_dup_brand, product_cat_refined, sw):
    """Resolve multi-category offers and build the training strings.

    For the rows of ``combined_dup_brand`` whose cleaned offer does not
    contain "reward", "club" or "member", each row is scored with
    ``dup_cat_clean_bert``, and only the highest-scoring row per
    ``(OFFER, BRAND)`` group is kept. The result is stacked under
    ``unique_brand_offer``, left-joined to ``product_cat_refined`` on the
    category, and given a ``training_str`` column built by ``join_str``.

    Args:
        unique_brand_offer: Offers that are already unambiguous.
        combined_dup_brand: Offers joined to several candidate categories;
            must already carry ``OFFER_clean``.
        product_cat_refined: Category frame to join on ``PRODUCT_CATEGORY``.
        sw: Stop words forwarded to ``dup_cat_clean_bert``.

    Returns:
        The merged training DataFrame including ``training_str``.
    """
    is_general = combined_dup_brand['OFFER_clean'].str.lower().str.contains("reward|club|member")
    mislabeled_offer = combined_dup_brand[~is_general].copy()
    mislabeled_offer['bert_score'] = mislabeled_offer.apply(lambda row: dup_cat_clean_bert(row, sw), axis=1)

    def _best_row(group):
        # Row of the group with the highest similarity score
        return group.loc[group['bert_score'].idxmax()]

    final_dup = (
        mislabeled_offer
        .groupby(['OFFER', 'BRAND'])
        .apply(_best_row)
        .drop('bert_score', axis=1)
        .reset_index(drop=True)
    )

    stacked = pd.concat([unique_brand_offer, final_dup], axis=0)
    training = stacked.merge(
        product_cat_refined,
        how='left',
        left_on="BRAND_BELONGS_TO_CATEGORY",
        right_on='PRODUCT_CATEGORY',
    )
    training['training_str'] = training.apply(join_str, axis=1).to_list()

    return training

In [182]:

def fetch_data(path_to_cat = 'categories.csv', path_to_brand = 'brand_category.csv', path_to_offer = 'offer_retailer.csv'):
    """Run the whole preparation pipeline and pickle the result.

    Loads the three CSVs, refines the categories, splits and cleans the
    offers, builds the training strings, embeds each one with
    ``get_embedding`` into ``training_str_vector``, and writes the frame to
    ``data/processed_data.pkl``.

    Args:
        path_to_cat: Categories CSV file name.
        path_to_brand: Brand/category CSV file name.
        path_to_offer: Offer/retailer CSV file name.

    Returns:
        None.
    """
    product_cat, brand_cat, retailers = create_dataframe(path_to_cat, path_to_brand, path_to_offer)
    sw = generate_stopword()
    product_cat_refined = refine_cat(product_cat)

    unique_frame, dup_frame, unmatched = seperate_offers(product_cat, brand_cat, retailers)
    unique_brand_offer, dup_frame = create_clean_offer(unique_frame, dup_frame, unmatched, sw)

    training = generate_training_str(unique_brand_offer, dup_frame, product_cat_refined, sw)
    training['training_str_vector'] = training['training_str'].apply(get_embedding)
    training.to_pickle("data/processed_data.pkl")

In [183]:
# Run the full pipeline with the default CSV names; writes data/processed_data.pkl
fetch_data()