In [1]:
import pandas as pd
import re
from tempfile import TemporaryFile
import numpy as np

# Data cleaning: lowercase the text columns, strip special characters, and drop duplicate rows

In [2]:
def find_brand_multiplier(b):
    """Return each category's share of the receipts recorded for brand ``b``.

    Reads the notebook-level ``brand`` DataFrame, which must provide the
    columns ``BRAND``, ``BRAND_BELONGS_TO_CATEGORY`` and ``RECEIPTS``.

    Parameters
    ----------
    b
        Value compared with ``==`` against ``brand["BRAND"]``.

    Returns
    -------
    pandas.DataFrame
        One row per matching row of ``brand`` (original index labels kept),
        with the columns ``BRAND_BELONGS_TO_CATEGORY`` and ``MULTIPLIER``,
        where ``MULTIPLIER`` is the row's ``RECEIPTS`` divided by the sum of
        ``RECEIPTS`` over all matching rows. Empty when nothing matches.
    """
    rows = brand.loc[brand["BRAND"] == b, ["BRAND_BELONGS_TO_CATEGORY", "RECEIPTS"]]
    # Build a brand-new frame instead of adding a column to the filtered
    # slice: assigning into the slice triggers pandas' chained-assignment
    # warning, and so would a caller adding columns to a subset of it.
    return pd.DataFrame(
        {
            "BRAND_BELONGS_TO_CATEGORY": rows["BRAND_BELONGS_TO_CATEGORY"],
            "MULTIPLIER": rows["RECEIPTS"] / rows["RECEIPTS"].sum(),
        }
    )

In [3]:
# Matches every character that is neither a word character nor whitespace.
# Later cells reuse `pattern` for the categories and offers tables.
pattern = r'[^\w\s]'
# A compiled pattern is handed to `.str.replace` so matching keeps the
# semantics of Python's `re` module, as with the previous re.sub calls.
special_chars = re.compile(pattern)

brand = pd.read_csv("Data/brand_category.csv")

# Lowercase and strip special characters. The `.str` accessors pass missing
# values through, whereas calling re.sub on a NaN raises TypeError.
brand["BRAND_BELONGS_TO_CATEGORY"] = (
    brand["BRAND_BELONGS_TO_CATEGORY"]
    .str.lower()
    .str.replace(special_chars, '', regex=True)
)
# Brand names are additionally cast to str before the special characters
# are stripped, as before.
brand["BRAND"] = (
    brand["BRAND"]
    .str.lower()
    .astype(str)
    .str.replace(special_chars, '', regex=True)
)

# MULTIPLIER = a row's receipts as a fraction of its brand's total receipts.
# The previous loop, `for b in sorted(brand)`, iterated over the frame's
# column labels rather than over brand names, so no row ever matched and the
# merged MULTIPLIER column came out entirely NaN. A grouped transform computes
# the same ratio as find_brand_multiplier for every brand in one pass.
brand_totals = brand.groupby("BRAND")["RECEIPTS"].transform("sum")
brand["MULTIPLIER"] = brand["RECEIPTS"] / brand_totals

# Keep the column layout the earlier right-merge produced (helper columns
# first, remaining input columns after) so the CSV layout does not change.
leading_columns = ["BRAND_BELONGS_TO_CATEGORY", "MULTIPLIER", "BRAND"]
brand = brand.reindex(
    columns=leading_columns + [c for c in brand.columns if c not in leading_columns]
)

brand.to_csv("Data/brand_category_clean.csv", index=False)

In [4]:
cat = pd.read_csv("Data/categories.csv")

# `pattern` is the special-character regex defined in the brand-cleaning cell.
# Compiling it keeps the matching semantics of Python's `re` module.
special_chars = re.compile(pattern)

# Lowercase and strip special characters once per column. The earlier version
# ran the same substitution twice and raised TypeError on a missing value;
# the `.str` accessors leave missing values untouched instead.
for column in ("IS_CHILD_CATEGORY_TO", "PRODUCT_CATEGORY"):
    cat[column] = (
        cat[column]
        .str.lower()
        .str.replace(special_chars, '', regex=True)
    )

cat.to_csv("Data/categories_clean.csv", index=False)

In [5]:
offer = pd.read_csv("Data/offer_retailer.csv")

# `pattern` is the special-character regex defined in the brand-cleaning cell.
# Compiling it keeps the matching semantics of Python's `re` module.
special_chars = re.compile(pattern)

# Normalise the three text columns the same way. Missing values become ''
# up front, instead of being stringified to 'nan' and patched afterwards with
# a frame-wide replace('nan', '') that could also blank a genuine 'nan' cell.
# This also stops OFFER and BRAND from raising on a missing value.
for column in ("OFFER", "RETAILER", "BRAND"):
    offer[column] = (
        offer[column]
        .fillna('')
        .astype(str)
        .str.lower()
        .str.replace(special_chars, '', regex=True)
    )

# Duplicates are dropped after normalisation so rows differing only in case
# or punctuation collapse into one.
offer = offer.drop_duplicates()
offer.to_csv("Data/offer_retailer_clean.csv", index=False)

# Encoding text columns so that I can use semantic similarity later on

In [6]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; the embedding cells below call `model.encode`.
model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')

# Encode the distinct brand names, taken in sorted order, and save the result.
unique_brand_names = sorted(set(brand["BRAND"].values))
brand_vectors = model.encode(unique_brand_names)
np.save(file='Data/brand_vectors.npy', arr=brand_vectors)

In [7]:
# Encode every offer text in row order and save the result.
offer_texts = offer['OFFER'].tolist()
offer_vectors = model.encode(offer_texts)
np.save(file='Data/offer_vectors.npy', arr=offer_vectors)

In [8]:
# Encode every product category in row order and save the result.
category_names = cat['PRODUCT_CATEGORY'].tolist()
category_vectors = model.encode(category_names)
np.save(file='Data/category_vectors.npy', arr=category_vectors)

In [9]:
# Encode the retailer of every offer row (one entry per row) and save the result.
retailer_names = offer['RETAILER'].tolist()
retailer_vectors = model.encode(retailer_names)
np.save(file='Data/retailer_vectors.npy', arr=retailer_vectors)

In [10]:
# Encode the brand of every offer row (one entry per row) and save the result.
offer_brand_names = offer['BRAND'].tolist()
offer_brand_vectors = model.encode(offer_brand_names)
np.save(file='Data/offer_brand_vectors.npy', arr=offer_brand_vectors)