In [32]:
import numpy as np 
import pandas as pd 
import spacy
import string
import gensim
import operator
import re


In [33]:
# Load data from a CSV file
df = pd.read_csv('FINAL.csv')
df.head(5)


Unnamed: 0,OFFER,RETAILER,BRAND,BRAND_BELONGS_TO_CATEGORY,IS_CHILD_CATEGORY_TO
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB,,
1,"Beyond Meat Plant-Based products, spend $25",,BEYOND MEAT,,
2,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR,,
3,"Butterball, select varieties, spend $10 at Dil...",DILLONS FOOD STORE,BUTTERBALL,,
4,"GATORADE Fast Twitch, 12-ounce 12 pack, at Ama...",AMAZON,GATORADE,Medicines & Treatments,Health & Wellness


In [34]:
df.shape

(414, 5)

In [35]:
df['OFFER'] = df.apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)

# Display the DataFrame with the new 'OFFER' column
print(df['OFFER'])

0      Spend $50 on a Full-Priced new Club Membership...
1      Beyond Meat Plant-Based products, spend $25 BE...
2      Good Humor Viennetta Frozen Vanilla Cake GOOD ...
3      Butterball, select varieties, spend $10 at Dil...
4      GATORADE Fast Twitch, 12-ounce 12 pack, at Ama...
                             ...                        
409                             Spend $10 at KFC KFC KFC
410               Sargento Product SARGENTO Cheese Dairy
411              Thomas' Bagel Thins THOMAS Bread Pantry
412          Spend $270 at Pavilions PAVILIONS PAVILIONS
413    Back to the Roots Soils, select varieties, at ...
Name: OFFER, Length: 414, dtype: object


In [36]:
output_file = 'offfer_retailer_nonascii.csv'
df.to_csv(output_file, index=False)

In [5]:
df = df.drop_duplicates()
print(df)

                                                 OFFER            RETAILER  \
0    Spend $50 on a Full-Priced new Club Membership...           SAMS CLUB   
1    Beyond Meat Plant-Based products, spend $25 BE...                 NaN   
2    Good Humor Viennetta Frozen Vanilla Cake GOOD ...                 NaN   
3    Butterball, select varieties, spend $10 at Dil...  DILLONS FOOD STORE   
4    GATORADE Fast Twitch, 12-ounce 12 pack, at Ama...              AMAZON   
..                                                 ...                 ...   
409                           Spend $10 at KFC KFC KFC                 KFC   
410             Sargento Product SARGENTO Cheese Dairy                 NaN   
411            Thomas' Bagel Thins THOMAS Bread Pantry                 NaN   
412        Spend $270 at Pavilions PAVILIONS PAVILIONS           PAVILIONS   
413  Back to the Roots Soils, select varieties, at ...             WALMART   

                 BRAND BRAND_BELONGS_TO_CATEGORY IS_CHILD_CATEG

In [6]:
from spacy.lang.en.stop_words import STOP_WORDS

spacy_nlp = spacy.load('en_core_web_sm')

#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.





def spacy_tokenizer(sentence):
 
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    
    
    #return tokens
    return tokens

In [7]:


print ('Cleaning and Tokenizing...')
df['OFFER_tokenized'] = df['OFFER'].map(lambda x: spacy_tokenizer(x))

df.head()

Cleaning and Tokenizing...


Unnamed: 0,OFFER,RETAILER,BRAND,BRAND_BELONGS_TO_CATEGORY,IS_CHILD_CATEGORY_TO,OFFER_tokenized
0,Spend $50 on a Full-Priced new Club Membership...,SAMS CLUB,SAMS CLUB,,,"[spend, , on, a, full, price, new, club, membe..."
1,"Beyond Meat Plant-Based products, spend $25 BE...",,BEYOND MEAT,,,"[beyond, meat, plant, based, product, , spend,..."
2,Good Humor Viennetta Frozen Vanilla Cake GOOD ...,,GOOD HUMOR,,,"[good, humor, viennetta, frozen, vanilla, cake..."
3,"Butterball, select varieties, spend $10 at Dil...",DILLONS FOOD STORE,BUTTERBALL,,,"[butterball, , select, variety, , spend, , at,..."
4,"GATORADE Fast Twitch, 12-ounce 12 pack, at Ama...",AMAZON,GATORADE,Medicines & Treatments,Health & Wellness,"[gatorade, fast, twitch, , ounce, pack, , at, ..."


In [8]:
OFFER_df= df['OFFER_tokenized']
OFFER_df.head()

0    [spend, , on, a, full, price, new, club, membe...
1    [beyond, meat, plant, based, product, , spend,...
2    [good, humor, viennetta, frozen, vanilla, cake...
3    [butterball, , select, variety, , spend, , at,...
4    [gatorade, fast, twitch, , ounce, pack, , at, ...
Name: OFFER_tokenized, dtype: object

In [9]:
from gensim import corpora

#creating term dictionary
%time dictionary = corpora.Dictionary(OFFER_df)

#filter out terms which occurs in less than 4 documents and more than 20% of the documents.
#NOTE: Since we have smaller dataset, we will keep this commented for now.

#dictionary.filter_extremes(no_below=4, no_above=0.2)

#list of few which which can be further removed
stoplist = set('hello and if this can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)


CPU times: user 1.73 ms, sys: 8 µs, total: 1.74 ms
Wall time: 1.74 ms


In [10]:
#print top 50 items from the dictionary with their unique token-id
dict_tokens = [[[dictionary[key], dictionary.token2id[dictionary[key]]] for key, value in dictionary.items() if key <= 50]]
print (dict_tokens)

[[['', 0], ['club', 1], ['full', 2], ['membership', 3], ['new', 4], ['on', 5], ['price', 6], ['sams', 7], ['spend', 8], ['based', 9], ['beyond', 10], ['meat', 11], ['plant', 12], ['product', 13], ['cake', 14], ['frozen', 15], ['good', 16], ['humor', 17], ['vanilla', 18], ['viennetta', 19], ['at', 20], ['butterball', 21], ['dillon', 22], ['dillons', 23], ['food', 24], ['select', 25], ['store', 26], ['variety', 27], ['amazon', 28], ['fast', 29], ['gatorade', 30], ['health', 31], ['medicines', 32], ['ounce', 33], ['pack', 34], ['storefront', 35], ['treatments', 36], ['twitch', 37], ['wellness', 38], ['drinks', 39], ['enhanced', 40], ['sports', 41], ['water', 42], ['brownie', 43], ['cookies', 44], ['emmys', 45], ['or', 46], ['organics', 47], ['pop', 48], ['snaps', 49], ['up', 50]]]


In [11]:
corpus = [dictionary.doc2bow(desc) for desc in OFFER_df]

word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in corpus[0:3]]

print(word_frequencies)

[[('', 1), ('club', 3), ('full', 1), ('membership', 1), ('new', 1), ('on', 1), ('price', 1), ('sams', 2), ('spend', 1)], [('', 2), ('spend', 1), ('based', 1), ('beyond', 2), ('meat', 2), ('plant', 1), ('product', 1)], [('cake', 1), ('frozen', 1), ('good', 2), ('humor', 2), ('vanilla', 1), ('viennetta', 1)]]


In [12]:
OFFER_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
OFFER_lsi_model = gensim.models.LsiModel(OFFER_tfidf_model[corpus], id2word=dictionary, num_topics=300)

In [13]:
gensim.corpora.MmCorpus.serialize('OFFER_tfidf_model_mm', OFFER_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('OFFER_lsi_model_mm',OFFER_lsi_model[OFFER_tfidf_model[corpus]])

In [14]:
#Load the indexed corpus
OFFER_tfidf_corpus = gensim.corpora.MmCorpus('OFFER_tfidf_model_mm')
OFFER_lsi_corpus = gensim.corpora.MmCorpus('OFFER_lsi_model_mm')

print(OFFER_tfidf_corpus)
print(OFFER_lsi_corpus)

MmCorpus(413 documents, 672 features, 3409 non-zero entries)
MmCorpus(413 documents, 300 features, 119129 non-zero entries)


In [15]:
from gensim.similarities import MatrixSimilarity

OFFER_index = MatrixSimilarity(OFFER_lsi_corpus, num_features = OFFER_lsi_corpus.num_terms)

In [47]:
from operator import itemgetter

def search_similar_OFFER(search_term):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = OFFER_tfidf_model[query_bow]
    query_lsi = OFFER_lsi_model[query_tfidf]

    OFFER_index.num_best = 10

    OFFER_list = OFFER_index[query_lsi]

    OFFER_list.sort(key=itemgetter(1), reverse=True)
    OFFER_names = []

    for j, OFFER in enumerate(OFFER_list):

        OFFER_names.append (
            {
                'Relevance': round((OFFER[1] * 100),2),
                'OFFER': df['OFFER'][OFFER[0]],
                'Retailer': df['RETAILER'][OFFER[0]]
                
            }

        )
        if j == (OFFER_index.num_best-1):
            break

    return pd.DataFrame(OFFER_names, columns=['Relevance','OFFER','Retailer'])


In [56]:
search_similar_OFFER('chocolates')

Unnamed: 0,Relevance,OFFER,Retailer
0,76.01,"DOVE chocolate, select varieties DOVE CHOCOLATE",
1,72.4,"DOVE Chocolate, select sizes, buy 1 DOVE CHOCO...",
2,34.09,"M&M'S chocolate candies, select varieties M&MS...",
3,32.41,"SNICKERS chocolate candy bar, select varieties...",
4,30.73,"Reese's Chocolate Cones, 8 count at GIANT OR M...",MARTINS FOODS
5,28.58,"Reese's Chocolate Cones, 8 count at GIANT OR M...",GIANT FOOD
6,0.02,"ORBIT, select varieties ORBIT Gum Candy",
7,0.01,"GATORADE Fast Twitch, 12-ounce single serve, b...",FOOD4LESS
8,-0.01,"GATORADE Fast Twitch, 12-ounce single serve, b...",FRED MEYER
9,-0.01,"GATORADE Fast Twitch, 12-ounce single serve GA...",
