### 1. GOAL

The goal of this task is to find similar keywords, for instance: 'matresses sale' and 'matresses sales'

### 2. DATA

The data is from excel file organic keywords. We only use first which contains the keywords


### 3. TECHNIQUE SUMMARY

    3.1 Load data
    3.2 Tokenize and clean data
    3.3 Find keywords similarity

In [297]:
%matplotlib inline

import nltk.data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import string
import textdistance

import pandas as pd
import numpy as np

import time

In [298]:
# get keywords column from macys_organic_keywords excel file
# this file already modified from the original csv file to make easy to get the keywords
df = pd.read_excel('macys_organic_keywords.xlsx', sheetname='macys_organic_keywords')

In [299]:
df["Keyword"]

0                                    mattress sale
1                                   mattress sales
2                                   mattress deals
3                           mattress sales near me
4                              macys mattress sale
5                              mattresses for sale
6                             macy's mattress sale
7                                mattress for sale
8                            mattress sale near me
9                                   macys mattress
10                                    matress sale
11                              mattresses on sale
12                     mattresses for sale near me
13                                macys mattresses
14                              macy mattress sale
15                                   matress sales
16                       mattress for sale near me
17                                mattress on sale
18                              sale on mattresses
19                     mattress

### 3.2 Tokenize and Clean Data

In [300]:
# tokenize keywords
df["tokenized_keywords"] = df["Keyword"].apply(nltk.word_tokenize)

In [301]:
# convert dataframe to list of list
tokenized_keywords = df["tokenized_keywords"].values.tolist()

In [302]:
# Clean keywords
# Convert to lowercase
# Remove punctuation from each word
# Filter out remaining tokens that are not alphabetic
# Filter out tokens that are stop words
# lemmatization
def clean_text(list_tokens):
    #ps = PorterStemmer()
    word_lemma = WordNetLemmatizer()
    cleaned_tokens = list()
    for tokens in list_tokens:
        # convert to lower case
        tokens = [w.lower() for w in tokens]
        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        # filter out stop words
        #nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        # stemming
        # words = [ps.stem (w) for w in words]
        # lemmatize
        # nltk.download('wordnet')
        words = [word_lemma.lemmatize (w) for w in words]
        cleaned_tokens.append(words)
    return cleaned_tokens

In [303]:
# apply clean_text method to the tokenized_keywords
cleaned_keywords = clean_text(tokenized_keywords)

In [304]:
cleaned_keywords

[['mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'deal'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['macy', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['matress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['macy', 'mattress', 'sale'],
 ['matress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['mattress', 'sale'],
 ['sale', 'mattress'],
 ['mattress', 'sale', 'weekend'],
 ['sale', 'mattress'],
 ['sale', 'mattress'],
 ['macy', 'mattress'],
 ['best', 'mattress', 'deal'],
 ['macys', 'mattress', 'sale', 'date'],
 ['mattress', 'sale', 'near'],
 ['macys', 'matress', 'sale'],
 ['mattress', 'deal', 'near'],
 ['mattress', 'sale', 'macy'],
 ['macys', 'mattress', 'sale'],
 ['macy', 'mattress'],
 ['mattress', 'sale', 'today'],
 ['mattress', 'sale', 'macys'],
 ['bed', 'mattress', 'deal'],
 ['macy', 'mattress'],
 ['macy', 'mattress', 'sale', 'coupo

In [305]:
# convert token to sentence
sent_keywords = list()
def token_to_sent(tokenized_keywords):
    for token_keywords in tokenized_keywords:
        sent_keywords.append(' '.join(token_keywords))
    return sent_keywords

In [306]:
sentence_keywords = token_to_sent(cleaned_keywords)

In [307]:
sentence_keywords

['mattress sale',
 'mattress sale',
 'mattress deal',
 'mattress sale near',
 'macys mattress sale',
 'mattress sale',
 'macy mattress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'matress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'macy mattress sale',
 'matress sale',
 'mattress sale near',
 'mattress sale',
 'sale mattress',
 'mattress sale weekend',
 'sale mattress',
 'sale mattress',
 'macy mattress',
 'best mattress deal',
 'macys mattress sale date',
 'mattress sale near',
 'macys matress sale',
 'mattress deal near',
 'mattress sale macy',
 'macys mattress sale',
 'macy mattress',
 'mattress sale today',
 'mattress sale macys',
 'bed mattress deal',
 'macy mattress',
 'macy mattress sale coupon',
 'macy mattress sale',
 'bed mattress sale',
 'bed mattress sale',
 'macy matress sale',
 'mattress sale today',
 'macy labor day mattress sale',
 'best mattress sale today',
 'mattress deal',
 'mattress macy',
 'matress sale',
 'buy matt

In [308]:
# distinct keywords
distinct_keywords = list(set(sentence_keywords))

In [309]:
distinct_keywords

['costco mattress full size',
 'firm mattress discount',
 'new mattress set',
 'mattress outlet online',
 'best time year mattress sale',
 'best mattress deal weekend',
 'mattress firm',
 'mattress king denver',
 'mattress firm cyber monday',
 'mattress warehouse louisville ky',
 'futon columbus ohio',
 'mattress one heath ohio',
 'buy discount mattress',
 'macy mattress department',
 'futon mattress tulsa',
 'mattress sale around',
 'cheap bed mattress sale',
 'cheap name brand mattress',
 'mattress firm jacksonville',
 'affordable mattress nyc',
 'memory foam talk',
 'mattress sale',
 'low cost mattress set',
 'low cost mattress online',
 'sealy futon',
 'king mattress sale near',
 'show mattress',
 'best semi firm mattress',
 'best matress deal',
 'macy foam mattress',
 'mattress dollar',
 'box spring san diego',
 'macys sa',
 'bed mattress cheap',
 'huge bed sale',
 'cheap bed mattress included',
 'memorial day sale mattress',
 'cheapest mattress near',
 'mattress firm semi annual 

In [310]:
len(distinct_keywords)

1137

### 3.3 Find Keywords Similarity
We use Levenshtein distance. This distance is computed by finding the number of edits which will transform one string to another. The transformations allowed are insertion — adding a new character, deletion — deleting a character and substitution — replace one character by another. By performing these three operations, the algorithm tries to modify first string to match the second one.

In [311]:
# find keywords similarity
# find similar keyword by applying levensthein algorithm with simillarity threshold value between 0.85 and 1.
# the output is in list of tuple of the keywords and its similar words

def keywords_matching(the_keywords):
    start = time.time()
    
    similar_keywords = list()
    copy_distinct_keywords = the_keywords
    for keywords in distinct_keywords:
        for idx in range (len(copy_distinct_keywords)):
            if 0.90 <= textdistance.levenshtein.normalized_similarity(keywords, distinct_keywords[idx]) < 1.0:
                similar_keywords.append((keywords, distinct_keywords[idx]))
          
    print('Total time: ' + str((time.time() - start)) + ' secs')
    return similar_keywords

In [312]:
keywords_matching(distinct_keywords)

Total time: 3642.7749683856964 secs


[('firm mattress discount', 'foam mattress discount'),
 ('mattress warehouse louisville ky', 'mattress warehouse louisville'),
 ('affordable mattress nyc', 'affordable mattress inc'),
 ('mattress sale', 'mattresssale'),
 ('mattress sale', 'mattres sale'),
 ('mattress sale', 'matress sale'),
 ('mattress sale', 'matteress sale'),
 ('best matress deal', 'best mattress deal'),
 ('mattress firm columbus oh', 'mattress firm columbus ohio'),
 ('online mattress store', 'online matress store'),
 ('macy full size mattress', 'macys full size mattress'),
 ('mattresssale', 'mattress sale'),
 ('mattresssale', 'mattres sale'),
 ('mattress sale macys', 'mattress sale macy'),
 ('mattress store near', 'matress store near'),
 ('macy mattress sale date', 'macys mattress sale date'),
 ('best matress price', 'best mattress price'),
 ('best mattress sale year', 'best mattress sale near'),
 ('macy mattress', 'macy matress'),
 ('macy mattress', 'macys mattress'),
 ('futon mattress stlouis', 'futon mattress st 

In [313]:
similar_keywords = [('firm mattress discount', 'foam mattress discount'),
 ('mattress warehouse louisville ky', 'mattress warehouse louisville'),
 ('affordable mattress nyc', 'affordable mattress inc'),
 ('mattress sale', 'mattresssale'),
 ('mattress sale', 'mattres sale'),
 ('mattress sale', 'matress sale'),
 ('mattress sale', 'matteress sale'),
 ('best matress deal', 'best mattress deal'),
 ('mattress firm columbus oh', 'mattress firm columbus ohio'),
 ('online mattress store', 'online matress store'),
 ('macy full size mattress', 'macys full size mattress'),
 ('mattresssale', 'mattress sale'),
 ('mattresssale', 'mattres sale'),
 ('mattress sale macys', 'mattress sale macy'),
 ('mattress store near', 'matress store near'),
 ('macy mattress sale date', 'macys mattress sale date'),
 ('best matress price', 'best mattress price'),
 ('best mattress sale year', 'best mattress sale near'),
 ('macy mattress', 'macy matress'),
 ('macy mattress', 'macys mattress'),
 ('futon mattress stlouis', 'futon mattress st louis'),
 ('macy mattress sale', 'macy matress sale'),
 ('macy mattress sale', 'macys mattress sale'),
 ('best low priced mattress', 'best low price mattress'),
 ('u mattresscom coupon', 'u mattress com coupon'),
 ('mattress discount', 'matress discount'),
 ('discount mattress store', 'discount matress store'),
 ('discount mattress near', 'discounted mattress near'),
 ('discounted mattress', 'discount bed mattress'),
 ('mattres sale', 'mattress sale'),
 ('mattres sale', 'mattresssale'),
 ('affordable mattress inc', 'affordable mattress nyc'),
 ('biggest mattress sale', 'biggest mattress size'),
 ('bed mattress sale near', 'best mattress sale near'),
 ('bed mattress sale near', 'used mattress sale near'),
 ('mattress warehouse houston tx', 'mattress warehouse houston texas'),
 ('macy matress', 'macy mattress'),
 ('macy matress', 'macys matress'),
 ('best price mattress', 'best price matress'),
 ('best price mattress', 'best priced mattress'),
 ('mattress firm denver', 'mattress firm deliver'),
 ('best price matresses', 'best price matress'),
 ('matress sale', 'mattress sale'),
 ('new mattress sale cheap', 'bed mattress sale cheap'),
 ('macys mattress queen', 'macy mattress queen'),
 ('matress deal', 'mattress deal'),
 ('best king mattress', 'best king matress'),
 ('biggest mattress size', 'biggest mattress sale'),
 ('biggest mattress size', 'biggest matress size'),
 ('firm mattress online', 'foam mattress online'),
 ('macys mattress set', 'macy mattress set'),
 ('queen mattress set free delivery', 'queen mattress sale free delivery'),
 ('macy matress sale', 'macy mattress sale'),
 ('macy matress sale', 'macys matress sale'),
 ('macy mattress outlet', 'macys mattress outlet'),
 ('foam mattress discount', 'firm mattress discount'),
 ('bed mattress sale', 'bed matress sale'),
 ('macys matress', 'macy matress'),
 ('macys matress', 'macys mattress'),
 ('u mattress com coupon', 'u mattresscom coupon'),
 ('u mattress com coupon', 'mattress com coupon'),
 ('best mattress sale near', 'best mattress sale year'),
 ('best mattress sale near', 'bed mattress sale near'),
 ('mismatched mattress sale', 'mismatch mattress sale'),
 ('mattress firm outlet', 'matress firm outlet'),
 ('macys mattress sale date', 'macy mattress sale date'),
 ('futon mattress st louis', 'futon mattress stlouis'),
 ('macys mattress sale', 'macy mattress sale'),
 ('macys mattress sale', 'macys matress sale'),
 ('mattress firm memorial day sale', 'mattress king memorial day sale'),
 ('mattress sale near', 'mattress sale nearby'),
 ('mattress sale near', 'matress sale near'),
 ('mismatch mattress sale', 'mismatched mattress sale'),
 ('sell mattress online', 'full mattress online'),
 ('mattress firm near', 'matress firm near'),
 ('macys matress sale', 'macy matress sale'),
 ('macys matress sale', 'macys mattress sale'),
 ('matress firm near', 'mattress firm near'),
 ('best time buy mattress', 'best time buy matress'),
 ('mattress sale nearby', 'mattress sale near'),
 ('macys mattress store', 'macy mattress store'),
 ('best mattress deal memorial day', 'best mattress sale memorial day'),
 ('macy mattress queen', 'macys mattress queen'),
 ('macy mattress set', 'macys mattress set'),
 ('w mattress sale', 'u mattress sale'),
 ('cheapest best mattress', 'cheapest bed mattress'),
 ('mattress memorial day sale', 'u mattress memorial day sale'),
 ('best price matress', 'best price mattress'),
 ('best price matress', 'best price matresses'),
 ('best price matress', 'best priced mattress'),
 ('bed matress sale', 'bed mattress sale'),
 ('matress firm outlet', 'mattress firm outlet'),
 ('used mattress sale near', 'bed mattress sale near'),
 ('bed mattress sale cheap', 'new mattress sale cheap'),
 ('u mattress memorial day sale', 'mattress memorial day sale'),
 ('discount matress store', 'discount mattress store'),
 ('macys full size mattress', 'macy full size mattress'),
 ('macys mattress', 'macy mattress'),
 ('macys mattress', 'macys matress'),
 ('mattress warehouse houston texas', 'mattress warehouse houston tx'),
 ('best king matress', 'best king mattress'),
 ('best matress sale', 'best mattress sale'),
 ('macys mattress outlet', 'macy mattress outlet'),
 ('cheapest bed mattress', 'cheapest best mattress'),
 ('mattress firm columbus ohio', 'mattress firm columbus oh'),
 ('matress sale near', 'mattress sale near'),
 ('best mattress sale', 'best matress sale'),
 ('cheap bed mattress set', 'cheap new mattress set'),
 ('macy bed sale', 'macys bed sale'),
 ('cheap bed mattress deal', 'cheap bed matress deal'),
 ('memorial day matress sale', 'memorial day mattress sale'),
 ('best low price mattress', 'best low priced mattress'),
 ('mattress macy', 'mattress macys'),
 ('online matress store', 'online mattress store'),
 ('macy mattress store', 'macys mattress store'),
 ('mattress sale macy', 'mattress sale macys'),
 ('best mattress sale memorial day', 'best mattress deal memorial day'),
 ('memorial day mattress sale', 'memorial day matress sale'),
 ('best mattress deal', 'best matress deal'),
 ('queen mattress sale free delivery', 'queen mattress set free delivery'),
 ('discount bed mattress', 'discounted mattress'),
 ('cheap new mattress set', 'cheap bed mattress set'),
 ('bed mattress set', 'bed matress set'),
 ('best time buy matress', 'best time buy mattress'),
 ('foam mattress online', 'firm mattress online'),
 ('bed matress set', 'bed mattress set'),
 ('macys bed sale', 'macy bed sale'),
 ('mattress macys', 'mattress macy'),
 ('full mattress online', 'sell mattress online'),
 ('best priced mattress', 'best price mattress'),
 ('best priced mattress', 'best price matress'),
 ('mattress deal', 'matress deal'),
 ('mattress com coupon', 'u mattress com coupon'),
 ('discounted mattress near', 'discount mattress near'),
 ('cheap bed matress deal', 'cheap bed mattress deal'),
 ('mattress king memorial day sale', 'mattress firm memorial day sale'),
 ('mattress king memorial day sale', 'mattress one memorial day sale'),
 ('mattress one memorial day sale', 'mattress king memorial day sale'),
 ('mattress warehouse louisville', 'mattress warehouse louisville ky'),
 ('u mattress sale', 'w mattress sale'),
 ('matress discount', 'mattress discount'),
 ('matress store near', 'mattress store near'),
 ('biggest matress size', 'biggest mattress size'),
 ('discount mattress baltimore md', 'discount mattress baltimore'),
 ('discount mattress baltimore', 'discount mattress baltimore md'),
 ('best mattress price', 'best matress price'),
 ('mattress firm deliver', 'mattress firm denver'),
 ('matteress sale', 'mattress sale')]

In [314]:
df_similar_keyword = pd.DataFrame(similar_keywords, columns=['keyword', 'similar keyword'])

In [315]:
df_similar_keyword

Unnamed: 0,keyword,similar keyword
0,firm mattress discount,foam mattress discount
1,mattress warehouse louisville ky,mattress warehouse louisville
2,affordable mattress nyc,affordable mattress inc
3,mattress sale,mattresssale
4,mattress sale,mattres sale
5,mattress sale,matress sale
6,mattress sale,matteress sale
7,best matress deal,best mattress deal
8,mattress firm columbus oh,mattress firm columbus ohio
9,online mattress store,online matress store
