### 1. GOAL

The goal of this task is to find similar keywords, for instance: 'matresses sale' and 'matresses sales'

### 2. DATA

The data is from excel file organic keywords. We only use first which contains the keywords


### 3. TECHNIQUE SUMMARY

    3.1 Load data
    3.2 Tokenize and clean data
    3.3 Find keywords similarity

In [39]:
%matplotlib inline

import nltk.data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import string
import textdistance

import pandas as pd
import numpy as np

import time

In [40]:
# get keywords column from macys_organic_keywords excel file
# this file already modified from the original csv file to make easy to get the keywords
df = pd.read_excel('macys_organic_keywords.xlsx', sheetname='macys_organic_keywords')

In [41]:
df["Keyword"]

0                                    mattress sale
1                                   mattress sales
2                                   mattress deals
3                           mattress sales near me
4                              macys mattress sale
5                              mattresses for sale
6                             macy's mattress sale
7                                mattress for sale
8                            mattress sale near me
9                                   macys mattress
10                                    matress sale
11                              mattresses on sale
12                     mattresses for sale near me
13                                macys mattresses
14                              macy mattress sale
15                                   matress sales
16                       mattress for sale near me
17                                mattress on sale
18                              sale on mattresses
19                     mattress

### 3.2 Tokenize and Clean Data

In [42]:
# tokenize keywords
df["tokenized_keywords"] = df["Keyword"].apply(nltk.word_tokenize)

In [43]:
# convert dataframe to list of list
tokenized_keywords = df["tokenized_keywords"].values.tolist()

In [44]:
# Clean keywords
# Convert to lowercase
# Remove punctuation from each word
# Filter out remaining tokens that are not alphabetic
# Filter out tokens that are stop words
# lemmatization
def clean_text(list_tokens):
    #ps = PorterStemmer()
    word_lemma = WordNetLemmatizer()
    cleaned_tokens = list()
    for tokens in list_tokens:
        # convert to lower case
        tokens = [w.lower() for w in tokens]
        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        # filter out stop words
        #nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        # stemming
        # words = [ps.stem (w) for w in words]
        # lemmatize
        # nltk.download('wordnet')
        words = [word_lemma.lemmatize (w) for w in words]
        cleaned_tokens.append(words)
    return cleaned_tokens

In [45]:
# apply clean_text method to the tokenized_keywords
cleaned_keywords = clean_text(tokenized_keywords)

In [46]:
cleaned_keywords

[['mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'deal'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['macy', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['matress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['macy', 'mattress', 'sale'],
 ['matress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['mattress', 'sale'],
 ['sale', 'mattress'],
 ['mattress', 'sale', 'weekend'],
 ['sale', 'mattress'],
 ['sale', 'mattress'],
 ['macy', 'mattress'],
 ['best', 'mattress', 'deal'],
 ['macys', 'mattress', 'sale', 'date'],
 ['mattress', 'sale', 'near'],
 ['macys', 'matress', 'sale'],
 ['mattress', 'deal', 'near'],
 ['mattress', 'sale', 'macy'],
 ['macys', 'mattress', 'sale'],
 ['macy', 'mattress'],
 ['mattress', 'sale', 'today'],
 ['mattress', 'sale', 'macys'],
 ['bed', 'mattress', 'deal'],
 ['macy', 'mattress'],
 ['macy', 'mattress', 'sale', 'coupo

In [47]:
# convert token to sentence
sent_keywords = list()
def token_to_sent(tokenized_keywords):
    for token_keywords in tokenized_keywords:
        sent_keywords.append(' '.join(token_keywords))
    return sent_keywords

In [48]:
sentence_keywords = token_to_sent(cleaned_keywords)

In [49]:
sentence_keywords

['mattress sale',
 'mattress sale',
 'mattress deal',
 'mattress sale near',
 'macys mattress sale',
 'mattress sale',
 'macy mattress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'matress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'macy mattress sale',
 'matress sale',
 'mattress sale near',
 'mattress sale',
 'sale mattress',
 'mattress sale weekend',
 'sale mattress',
 'sale mattress',
 'macy mattress',
 'best mattress deal',
 'macys mattress sale date',
 'mattress sale near',
 'macys matress sale',
 'mattress deal near',
 'mattress sale macy',
 'macys mattress sale',
 'macy mattress',
 'mattress sale today',
 'mattress sale macys',
 'bed mattress deal',
 'macy mattress',
 'macy mattress sale coupon',
 'macy mattress sale',
 'bed mattress sale',
 'bed mattress sale',
 'macy matress sale',
 'mattress sale today',
 'macy labor day mattress sale',
 'best mattress sale today',
 'mattress deal',
 'mattress macy',
 'matress sale',
 'buy matt

In [50]:
# distinct keywords
distinct_keywords = list(set(sentence_keywords))

In [51]:
distinct_keywords

['queen mattress set free delivery',
 'mattress sale mn',
 'pick mattress',
 'mattress sale today',
 'best matress sale',
 'firm mattress near',
 'mattress firm free mattress',
 'twin bargain',
 'buy mattress online free shipping',
 'mattress firm denver co',
 'mattress macy',
 'best price matresses',
 'mattress one veteran day sale',
 'king size mattress delivered',
 'denver mattress president day sale',
 'simmons outlet near',
 'discount bed store near',
 'discount mattress green bay',
 'mattress store cleveland',
 'talk today deal',
 'best price quality mattress',
 'matress store near',
 'biggest matress size',
 'mattress firm adjustable base',
 'cheap mattress sale',
 'discount mattress dallas tx',
 'memory foam talk',
 'queen mattress set sale free shipping',
 'mattress firm outlet',
 'bed sale near',
 'discount mattress store jacksonville fl',
 'serta mattress factory outlet',
 'firm mattress deal',
 'best price mattress box spring',
 'cheap queen mattress set near',
 'place sell

In [52]:
len(distinct_keywords)

1137

### 3.3 Find Keywords Similarity
We use Levenshtein distance. This distance is computed by finding the number of edits which will transform one string to another. The transformations allowed are insertion — adding a new character, deletion — deleting a character and substitution — replace one character by another. By performing these three operations, the algorithm tries to modify first string to match the second one.

In [53]:
# find similar keyword by applying levensthein algorithm with simillarity threshold value between 0.91 and 1.
# the output is in list of tuple of the keywords and its similar words

def keywords_matching(the_keywords):
    start = time.time()
    
    similar_keywords = list()
    copy_distinct_keywords = the_keywords
    for keywords in distinct_keywords:
        for idx in range (len(copy_distinct_keywords)):
            if 0.91 <= textdistance.levenshtein.normalized_similarity(keywords, distinct_keywords[idx]) < 1.0:
                similar_keywords.append((keywords, distinct_keywords[idx]))
          
    print('Total time: ' + str((time.time() - start)) + ' secs')
    return similar_keywords

In [54]:
similar_keywords = keywords_matching(distinct_keywords)

Total time: 2811.1560292243958 secs


In [55]:
similar_keywords 

[('best matress sale', 'best mattress sale'),
 ('mattress macy', 'mattress macys'),
 ('matress store near', 'mattress store near'),
 ('biggest matress size', 'biggest mattress size'),
 ('mattress firm outlet', 'matress firm outlet'),
 ('affordable mattress nyc', 'affordable mattress inc'),
 ('macy full size mattress', 'macys full size mattress'),
 ('macys mattress outlet', 'macy mattress outlet'),
 ('macys matress sale', 'macys mattress sale'),
 ('macys matress sale', 'macy matress sale'),
 ('memorial day mattress sale', 'memorial day matress sale'),
 ('matress firm outlet', 'mattress firm outlet'),
 ('best mattress price', 'best matress price'),
 ('best king mattress', 'best king matress'),
 ('matress deal', 'mattress deal'),
 ('mismatched mattress sale', 'mismatch mattress sale'),
 ('matteress sale', 'mattress sale'),
 ('w mattress sale', 'u mattress sale'),
 ('online matress store', 'online mattress store'),
 ('matress firm near', 'mattress firm near'),
 ('u mattress sale', 'w mattr

In [56]:
df_similar_keyword = pd.DataFrame(similar_keywords, columns=['keyword', 'similar keyword'])

In [57]:
df_similar_keyword

Unnamed: 0,keyword,similar keyword
0,best matress sale,best mattress sale
1,mattress macy,mattress macys
2,matress store near,mattress store near
3,biggest matress size,biggest mattress size
4,mattress firm outlet,matress firm outlet
5,affordable mattress nyc,affordable mattress inc
6,macy full size mattress,macys full size mattress
7,macys mattress outlet,macy mattress outlet
8,macys matress sale,macys mattress sale
9,macys matress sale,macy matress sale
