### 1. GOAL

The goal of this task is to find similar keywords, for instance: 'matresses sale' and 'matresses sales'

### 2. DATA

The data is from excel file organic keywords. We only use first which contains the keywords


### 3. TECHNIQUE SUMMARY

    3.1 Load data
    3.2 Tokenize and clean data
    3.3 Find keywords similarity

In [18]:
%matplotlib inline

import nltk.data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
import string
import textdistance

import pandas as pd
import numpy as np

import time

In [19]:
# get keywords column from macys_organic_keywords excel file
# this file already modified from the original csv file to make easy to get the keywords
df = pd.read_excel('macys_organic_keywords.xlsx', sheetname='macys_organic_keywords')

In [20]:
df["Keyword"]

0                                    mattress sale
1                                   mattress sales
2                                   mattress deals
3                           mattress sales near me
4                              macys mattress sale
5                              mattresses for sale
6                             macy's mattress sale
7                                mattress for sale
8                            mattress sale near me
9                                   macys mattress
10                                    matress sale
11                              mattresses on sale
12                     mattresses for sale near me
13                                macys mattresses
14                              macy mattress sale
15                                   matress sales
16                       mattress for sale near me
17                                mattress on sale
18                              sale on mattresses
19                     mattress

### 3.2 Tokenize and Clean Data

In [21]:
# tokenize keywords
df["tokenized_keywords"] = df["Keyword"].apply(nltk.word_tokenize)

In [22]:
# convert dataframe to list of list
tokenized_keywords = df["tokenized_keywords"].values.tolist()

In [23]:
# Clean keywords
# Convert to lowercase
# Remove punctuation from each word
# Filter out remaining tokens that are not alphabetic
# Filter out tokens that are stop words
# lemmatization
def clean_text(list_tokens):
    #ps = PorterStemmer()
    word_lemma = WordNetLemmatizer()
    cleaned_tokens = list()
    for tokens in list_tokens:
        # convert to lower case
        tokens = [w.lower() for w in tokens]
        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        # filter out stop words
        #nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        # stemming
        # words = [ps.stem (w) for w in words]
        # lemmatize
        # nltk.download('wordnet')
        words = [word_lemma.lemmatize (w) for w in words]
        cleaned_tokens.append(words)
    return cleaned_tokens

In [24]:
# apply clean_text method to the tokenized_keywords
cleaned_keywords = clean_text(tokenized_keywords)

In [25]:
cleaned_keywords

[['mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'deal'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['macy', 'mattress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['matress', 'sale'],
 ['mattress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['macys', 'mattress'],
 ['macy', 'mattress', 'sale'],
 ['matress', 'sale'],
 ['mattress', 'sale', 'near'],
 ['mattress', 'sale'],
 ['sale', 'mattress'],
 ['mattress', 'sale', 'weekend'],
 ['sale', 'mattress'],
 ['sale', 'mattress'],
 ['macy', 'mattress'],
 ['best', 'mattress', 'deal'],
 ['macys', 'mattress', 'sale', 'date'],
 ['mattress', 'sale', 'near'],
 ['macys', 'matress', 'sale'],
 ['mattress', 'deal', 'near'],
 ['mattress', 'sale', 'macy'],
 ['macys', 'mattress', 'sale'],
 ['macy', 'mattress'],
 ['mattress', 'sale', 'today'],
 ['mattress', 'sale', 'macys'],
 ['bed', 'mattress', 'deal'],
 ['macy', 'mattress'],
 ['macy', 'mattress', 'sale', 'coupo

In [26]:
# convert token to sentence
sent_keywords = list()
def token_to_sent(tokenized_keywords):
    for token_keywords in tokenized_keywords:
        sent_keywords.append(' '.join(token_keywords))
    return sent_keywords

In [27]:
sentence_keywords = token_to_sent(cleaned_keywords)

In [28]:
sentence_keywords

['mattress sale',
 'mattress sale',
 'mattress deal',
 'mattress sale near',
 'macys mattress sale',
 'mattress sale',
 'macy mattress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'matress sale',
 'mattress sale',
 'mattress sale near',
 'macys mattress',
 'macy mattress sale',
 'matress sale',
 'mattress sale near',
 'mattress sale',
 'sale mattress',
 'mattress sale weekend',
 'sale mattress',
 'sale mattress',
 'macy mattress',
 'best mattress deal',
 'macys mattress sale date',
 'mattress sale near',
 'macys matress sale',
 'mattress deal near',
 'mattress sale macy',
 'macys mattress sale',
 'macy mattress',
 'mattress sale today',
 'mattress sale macys',
 'bed mattress deal',
 'macy mattress',
 'macy mattress sale coupon',
 'macy mattress sale',
 'bed mattress sale',
 'bed mattress sale',
 'macy matress sale',
 'mattress sale today',
 'macy labor day mattress sale',
 'best mattress sale today',
 'mattress deal',
 'mattress macy',
 'matress sale',
 'buy matt

In [29]:
# distinct keywords
distinct_keywords = list(set(sentence_keywords))

In [30]:
distinct_keywords

['mattress firm cyber monday',
 'mattress online sale',
 'macys bedding deal',
 'costco mattress full size',
 'new mattress sale cheap',
 'sleep mattress near',
 'best mattress store near',
 'mattress firm la vega',
 'california twin mattress',
 'mattress one sale ad',
 'doctor choice firm mattress',
 'buy good cheap mattress',
 'denver mattress pillow top',
 'price bed mattress',
 'buy cheap mattress online',
 'queen king mattress',
 'mattress sale free shipping',
 'furniture mattress discount king',
 'macys mattress store',
 'cheap new mattress set',
 'twin mattress set mattress firm',
 'mattress firm close',
 'sleep country mattress sale',
 'best time buy matress',
 'three quarter bed mattress box spring',
 'buy mattress',
 'matresses sale',
 'sleep train mattress sale',
 'quality mattress discount price',
 'king mattress near',
 'futon mattress san antonio',
 'mattress firm semi annual sale',
 'mattress king chicago',
 'mattress store online',
 'best deal mattress furniture',
 'bes

In [31]:
len(distinct_keywords)

1137

### 3.3 Find Keywords Similarity
We use Levenshtein distance. This distance is computed by finding the number of edits which will transform one string to another. The transformations allowed are insertion — adding a new character, deletion — deleting a character and substitution — replace one character by another. By performing these three operations, the algorithm tries to modify first string to match the second one.

In [32]:
# find similar keyword by applying levensthein algorithm with simillarity threshold value between 0.91 and 1.
# the output is in list of tuple of the keywords and its similar words

def keywords_matching(the_keywords):
    start = time.time()
    
    similar_keywords = list()
    copy_distinct_keywords = the_keywords
    for keywords in distinct_keywords:
        for idx in range (len(copy_distinct_keywords)):
            if 0.91 <= textdistance.levenshtein.normalized_similarity(keywords, distinct_keywords[idx]) < 1.0:
                similar_keywords.append((keywords, distinct_keywords[idx]))
          
    print('Total time: ' + str((time.time() - start)) + ' secs')
    return similar_keywords

In [33]:
similar_keywords = keywords_matching(distinct_keywords)

Total time: 2846.1817483901978 secs


In [34]:
similar_keywords 

[('new mattress sale cheap', 'bed mattress sale cheap'),
 ('macys mattress store', 'macy mattress store'),
 ('best time buy matress', 'best time buy mattress'),
 ('matress sale', 'mattress sale'),
 ('futon mattress stlouis', 'futon mattress st louis'),
 ('matteress sale', 'mattress sale'),
 ('cheap bed mattress deal', 'cheap bed matress deal'),
 ('macy bed sale', 'macys bed sale'),
 ('macys mattress sale date', 'macy mattress sale date'),
 ('bed matress sale', 'bed mattress sale'),
 ('matress sale near', 'mattress sale near'),
 ('macy mattress sale date', 'macys mattress sale date'),
 ('macys mattress', 'macys matress'),
 ('macys mattress', 'macy mattress'),
 ('bed matress set', 'bed mattress set'),
 ('u mattress sale', 'w mattress sale'),
 ('best low priced mattress', 'best low price mattress'),
 ('used mattress sale near', 'bed mattress sale near'),
 ('discount mattress near', 'discounted mattress near'),
 ('mattress sale macys', 'mattress sale macy'),
 ('mattress sale macy', 'mattre

In [35]:
df_similar_keyword = pd.DataFrame(similar_keywords, columns=['keyword', 'similar keyword'])

In [36]:
df_similar_keyword

Unnamed: 0,keyword,similar keyword
0,new mattress sale cheap,bed mattress sale cheap
1,macys mattress store,macy mattress store
2,best time buy matress,best time buy mattress
3,matress sale,mattress sale
4,futon mattress stlouis,futon mattress st louis
5,matteress sale,mattress sale
6,cheap bed mattress deal,cheap bed matress deal
7,macy bed sale,macys bed sale
8,macys mattress sale date,macy mattress sale date
9,bed matress sale,bed mattress sale


In [37]:
df_similar_keyword.to_csv('similar_keywords.csv')