In [6]:
import enchant

In [11]:
from nltk.metrics import edit_distance

class SpellingReplacer:
    def __init__(self, dict_name='en_US', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = 2

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)

        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

In [12]:
def spell_check(word_list):
    checked_list = []
    for item in word_list:
        replacer = SpellingReplacer()
        r = replacer.replace(item)
        checked_list.append(r)
    return checked_list


In [23]:
 word_list = ['car', 'Helo','hellp','venmo']

In [24]:
print(spell_check(word_list))

['car', 'Help', 'hello', 'venom']


In [48]:
import json
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances


DATA_DIR = './'   # navigation to the directory

def load_df(json_path='name.json'):
    """
    source: borrowed to kaggle competition gstore
    """
    df = pd.read_json(DATA_DIR+json_path)
    
    for column in ['Issues']:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [str(column+"_"+subcolumn) for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    
    ## function allows to keep the index if we need to merge on the orginal data.
    df = pd.DataFrame([dict(y, index=i) for i, x in enumerate(df['Issues_Messages'].values.tolist()) for y in x])
    
    print(df.shape)
    return df


def splitDataFrameList(df,target_column,separator):
    
    ''' 
    source: https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation
    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def split_text(line, separator):
        splited_line =  [e+d for e in line.split(separator) if e]
        return splited_line
    
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df




class Autocompleter:
    def __init__(self):
        pass

    def import_json(self, json_filename):   # converts json to dataframe
        print("load json file...")
        df = load_df(json_filename)
        return df
        
    def process_data(self, new_df):

        print("select representative threads...")
        new_df = new_df[new_df.IsFromCustomer==False]
        
        print("split sentenses on punctuation...")
        for sep in ['. ',', ','? ', '! ', '; ']:
            new_df = splitDataFrameList(new_df, 'Text', sep)
            
        print("Text Cleaning using simple regex...")
        new_df['Text']=new_df['Text'].apply(lambda x: " ".join(x.split()))
        new_df['Text']=new_df['Text'].apply(lambda x: x.strip("."))
        new_df['Text']=new_df['Text'].apply(lambda x: " ".join(x.split()))
        new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' i ',' I '))
        new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' ?','?'))
        new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' !','!'))
        new_df['Text']=new_df['Text'].apply(lambda x: x.replace(' .','.'))
        new_df['Text']=new_df['Text'].apply(lambda x: x.replace('OK','Ok'))
        new_df['Text']=new_df['Text'].apply(lambda x: x[0].upper()+x[1:])
        new_df['Text']=new_df['Text'].apply(lambda x: x+"?" if re.search(r'^(Wh|How).+([^?])$',x) else x)
        
        print("calculate nb words of sentenses...")
        new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
        new_df = new_df[new_df['nb_words']>2]
        
        print("count occurence of sentenses...")
        new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')
        
        print("remove duplicates (keep last)...")
        new_df = new_df.drop_duplicates(subset=['Text'], keep='last')
        
        new_df = new_df.reset_index(drop=True)
        print(new_df.shape)  
        
        return new_df
    
    def calc_matrice(self, df):
        # define tfidf parameter in order to count/vectorize the description vector and then normalize it.
        model_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 5), min_df=0)
        tfidf_matrice = model_tf.fit_transform(df['Text'])
        print("tfidf_matrice ", tfidf_matrice.shape)
        return model_tf, tfidf_matrice

    def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice):
        
        prefix_string = str(prefix_string)              #make a copy of the prefix index
        new_df = data.reset_index(drop=True)            #resets indexes to cover the removed elements
        
        #giving more weight to those phrases that occure more frequently
        
        weights = new_df['Counts'].apply(lambda x: 1+ np.log1p(x)).values      
        
       

        # tranform the string using the tfidf model
        tfidf_matrice_spelling = model_tf.transform([prefix_string])
        # calculate cosine_matrix
        # to find the similarity between the string and dataframe
        cosine_similarite = linear_kernel(tfidf_matrice, tfidf_matrice_spelling)
        
        #sort by order of similarity from 1 to 0:
        similarity_scores = list(enumerate(cosine_similarite))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        #selects top 10
        similarity_scores = similarity_scores[0:10]
        #print (similarity_scores)

        similarity_scores = [i for i in similarity_scores]
        similarity_indices = [i[0] for i in similarity_scores]
        #print(similarity_indices)
        #print(similarity_scores)

        #add weight to the potential results that had high frequency in orig data
        for i in range(len(similarity_scores)):
            similarity_scores[i][1][0]=similarity_scores[i][1][0]*weights[similarity_indices][i]
        
        #sort in terms of score

        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
        #select top 3 suggestions
        similarity_scores = similarity_scores[0:3]
        #obtain the indexes of the top suggestions
        similarity_indices_w = [i[0] for i in similarity_scores]
        
        #print(similarity_scores)
        #returns the top suggestions form the dataframe
        return new_df.loc[similarity_indices_w]['Text'].tolist()

In [49]:
autocompl = Autocompleter()

In [50]:
df = autocompl.import_json("sample_file.json")
df.shape, df.columns

load json file...
(22264, 3)


  column_as_df = json_normalize(df[column])


((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))

In [51]:

new_df = autocompl.process_data(df)
new_df.shape, new_df.columns

select representative threads...
split sentenses on punctuation...
Text Cleaning using simple regex...
calculate nb words of sentenses...
count occurence of sentenses...
remove duplicates (keep last)...
(8560, 5)


((8560, 5),
 Index(['IsFromCustomer', 'Text', 'index', 'nb_words', 'Counts'], dtype='object'))

In [56]:

model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)
print (model_tf , tfidf_matrice)

tfidf_matrice  (8560, 99397)
TfidfVectorizer(min_df=0, ngram_range=(1, 5))   (0, 33836)	0.22506411601749762
  (0, 87558)	0.23525303720560317
  (0, 32778)	0.23525303720560317
  (0, 44183)	0.22506411601749762
  (0, 33835)	0.2178349613878467
  (0, 87557)	0.23525303720560317
  (0, 32777)	0.23525303720560317
  (0, 32959)	0.1846205964173706
  (0, 44182)	0.2178349613878467
  (0, 33834)	0.2178349613878467
  (0, 87556)	0.23525303720560317
  (0, 32774)	0.22506411601749762
  (0, 96514)	0.16332887254348138
  (0, 32868)	0.11505321593595512
  (0, 44181)	0.2178349613878467
  (0, 33830)	0.2122275934232327
  (0, 87552)	0.22506411601749762
  (0, 32773)	0.22506411601749762
  (0, 81915)	0.1239111676844768
  (0, 92912)	0.04764879691120654
  (0, 32780)	0.10717536829998196
  (0, 44108)	0.15487367126385715
  (0, 33739)	0.1466392237166858
  (0, 87551)	0.2122275934232327
  (0, 32760)	0.2037723921436084
  :	:
  (8559, 25383)	0.12567308091854906
  (8559, 68058)	0.12541884066144987
  (8559, 68057)	0.12541884066144

In [53]:
prefix = 'What is your'

print(prefix,"    \n ")

autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

What is your     
 
[(4938, array([0.62496768])), (7704, array([0.60085879])), (1723, array([0.5619263])), (3401, array([0.52970721])), (6526, array([0.52830733])), (4400, array([0.4896813])), (5326, array([0.48546842])), (8445, array([0.48546842])), (7694, array([0.48025837])), (6652, array([0.46864655]))]
[(8445, array([2.32257299])), (6652, array([2.09285171])), (7694, array([1.78082213]))]


['What is your account number?',
 'What is your order number?',
 'What is your phone number?']

In [54]:
prefix = 'How can'
print(prefix,"     ")
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

How can      
[(6303, array([0.55285577])), (7084, array([0.41376716])), (6285, array([0.36161791])), (8364, array([0.30942956])), (8376, array([0.29339882])), (6271, array([0.2367474])), (1747, array([0.17666316])), (2250, array([0.16287445])), (6693, array([0.15181223])), (6906, array([0.14698879]))]
[(6303, array([0.93606619])), (7084, array([0.86833685])), (8364, array([0.64937268]))]


['How can I help you?',
 'How can I help you today?',
 'Ok lets see how I can help']

In [38]:
new_df.head()

Unnamed: 0,IsFromCustomer,Text,index,nb_words,Counts
0,False,Hello Werner how may I help you today?,3,8,1
1,False,Sure I can help you with that,3,7,1
2,False,Let me update that information on our system,3,8,1
3,False,I have updated your address to the system,3,8,1
4,False,Ok let me go ahead and request a work order fo...,3,14,1


In [37]:
new_df.reset_index()


Unnamed: 0,level_0,IsFromCustomer,Text,index,nb_words,Counts
0,0,False,Hello Werner how may I help you today?,3,8,1
1,1,False,Sure I can help you with that,3,7,1
2,2,False,Let me update that information on our system,3,8,1
3,3,False,I have updated your address to the system,3,8,1
4,4,False,Ok let me go ahead and request a work order fo...,3,14,1
...,...,...,...,...,...,...
8555,8555,False,Sorry for the wait,1505,4,9
8556,8556,False,I can help you,1505,4,37
8557,8557,False,I can help,1506,3,20
8558,8558,False,Sorry to hear that I can help you with that,1507,10,1


In [39]:
model_tf.transform(['How can'])

<1x99397 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>