In [1]:
import pandas as pd

In [17]:
import json
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

In [18]:
def splitDataFrameList(df,target_column,separator):
    '''
    Explode a delimited string column into one row per piece.

    Adapted from https://gist.github.com/jlln/338b4b0b55bd6984f883

    df = dataframe to split,
    target_column = the column containing the string values to split
    separator = the symbol used to perform the split (it is consumed by the
                split and does not appear in the resulting pieces)
    returns: a new dataframe with each entry for the target column separated,
    with each element moved into a new row. The values in the other columns are
    duplicated across the newly divided rows. The input dataframe is not
    modified, and the original columns are preserved even when `df` is empty.
    '''
    new_rows = []
    # Iterate explicitly instead of collecting rows through a side-effecting
    # callback passed to df.apply: older pandas versions may invoke the
    # callback twice on the first row, which would duplicate its pieces.
    for _, row in df.iterrows():
        base_row = row.to_dict()
        for piece in row[target_column].split(separator):
            new_rows.append({**base_row, target_column: piece})
    # Passing the columns explicitly keeps the schema for an empty input, so
    # callers can still address `target_column` on the result.
    return pd.DataFrame(new_rows, columns=df.columns)


In [28]:
class Autocompleter:
    """Suggest full sentences for a typed prefix.

    The workflow is: load the data (`import_json`), split/clean/deduplicate the
    sentences (`process_data`), fit a TF-IDF model on them (`calc_matrice`) and
    rank the known sentences against a prefix (`generate_completions`).
    The class itself holds no state; every method receives what it needs.
    """

    def __init__(self):
        pass

    def import_json(self, json_filename):
        """Load `json_filename` through `load_df` and return the result."""
        print("load json file...")
        # NOTE(review): `load_df` is not defined in the visible cells of this
        # notebook -- confirm it is defined or imported before this is called.
        df = load_df(json_filename)
        return df

    @staticmethod
    def _clean_sentence(text):
        """Normalise whitespace, punctuation spacing and capitalisation of one sentence."""
        text = " ".join(text.split())
        text = text.strip(".")
        text = " ".join(text.split())
        for old, new in ((' i ', ' I '), (' ?', '?'), (' !', '!'), (' .', '.'), ('OK', 'Ok')):
            text = text.replace(old, new)
        # Slice instead of indexing so that an empty fragment (e.g. a lone "."
        # after stripping) does not raise IndexError; it is dropped later by
        # the word-count filter.
        text = text[:1].upper() + text[1:]
        # Sentences starting with "Wh..."/"How" that do not end in "?" get one.
        if re.search(r'^(Wh|How).+([^?])$', text):
            text += "?"
        return text

    def process_data(self, new_df):
        """Split, clean, filter and deduplicate the sentences of the 'Text' column.

        Parameters
        ----------
        new_df : pandas.DataFrame
            Must contain a string column named 'Text'. It is not modified.

        Returns
        -------
        pandas.DataFrame
            One row per distinct cleaned sentence of more than 2 words, with
            the extra columns 'nb_words' (space-separated word count) and
            'Counts' (number of occurrences of the sentence before
            deduplication). The index is reset.
        """
        print("split sentenses on punctuation...")
        for sep in ['. ',', ','? ', '! ', '; ']:
            new_df = splitDataFrameList(new_df, 'Text', sep)

        print("Text Cleaning using simple regex...")
        new_df['Text'] = new_df['Text'].apply(self._clean_sentence)

        print("calculate nb words of sentenses...")
        new_df['nb_words'] = new_df['Text'].apply(lambda x: len(str(x).split(' ')))
        # .copy() so the column assignment below writes to an independent
        # frame rather than a filtered view (avoids SettingWithCopyWarning).
        new_df = new_df[new_df['nb_words']>2].copy()

        print("count occurence of sentenses...")
        new_df['Counts'] = new_df.groupby(['Text'])['Text'].transform('count')

        print("remove duplicates (keep last)...")
        new_df = new_df.drop_duplicates(subset=['Text'], keep='last')

        new_df = new_df.reset_index(drop=True)
        print(new_df.shape)

        return new_df

    def calc_matrice(self, df):
        """Fit a word-level TF-IDF model (1- to 5-grams) on df['Text'].

        Returns
        -------
        (model_tf, tfidf_matrice)
            The fitted vectorizer and the document-term matrix of df['Text'].

        Raises
        ------
        KeyError
            If `df` has no 'Text' column.
        """
        if 'Text' not in df.columns:
            raise KeyError(
                "'Text' column not found; available columns: %s" % list(df.columns)
            )
        # min_df=1 keeps every term, exactly like the former min_df=0, but is
        # accepted by scikit-learn versions that validate that an integer
        # min_df is at least 1.
        model_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 5), min_df=1)
        tfidf_matrice = model_tf.fit_transform(df['Text'])
        print("tfidf_matrice ", tfidf_matrice.shape)
        return model_tf, tfidf_matrice

    def generate_completions(self, prefix_string, data, model_tf, tfidf_matrice,
                             n_candidates=10, n_results=3):
        """Return the best-matching sentences of `data` for `prefix_string`.

        Parameters
        ----------
        prefix_string : object
            Text typed so far; converted with `str`.
        data : pandas.DataFrame
            Must contain 'Text' and 'Counts'; row i must correspond to row i
            of `tfidf_matrice`.
        model_tf : object
            Fitted vectorizer exposing `transform`.
        tfidf_matrice : matrix
            Document-term matrix produced by `model_tf` for `data['Text']`.
        n_candidates : int, default 10
            How many of the most similar sentences are re-ranked by frequency.
        n_results : int, default 3
            How many sentences are returned.

        Returns
        -------
        list of str
            Up to `n_results` sentences, best first.
        """
        prefix_string = str(prefix_string)
        new_df = data.reset_index(drop=True)
        # Frequent sentences get a mild (logarithmic) boost.
        weights = 1 + np.log1p(new_df['Counts'].to_numpy(dtype=float))

        # Dot product between every sentence vector and the prefix vector
        # (what linear_kernel computes); with the vectorizer's default L2
        # normalisation this is the cosine similarity.
        prefix_vector = model_tf.transform([prefix_string])
        product = tfidf_matrice @ prefix_vector.T
        if hasattr(product, "toarray"):
            product = product.toarray()
        scores = np.asarray(product, dtype=float).ravel()

        # Stable sorts keep the original row order among equal scores.
        candidates = np.argsort(-scores, kind="stable")[:n_candidates]
        weighted_scores = scores[candidates] * weights[candidates]
        best = candidates[np.argsort(-weighted_scores, kind="stable")[:n_results]]

        return new_df.loc[best, 'Text'].tolist()

# Loading

In [22]:
#import autocompleter 
# Create the autocompleter from the class defined in this notebook; its constructor takes no arguments.
autocompl = Autocompleter()

In [None]:
# Load the conversations file and display the frame's shape and column names.
# NOTE(review): import_json calls `load_df`, which is not defined in the visible cells -- confirm it is available.
df = autocompl.import_json("sample_conversations.json")
df.shape, df.columns

load json file...
(22264, 3)


((22264, 3), Index(['IsFromCustomer', 'Text', 'index'], dtype='object'))

In [23]:
# NOTE(review): this rebinds `df`, replacing the frame loaded from sample_conversations.json in the previous cell -- confirm this is intended.
df = pd.read_csv("g02-federalspending.txt")

The file contains 22K conversations between a customer and a representative.
For the purpose of this project, we are only interested in completing the threads of the representative.

In [24]:
# Preview the first rows of the frame currently bound to `df`.
df.head()

Unnamed: 0,Given,When,Then
0,As a UI designer,I want to redesign the Resources page,so that it matches the new Broker design styles.
1,As a UI designer,I want to report to the Agencies about user t...,so that they are aware of their contributions...
2,As a UI designer,I want to move on to round 2 of DABS or FABS ...,so that I can get approvals from leadership.
3,As a UI designer,I want to move on to round 2 of Homepage edits,so that I can get approvals from leadership.
4,As a UI designer,I want to move on to round 3 of the Help page...,so that I can get approvals from leadership.


# Data Selection and Cleaning

The data-preparation step separates the threads of the customer and the representative, splits the sentences on punctuation (we will keep the punctuation), cleans the resulting text with some light regex, and keeps only the sentences longer than 2 words.

Finally, since the representative tends to ask the same questions over and over again, the autocomplete is extremely useful because it can suggest a complete sentence. In our case, we count the number of occurrences of each sentence so that we can use it as a feature later on, and then delete the duplicates.

In [30]:
# NOTE(review): process_data is bypassed here, so `new_df` is the raw frame as loaded.
# calc_matrice reads new_df['Text'] and generate_completions reads new_df['Counts'];
# the preview above shows neither column, which matches the KeyError raised by the next cell.
# TODO confirm which column should serve as 'Text' and re-enable the processing step.
#new_df = autocompl.process_data(df)
#new_df.shape, new_df.columns
new_df=df

# Model and TFIDF matrix

A TF-IDF matrix is computed from the word n-grams (1 to 5 words) of every sentence in the data using TfidfVectorizer; it is later used to measure similarity.

In [31]:
# Fit the TF-IDF vectorizer on new_df['Text']; keep both the fitted model and the resulting matrix for ranking.
model_tf, tfidf_matrice = autocompl.calc_matrice(new_df)

KeyError: ignored

# Ranking Function

# New Section

Finally, the autocomplete calculates the similarity between the sentences in the data and the prefix of the sentence written by the representative. As a weighting feature, we re-rank the most similar sentences by how frequently each one occurs in the data.

Examples of auto-completions:

In [None]:
# Example: question prefix; the last expression displays the returned completions.
prefix = 'What is your'

print(prefix,"    \n ")

autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

What is your     
 


['What is your account number?',
 'What is your order number?',
 'What is your phone number?']

In [None]:
# Example: a two-word question prefix.
prefix = 'How can'
print(prefix,"     ")
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

How can      


['How can I help you?',
 'How can I help you today?',
 'Ok lets see how I can help']

In [None]:
# Example: a statement prefix rather than a question.
prefix = 'Let me'
print(prefix,"     ")
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

Let me      


['Let me investigate', 'Let me assist you', 'Let me look']

In [None]:
# Example: an all-lowercase prefix.
prefix = 'when was'
print(prefix,"     ")
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

when was      


['When was the last time you changed your password?',
 'When was your flight scheduled for?',
 'When was the last time you tried?']

Now, without any uppercase and just with the important words...

In [None]:
# Example: non-contiguous keywords instead of a literal sentence prefix.
prefix = 'when time password'
print(prefix,"     ")
autocompl.generate_completions(prefix, new_df, model_tf,tfidf_matrice)

when time password      


['When was the last time you changed your password?',
 'When you select you password?',
 'Take your time']

# Online Sources for this project

In [None]:
# https://gist.github.com/jlln/338b4b0b55bd6984f883 modified to keep punctuation
# kaggle google store competition for json read
# https://www.kaggle.com/hamishdickson/weighted-word-autocomplete-using-star-wars-dataset

In [None]:
#