In [8]:
import pandas as pd
import re
import operator

In [9]:
def fn_clean_data(s):
    """Normalise a raw text value so it can be split into comparable tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete the punctuation characters ( ) , . ! ? : ; " ' -
         (they are removed, not replaced by a space, so "st." becomes "st");
      3. collapse every run of spaces into one space;
      4. strip leading and trailing whitespace.

    Step 4 matters because the tokeniser in this notebook uses split(' '):
    a leading or trailing space would otherwise become an empty-string token
    that is counted in the score. Such a space can come from padded input or
    from step 2 itself (e.g. "hello !" -> "hello ").

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text (may be empty).
    """
    s = s.lower()
    s = re.sub('''[(),.!?:;\"'-]''', '', s)  # drop punctuation characters
    s = re.sub(' +', ' ', s)  # collapse runs of spaces
    return s.strip()

def _fn_joinFields(row, columns=('name', 'address', 'city', 'cuisine')):
    """Join the given fields of one record into a single space-separated string.

    Missing values are skipped so they do not show up as the literal text "nan".
    """
    return ' '.join(str(row[col]) for col in columns if pd.notna(row[col]))


def fn_getData():
    """Load the reference and query tables and build one search string per row.

    Reads 'reference.csv' and 'query.csv' (paths relative to the working
    directory). Both files must provide the columns 'id', 'name', 'address',
    'city' and 'cuisine'. For every row the four text fields are joined with a
    single space and normalised with fn_clean_data.

    The fields are joined WITH a separator on purpose: gluing them together
    directly fuses the last word of one field with the first word of the next
    (e.g. "st." + "new york" -> "stnew york"), which corrupts the tokens.

    Returns
    -------
    reference_df : DataFrame
        Columns ['id', 'ref_name'].
    query_df : DataFrame
        Columns ['id', 'query_name'].
    query_df_orig : DataFrame
        The full query table as read from disk, plus the 'query_name' column
        (see the aliasing note in the body).
    """
    # Reading the data
    reference_df = pd.read_csv('reference.csv')
    query_df = pd.read_csv('query.csv')
    # NOTE: this is the SAME object as query_df, not a copy. The 'query_name'
    # column added below therefore also appears on query_df_orig; fnQueryRefMap
    # removes that column again before the frame is displayed.
    query_df_orig = query_df

    # Building the search strings from the relevant columns
    reference_df['ref_name'] = reference_df.apply(_fn_joinFields, axis=1).apply(fn_clean_data)
    reference_df = reference_df[['id', 'ref_name']]

    query_df['query_name'] = query_df.apply(_fn_joinFields, axis=1).apply(fn_clean_data)
    # Column selection creates a new frame, so query_df_orig keeps every column.
    query_df = query_df[['id', 'query_name']]

    return reference_df, query_df, query_df_orig

# Load both CSV files once. query_df_orig is the full query table; the two
# other frames hold only the id and the cleaned search string of each row.
reference_df, query_df, query_df_orig = fn_getData()

In [10]:
def fn_getSearchTokens(row):
    """Turn one (id, text) record into ``{id: {token: 1, ...}}``.

    Parameters
    ----------
    row : pandas.Series or sequence
        The record id is taken from position 0 and the cleaned search text
        from position 1 (this is what ``DataFrame.apply(..., axis=1)`` passes
        for a two-column frame).

    Returns
    -------
    dict
        A single-entry dict mapping the record id to a dict whose keys are the
        distinct tokens of the text (split on a single space); every value is 1.
        An empty text yields the single token ''.
    """
    if isinstance(row, pd.Series):
        # Rows coming from DataFrame.apply are indexed by column label, so
        # positional access has to go through .iloc; plain row[0] relies on a
        # deprecated fallback of Series.__getitem__.
        search_id = row.iloc[0]
        search_name = row.iloc[1]
    else:
        search_id = row[0]
        search_name = row[1]

    # dict.fromkeys de-duplicates the tokens while keeping first-seen order.
    token_dict = dict.fromkeys(search_name.split(' '), 1)

    return {search_id: token_dict}

# Tokenise every query row; .values gives an array with one {id: tokens} dict per row.
query_search_arr = query_df.apply(fn_getSearchTokens, axis=1).values

# Same tokenisation for the reference rows.
reference_search_arr = reference_df.apply(fn_getSearchTokens, axis=1).values

In [11]:
def fn_computeSimilarity(query_search_arr, reference_search_arr):
    """Score every query against every reference by token overlap.

    For one query/reference pair the score is

        100 * (number of query tokens that also occur in the reference)
            / (number of query tokens)

    The measure is asymmetric: extra tokens in the reference are not
    penalised. A query without any tokens scores 0.0 against every reference
    instead of raising ZeroDivisionError.

    Parameters
    ----------
    query_search_arr : iterable of dict
        Each element maps a query id to a dict keyed by that query's tokens.
    reference_search_arr : iterable of dict
        Each element maps a reference id to a dict keyed by its tokens.

    Returns
    -------
    dict
        ``{query_id: {ref_id: score}}`` with ``score`` a float in [0, 100].
    """
    score_query_dict = {}
    for query in query_search_arr:
        for query_id, query_tokens in query.items():
            # The token count depends on the query only, so compute it once
            # rather than once per reference.
            total = float(len(query_tokens))
            score_ref_dict = {}
            for reference in reference_search_arr:
                for ref_id, ref_tokens in reference.items():
                    if total == 0:
                        # Nothing to match; avoid dividing by zero.
                        score_ref_dict[ref_id] = 0.0
                        continue
                    found = sum(1 for token in query_tokens if token in ref_tokens)
                    score_ref_dict[ref_id] = 100 * (found / total)
            score_query_dict[query_id] = score_ref_dict
    return score_query_dict

# All-pairs scoring: every query is compared with every reference, giving
# one {ref_id: score} dict per query id.
score_query_dict = fn_computeSimilarity(query_search_arr, reference_search_arr)

In [12]:
def fn_matchRefId(row, item_match_dict):
    """Return the id of the best-matching reference for the query in ``row``.

    Parameters
    ----------
    row : pandas.Series or sequence
        Query record whose id sits at position 0.
    item_match_dict : dict
        Maps a query id to a ``(ref_id, score)`` pair.

    Returns
    -------
    The ``ref_id`` stored for this query.

    Raises
    ------
    KeyError
        If the query id has no entry in ``item_match_dict``.
    """
    # Label-indexed Series need .iloc for positional access; plain row[0]
    # relies on a deprecated fallback of Series.__getitem__.
    q_id = row.iloc[0] if isinstance(row, pd.Series) else row[0]
    return item_match_dict[q_id][0]

def fn_matchRefScore(row, item_match_dict):
    """Return the best-match score for the query in ``row``, rounded to 2 decimals.

    Parameters
    ----------
    row : pandas.Series or sequence
        Query record whose id sits at position 0.
    item_match_dict : dict
        Maps a query id to a ``(ref_id, score)`` pair.

    Returns
    -------
    The stored score passed through ``round(score, 2)``.

    Raises
    ------
    KeyError
        If the query id has no entry in ``item_match_dict``.
    """
    # Label-indexed Series need .iloc for positional access; plain row[0]
    # relies on a deprecated fallback of Series.__getitem__.
    q_id = row.iloc[0] if isinstance(row, pd.Series) else row[0]
    return round(item_match_dict[q_id][1], 2)

def fnQueryRefMap(score_query_dict, query_df_orig):
    """Attach the best-matching reference and its score to every query row.

    Parameters
    ----------
    score_query_dict : dict
        ``{query_id: {ref_id: score}}``.
    query_df_orig : DataFrame
        Query table whose FIRST column holds the query id. The frame is
        modified in place: 'pred_ref_id' and 'pred_ref_score' are added (or
        overwritten) and the helper column 'query_name' is removed if present.

    Returns
    -------
    DataFrame
        The same ``query_df_orig`` object, for convenient re-assignment.

    Raises
    ------
    KeyError
        If a query id in the frame has no entry in ``score_query_dict``.

    Notes
    -----
    * Ties are resolved in favour of the reference that comes first in the
      inner dict.
    * A query with no reference scores at all is left missing: its id is None
      and its score is NaN.
    * Removing 'query_name' only when it exists makes the function safe to run
      again on its own output (e.g. when the notebook cell is re-executed).
    """
    item_match_dict = {}
    for q_id, r_dict in score_query_dict.items():
        if r_dict:
            # max() returns the first maximal item, which is the same element a
            # stable descending sort would put first, without the full sort.
            item_match_dict[q_id] = max(r_dict.items(), key=operator.itemgetter(1))
        else:
            item_match_dict[q_id] = (None, float('nan'))

    # Positional access to the id column.
    query_ids = query_df_orig.iloc[:, 0]
    query_df_orig['pred_ref_id'] = query_ids.map(lambda q_id: item_match_dict[q_id][0])
    query_df_orig['pred_ref_score'] = query_ids.map(lambda q_id: round(item_match_dict[q_id][1], 2))

    if 'query_name' in query_df_orig.columns:
        del query_df_orig['query_name']

    return query_df_orig

# Attach the top-scoring reference to each query row; the frame is modified
# in place and returned, so the name is simply re-bound to the same table.
query_df_orig = fnQueryRefMap(score_query_dict, query_df_orig)

In [15]:
# Show the query table together with the predicted reference id and score.
query_df_orig

Unnamed: 0,id,name,address,city,cuisine,pred_ref_id,pred_ref_score
0,1001,chanterelle,2 harrison st.,new york city,french (new),31,75.00
1,1002,daniel,20 e. 76th st.,new york city,french (new),32,88.89
2,1003,dawat,210 e. 58th st.,new york city,indian,33,77.78
3,1004,felidia,243 e. 58th st.,new york city,italian,34,88.89
4,1005,four seasons,99 e. 52nd st.,new york city,american (new),35,90.00
5,1006,gotham bar & grill,12 e. 12th st.,new york city,american (new),36,91.67
6,1007,gramercy tavern,42 e. 20th st.,new york city,american (new),37,90.00
7,1008,island spice,402 w. 44th st.,new york city,caribbean,38,90.00
8,1009,jo jo,160 e. 64th st.,new york city,french bistro,39,70.00
9,1010,la caravelle,33 w. 55th st.,new york city,french (classic),40,81.82
