In [6]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

bert_base_mean = SentenceTransformer('bert-base-nli-mean-tokens')
bert_base_max = SentenceTransformer('bert-base-nli-max-tokens')

In [7]:
bert_large_mean = SentenceTransformer('bert-large-nli-mean-tokens')

In [8]:
bert_large_max = SentenceTransformer('bert-large-nli-max-tokens')

In [9]:
import sys
sys.path.append("/Users/dhairya/Documents/talla_sys/magic")
from magic.models import metrics

def get_metrics(max_results: int, results: pd.Series, ) -> dict:
    Rs = np.zeros(len(results)) + 1

    truncated_pk = [np.mean([metrics.precision_at_k(k + 1, yrel)
                             for yrel in results]) for k in range(max_results)]
    truncated_pk = [round(val, 4) for val in truncated_pk]

    truncated_rk = [np.mean([metrics.recall_at_k(k + 1, yrel, R=1)
                             for yrel in results]) for k in range(max_results)]
    truncated_rk = [round(val, 4) for val in truncated_rk]

    return {"mrr": round(metrics.mrr(max_results, results), 4),
            "map": round(metrics.map(max_results, results), 4),
            "truncated mrr": round(metrics.truncated_mrr(results, Rs), 4),
            "truncated map": round(metrics.truncated_map(results, Rs), 4),
            "truncated precision@k": truncated_pk,
            "truncated recall-1@k": truncated_rk}

In [10]:
import faiss
from typing import List
import numpy as np

def build_semantic_index(feature_size: int, embeddings: np.array) -> 'index':
    index = faiss.IndexIDMap(faiss.IndexFlatIP(feature_size))
    index.add_with_ids(embeddings, np.array(range(0, len(embeddings))))
    return index

def search(query_embedding: np.array, 
           index:faiss.swigfaiss.IndexIDMap,
           num_results: int = 4) -> List[int]:
    query_vector = np.array(query_embedding)
    k = num_results
    top_k = index.search(query_vector, k)
    return top_k[1][0]

# Demo

In [11]:
import time

data = [
    "How do I signup for Autofile in Texas?",
    "How do I signup for Autofile in Wisconsin?",
    "How do I signup for Autofile in New Jersey?",
    "Texas Autofile note?",
    "Autofile Tax Setup?",
    "Creative Cloud",
    "What are the primary colors? The primary colors red, blue, and green.",
    "Red Red Red. THat is primary.",
    "Do I need to be online to access my desktop apps?",
    "How do I access the applications in Creative Cloud?",
    "How do I set applications password in Creative Cloud?",
    "As a Creative Cloud member, am I required to install an upgrade to a desktop application when it becomes avaiable?"
]

de = np.array(bert_large_max.encode(data))

index = build_semantic_index(1024, de)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


In [15]:
ri = search(bert_large_max.encode(["Autofiling in Texas"]),
            index, 4)

for _, i in enumerate(ri):
    print(f"{_+1}: {data[i]}")

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.99it/s]

1: Texas Autofile note?
2: How do I signup for Autofile in Texas?
3: Autofile Tax Setup?
4: How do I signup for Autofile in New Jersey?





# TaxJar Experiments

## Load Taxjar data

In [1]:
import pandas as pd
import numpy as np
import pickle 

tj_doc_headers = pd.read_csv("data/taxjar_doc_headers.csv")
tj_ke_headers = pd.read_csv("data/taxjar_ke_headers.csv")
tj_map = pickle.load(open("data/tj_map.pkl", "rb"))
tj_queries = pd.read_csv("data/taxjar_queries.csv")

tj_ke_headers["title"] = tj_ke_headers["title"].apply(lambda x: str(x).strip())
tj_doc_headers["title"] = tj_doc_headers["title"].apply(lambda x: str(x).strip())

### Generate Embeddings and Index
Embeddings Strat: Bert Large + Max pooling

In [104]:
ke_embeddings = bert_large_max.encode(tj_ke_headers["title"])

Batches: 100%|██████████| 29/29 [01:00<00:00,  3.56s/it]


In [105]:
doc_embeddings = bert_large_max.encode(tj_doc_headers["title"])

Batches: 100%|██████████| 14/14 [00:28<00:00,  2.75s/it]


In [107]:
ke_embeddings = np.array(ke_embeddings)
doc_embeddings = np.array(doc_embeddings)
ke_embeddings.shape

(923, 1024)

In [109]:
ke_index = build_semantic_index(1024, ke_embeddings)
doc_index = build_semantic_index(1024, doc_embeddings)

In [189]:
# from sklearn.metrics.pairwise import cosine_similarity
# import ast 

# final_res = []

# for i, row in tj_queries.iterrows():
#     query = row["query"]
#     gold_ke_id = row["gold_knowledge_id"]
#     gold_doc_id = row["gold_document_id"]
    
#     query_embed = np.array(bert_large_max.encode([query]))
#     ke_res_ids = [tj_ke_headers.iloc[i].ke_id for i in search(query_embed, ke_index, 4)]
#     ke_res_text = [tj_map[i]["title"] for i in ke_res_ids]
#     doc_res_ids = [tj_doc_headers.iloc[i].doc_id for i in search(query_embed, doc_index, 4)]
#     doc_res_text = [tj_map[i]["title"] for i in doc_res_ids]
    
#     all_ids = np.hstack([ke_res_ids, doc_res_ids])
#     all_text = np.hstack([ke_res_text, doc_res_text])
#     all_text_embed = np.array(bert_large_max.encode(all_text))
#     cos_scores = cosine_similarity(query_embed, all_text_embed)
#     best_scores =  np.argsort(cos_scores)[::-1][0][:4]
#     best_ids = [all_ids[i] for i in best_scores]
    
#     bert_eval = [int(i in [gold_ke_id, gold_doc_id]) for i in best_ids]
    
#     final_res.append({"query_id": row.query_id,
#                       "query": row.query,
#                       "bert_eval": bert_eval,
#                       "bert_correct": np.sum(bert_eval),
#                       "bert_rank": np.argmax(bert_eval) + 1 if np.sum(bert_eval) > 0 else -1,
#                       "baseline_eval": ast.literal_eval(row.eval_results),
#                       "baseline_correct": row.correct_flag,
#                       "baseline_rank": np.argmax(ast.literal_eval(row.eval_results)) + 1 if np.sum(ast.literal_eval(row.eval_results)) > 0 else -1})


In [176]:
query_type = tj_queries[["query_id", "query_type"]]
df = pd.DataFrame(final_res)

df = pd.merge(df, query_type)

In [178]:
print("Overall BERT-Large metrics")
display(get_metrics(4, df["bert_eval"]))

print("BERT metrics for questions")
display(get_metrics(4, df.query("query_type=='question'")["bert_eval"]))

print("BERT metrics for search")
display(get_metrics(4, df.query("query_type!='question'")["bert_eval"]))

Overall BERT-Large metrics


{'mrr': 0.0794,
 'map': 0.0803,
 'truncated mrr': 0.0794,
 'truncated map': 0.1048,
 'truncated precision@k': [0.051, 0.0449, 0.0422, 0.0429],
 'truncated recall-1@k': [0.051, 0.0898, 0.1265, 0.1714]}

BERT metrics for questions


{'mrr': 0.0779,
 'map': 0.0786,
 'truncated mrr': 0.0779,
 'truncated map': 0.1073,
 'truncated precision@k': [0.0459, 0.0419, 0.0423, 0.0453],
 'truncated recall-1@k': [0.0459, 0.0838, 0.127, 0.1811]}

BERT metrics for search


{'mrr': 0.084,
 'map': 0.0854,
 'truncated mrr': 0.084,
 'truncated map': 0.0969,
 'truncated precision@k': [0.0667, 0.0542, 0.0417, 0.0354],
 'truncated recall-1@k': [0.0667, 0.1083, 0.125, 0.1417]}

In [187]:
ri = len(df.query("bert_correct > 0 and baseline_correct == 0")) 
print(f"Recall improvement: {ri}, {ri/len(df)}%")
 
rru = len(df.query("bert_rank > 0 & baseline_rank > bert_rank"))
print(f"Rank improvement: {rru}, {rru / len(df)}", )

Recall improvement: 5, 0.01020408163265306%
Rank improvement: 13, 0.026530612244897958


In [184]:
df.query("bert_correct > 0 and baseline_correct == 0")

Unnamed: 0,query_id,query,bert_eval,bert_correct,bert_rank,baseline_eval,baseline_correct,baseline_rank,query_type
41,f6e48be2-71ea-4a2d-be6f-391a0e5ecfc1,How do I import a transaction via CSV,"[0, 1, 0, 0]",1,2,"[0, 0, 0, 0, 0, 0, 0, 0]",0,-1,question
47,bc0e21c5-0c2b-49d6-8292-4e9d2d0cdadc,How do I import a CSV,"[0, 1, 0, 0]",1,2,"[0, 0, 0, 0, 0, 0, 0, 0]",0,-1,question
163,90119224-18c2-43a6-8576-fd5c3712f435,I need legal advice for my business. Can TaxJa...,"[1, 0, 0, 0]",1,1,[0],0,-1,question
366,ef453889-148b-41be-9a65-97abbb2fb05f,what should I do if i didn't collect sales taxes?,"[0, 1, 0, 0]",1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,-1,question
414,97b35439-2ff0-4133-ab0d-c8b405ab7f55,Does taxjar collect for me?,"[0, 0, 0, 1]",1,4,"[0, 0, 0, 0, 0, 0, 0, 0]",0,-1,question


In [188]:
df.query("bert_rank > 0 & baseline_rank > bert_rank")

Unnamed: 0,query_id,query,bert_eval,bert_correct,bert_rank,baseline_eval,baseline_correct,baseline_rank,query_type
3,61c7fea3-3ac3-4df7-867b-776f16bd1277,Why is a state on my dashboard?,"[0, 0, 1, 1]",2,3,"[0, 0, 0, 1, 0, 0, 0, 0, 0]",1,4,question
37,fda37a15-8258-43ca-9982-694b625e3a88,How do I install TaxJar in Magento,"[1, 0, 0, 0]",1,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,3,question
154,769a595a-8ed7-445b-bff3-696d94d4bd86,How should I set up PayPal rates?,"[1, 1, 0, 0]",2,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,3,question
155,f3c91f63-329a-436b-9917-fe3524d45417,How should I set up PayPal rates?,"[1, 1, 0, 0]",2,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,3,question
169,fcfa4f29-e174-4c3b-b0f1-3eaf68540d75,Can I use eBay managed payments?,"[1, 0, 0, 0]",1,1,"[0, 1, 0]",1,2,question
195,d8214eb6-f247-4b7b-b51a-56f7333431d4,can i link multiple stores to taxjar?,"[1, 0, 1, 0]",2,1,"[0, 0, 1, 0, 0, 0, 0]",1,3,question
259,ca038075-3280-4772-ba7f-1d797f74bbc0,I use Square Register,"[1, 1, 0, 0]",2,1,"[0, 1, 0, 0, 0, 0, 0, 0]",1,2,search
291,423dba1b-5a14-41bd-b25b-9e7dbdf66182,will i be billed more if i add more than one a...,"[0, 0, 0, 1]",1,4,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",1,6,question
355,c2e0485f-446c-41ff-9902-cdc3352533e7,can i still see rates after installing taxjar,"[0, 1, 1, 0]",2,2,"[0, 0, 1, 0, 0, 0, 0, 0]",1,3,question
374,1979cacc-3ce0-42b0-9bcf-363a2b264b0d,can i connect all of my sales channels to taxjar?,"[1, 0, 1, 0]",2,1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1,3,question


In [2]:
tj_queries.query("query_id=='fcfa4f29-e174-4c3b-b0f1-3eaf68540d75'")

Unnamed: 0,gold_document_id,gold_knowledge_id,label_source,org,org_id,query,query_type,useful_results,organization_id,query_id,eval_results,correct_flag
169,07181406-aed9-4586-8adb-dd8ec9dc1bf3,328715d5-45bb-4fc3-8580-5c002bee2fe3,feedback,TaxJar,82167,Can I use eBay managed payments?,question,1.0,82167.0,fcfa4f29-e174-4c3b-b0f1-3eaf68540d75,"[0, 1, 0]",1


In [3]:
tj_map["328715d5-45bb-4fc3-8580-5c002bee2fe3"]

{'title': 'How do I connect my eBay Account to TaxJar?',
 'content': "We do not currently have a direct integration with eBay or with their new Managed Payments system.        However, we do have an integration with PayPal, and if your eBay payments are processed through PayPal, you can connect your eBay account to TaxJar by   connecting PayPal as a Linked Account   .          Once you connect your PayPal account, TaxJar will   automatically import your PayPal transactions   with the following status:        Pending      Processing      Success      Denied      Reversed        If your PayPal account contains transactions outside of your eBay sales that you don't want included in your TaxJar Reports, then we'd recommend that you   upload your eBay sales by CSV   rather than connecting your PayPal account to TaxJar.",
 'doc_id': '07181406-aed9-4586-8adb-dd8ec9dc1bf3',
 'doc_title': 'How do I connect my eBay Account to TaxJar?'}