In [8]:
import os
import random
import time

import numpy as np
import pandas as pd
import pkg_resources
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from symspellpy import SymSpell, Verbosity
from torch.nn import Embedding, Linear
from torch.quantization import quantize_dynamic
from transformers import AutoTokenizer, AutoModel

In [9]:
global_rng = random.Random()

def ids_tensor(shape, vocab_size, rng=None, name=None):
    #  Creates a random int32 tensor of the shape within the vocab size
    if rng is None:
        rng = global_rng

    total_dims = 1
    for dim in shape:
        total_dims *= dim

    values = []
    for _ in range(total_dims):
        values.append(rng.randint(0, vocab_size - 1))

    return torch.tensor(data=values, dtype=torch.long, device='cpu').view(shape).contiguous()



In [15]:
input_ids = ids_tensor([8, 128], 2)
attention_mask = ids_tensor([8, 128], vocab_size=2)
dummy_input = {"input_ids":input_ids, "attention_mask":attention_mask}


In [17]:
# Model identifier and on-disk location of the traced, quantized copy.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
QUANT_MODEL_PATH = "quant/pytorch_model.pth"

model_org = SentenceTransformer(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Request dynamic quantization for Linear and Embedding modules, then trace the
# result with the placeholder batch so it can be serialized with TorchScript.
q_model = quantize_dynamic(model_org, {Linear, Embedding})
traced_model = torch.jit.trace(q_model, dummy_input,strict=False)

# torch.jit.save does not create missing directories, so make sure the target
# folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs(os.path.dirname(QUANT_MODEL_PATH), exist_ok=True)
torch.jit.save(traced_model, QUANT_MODEL_PATH)

# Reload from disk so later cells use exactly what was serialized.
loaded_quantized_model = torch.jit.load(QUANT_MODEL_PATH)

loaded_quantized_model.cpu()
loaded_quantized_model.eval()

model_org.cpu()
# Trailing semicolon keeps the cell from echoing the module repr
# (replaces the former throwaway `1+1` expression).
model_org.eval();

  assert all(
  if a.grad is not None:


2

In [18]:
# Spell checker configured for corrections up to edit distance 3.
sym_spell = SymSpell(prefix_length=7, max_dictionary_edit_distance=3)

# English frequency dictionary shipped inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy",
    "frequency_dictionary_en_82_765.txt",
)

# term_index / count_index select the word column and the frequency column.
# Left as the last expression so the cell shows the load result.
sym_spell.load_dictionary(dictionary_path, count_index=1, term_index=0)

True

In [19]:
# Words that must bypass spell correction; the file is read as a
# whitespace-separated word list.
with open("DO_NOT_CHECK.txt","r") as f:
    # Store the split result. Previously it was discarded, leaving a raw string
    # on which `word in DO_NOT_CHECK` performs a substring test rather than a
    # whole-word test. split() with no argument also copes with newlines.
    DO_NOT_CHECK = set(f.read().split())

In [20]:
def fix_sentence(sentence, DO_NOT_CHECK=DO_NOT_CHECK):
    """Spell-correct a sentence word by word.

    Each whitespace-separated word is lower-cased. Words found in
    ``DO_NOT_CHECK`` are kept as they are; every other word is replaced by the
    first suggestion from ``sym_spell.lookup``.

    Args:
        sentence: Text to correct.
        DO_NOT_CHECK: Words that must not be corrected. May be a collection of
            words or a single whitespace-separated string.

    Returns:
        A list with one lower-cased (and possibly corrected) word per input word.
    """
    # A plain string would turn `in` into a substring test ("a" would match
    # inside any longer entry), so normalize it to a set of whole words first.
    if isinstance(DO_NOT_CHECK, str):
        skip_words = set(DO_NOT_CHECK.split())
    else:
        skip_words = DO_NOT_CHECK

    fixed = []
    # split() with no argument collapses repeated/leading/trailing whitespace,
    # so empty tokens are never sent to the spell checker.
    for word in sentence.split():
        word = word.lower()
        if word in skip_words:
            fixed.append(word)
            continue
        # include_unknown=True asks SymSpell to return the input word itself
        # when no dictionary word is within the edit distance.
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3, include_unknown=True)
        # Read the suggested term directly instead of parsing the item's
        # "term, distance, count" string form.
        fixed.append(suggestions[0].term)
    return fixed

In [21]:
# Quick check of the spell fixer on a deliberately misspelled query.
corrected_words = fix_sentence('Whre can I fnid informion about my past pruchases')
text = " ".join(corrected_words)
text

'were can i find information about my past purchases'

In [22]:
# Table of known user inputs and their canned responses
# (columns used below: INPUT_TEXT, RESPONSE).
data = pd.read_csv(
    filepath_or_buffer="input and responses.csv",
    low_memory=False,
)
data

Unnamed: 0,INPUT_TEXT,RESPONSE
0,Can I speak to the Human,"Yes, please wait while we are connecting you t..."
1,Hi,Hello. How can I help you?
2,Hello,Hello. How can I help you?
3,Good Morning,Hello. How can I help you?
4,Good Afternoon,Hello. How can I help you?
5,Greetings,Hello. How can I help you?
6,I want to cancel my subscription,To cancel you subscription you need to go to <...
7,I want to renew a subscription,To renew you subscription you need to go to <l...
8,How can I edit my personal details,"In order to edit your personal details, please..."
9,Where can I find information on my payments,All information on past and future payemnts ca...


In [23]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, counting only unmasked positions.

    Args:
        model_output: Mapping whose ``'token_embeddings'`` entry holds the
            per-token embeddings.
        attention_mask: Mask tensor; it gets a trailing dimension added and is
            expanded to the size of the token embeddings.

    Returns:
        The mask-weighted sum over dim 1 divided by the mask total over dim 1.
    """
    tokens = model_output['token_embeddings']
    mask = attention_mask.unsqueeze(-1).expand(tokens.size()).float()

    weighted_sum = (tokens * mask).sum(dim=1)
    # Lower-bound the divisor so a fully masked row does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / mask_total


def calculate_emb(sentence, model = loaded_quantized_model, tokenizer = tokenizer):
    """Embed text with the given model and return L2-normalized vectors.

    Args:
        sentence: Text passed straight to ``tokenizer``.
        model: Callable taking a dict with ``input_ids`` and ``attention_mask``;
            its output must be usable by ``mean_pooling``.
        tokenizer: Tokenizer called with padding, truncation and PyTorch
            tensor output.

    Returns:
        The mean-pooled embeddings, normalized to unit L2 norm along dim 1.
    """
    batch = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    mask = batch['attention_mask']

    # Inference only: no autograd bookkeeping for the forward pass.
    with torch.no_grad():
        model_out = model({'input_ids': batch['input_ids'], 'attention_mask': mask})

    pooled = mean_pooling(model_out, mask)
    return F.normalize(pooled, p=2, dim=1)

In [24]:
# Map each known input text to its response; on duplicate inputs the last row wins.
inp_res_dicto = dict(zip(data.INPUT_TEXT, data.RESPONSE))

# np.save stores a dict as a pickled object array, so reading it back requires
# np.load(..., allow_pickle=True).item().
np.save('inp_res_dicto.npy', inp_res_dicto)

In [25]:
# Pre-compute one embedding per known input text, keyed by the text itself.
inp_emb_dicto = {
    inp: calculate_emb(inp)
    for inp in data.INPUT_TEXT.tolist()
}

# Saved as a pickled object array; np.load needs allow_pickle=True to read it back.
np.save('inp_emb_dicto.npy', inp_emb_dicto)

In [26]:
# Spell-correct the query and embed it.
fixed = " ".join(fix_sentence( 'Whre can I fnid informion about my past pruchases' ))
embeddings1 = calculate_emb(fixed)

# Linear scan for the stored input with the highest cosine similarity.
# Starting at 0 means a match is only reported for a strictly positive score;
# otherwise best_key stays None.
best_key = None
best_score = 0
for key, candidate_emb in inp_emb_dicto.items():
    temp_score = util.pytorch_cos_sim(embeddings1, candidate_emb)
    if temp_score > best_score:
        best_score, best_key = temp_score, key
best_score,best_key


(tensor([[0.9595]]), 'Where can I find information about my past purchases')