In [1]:
# Mount Google Drive so the saved model and the annotated-query file under
# /content/drive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install sentence_transformers

In [3]:
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime
import numpy as np
from sklearn.cluster import AgglomerativeClustering

In [4]:
# Load the self-trained SentenceTransformer checkpoint from Google Drive.
# NOTE(review): the directory name ends with "`b" -- this looks like it could be
# a typo after the timestamp; confirm it matches the folder that exists on Drive.
saved_model_path = "/content/drive/MyDrive/Embeddings/make-multilingual-en-ne-2023-09-12_21-09-30`b"
model = SentenceTransformer(saved_model_path)

In [5]:
# Annotated Queries
# Read the annotated support queries, one per line. The context manager closes
# the file handle even if reading fails, and UTF-8 is requested explicitly
# because the corpus mixes Latin-script and Devanagari text.
with open("/content/drive/MyDrive/SBert/annotedqueries.txt", "r", encoding="utf-8") as f:
  queries = f.read().split("\n")

# Strip hyphens and surrounding whitespace from every line. Lines that end up
# empty (e.g. the one produced by a trailing newline) are skipped so that no
# blank "sentence" is embedded and later returned as a search/cluster result.
filtered_queries = []

for query in queries:
  query = query.replace("-", "").strip()
  if query:
    filtered_queries.append(query)

print(filtered_queries)

['portfolio not update', 'i could not open my share holder', 'sir ma uae ma xu malai demat and mero share account open garni xa yaha kati thauma bujheko xu but khi help vayana', '299 wala tireko ho paisa chai 360 ho ki kati ho kateko thiyo', 'i could not import my share.', 'why i cannt add ime life stock in my portfoli', 'i have import error', 'import bhayenaw meroshare bata', 'i have import issue', 'already exists with this number and email vancha', 'what is the difference between bull and shark features?', 'payment vaii sako tara enrolled vaxaina kina ho', 'how to delete porfolio', 'hello in my protfolio csv file is not impot', 'i am not able to import portfolio its showing error', "i'm unable to get otp while singup", 'cannot delete my portfolio stocks', "yes i have already download and after uploading file is sowing internal server error, i am facing this problem with my second portfolio as well. i can't refresh", "share can't add come error", 'i opened my mobil but not opend in my

## Encoding with self-trained model

In [6]:
# scipy.spatial.distance.cdist is used by the similarity-search cells below.
import scipy.spatial

# Embed every cleaned corpus query with the self-trained model (`model`).
corpus_embeddings = model.encode(filtered_queries)

In [7]:
# Query sentences:
queries = ['subscription cancel kasari garne?', 'portfolio app ma import garnai milena']
query_embeddings = model.encode(queries)

# For each query, list the `closest_n` corpus sentences with the smallest
# cosine distance to it (embeddings come from the self-trained model).
closest_n = 5
for query, query_embedding in zip(queries, query_embeddings):
    # cdist is called with a single query row, so row 0 holds the cosine
    # distance (1 - cosine similarity) from this query to every corpus sentence.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Indices ordered by ascending distance; the stable sort keeps tied
    # sentences in corpus order, matching a sort over (index, distance) pairs.
    nearest = np.argsort(distances, kind="stable")[:closest_n]

    print("\n======================\n")
    print("Query:", query)
    # The header follows closest_n instead of a hard-coded "5".
    print(f"\nTop {closest_n} most similar sentences in corpus:\n")

    for idx in nearest:
        # Report similarity (1 - distance) rather than the raw distance.
        print(filtered_queries[idx].strip(), "(Score: %.4f)" % (1 - distances[idx]))



Query: subscription cancel kasari garne?

Top 5 most similar sentences in corpus:

maile course lin kojeko sir mistake sark 1yr plan subscription vaye jasto ch yeslai cancel garer course ma change garnu  mildinw ? (Score: 0.7470)
how can i delete my portfolio from gurumantra (Score: 0.6990)
how to delete portfolio (Score: 0.6820)
yo app upgrade garna lai payment garna lai aaru subidha xaian? (Score: 0.6791)
how to delete sold stock on protfolio (Score: 0.6739)


Query: portfolio app ma import garnai milena

Top 5 most similar sentences in corpus:

how i can import my portfolio from mero share & tms (Score: 0.7940)
how to import my portfolio from mero share (Score: 0.7894)
portfolio import garnaw milk chainaw (Score: 0.7281)
तपाईँ हरूको subscription lida k kasto facilities xa janna maan lagera sir (Score: 0.7126)
trending garn garn yo technical analysis course matrai kiye hudain ra sir (Score: 0.6553)


## Encoding with MiniLM-L6

In [8]:
# Load the pretrained 'all-MiniLM-L6-v2' SentenceTransformer as a second encoder.
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed the same cleaned corpus with it; the "_2" suffix distinguishes these
# embeddings from `corpus_embeddings` produced by the self-trained model.
corpus_embeddings_2 = embedder.encode(filtered_queries)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [11]:
# Query sentences:
queries = ['why my subscription still not activated', 'portfolio app ma import garnai milena']
query_embeddings_2 = embedder.encode(queries)

# For each query, list the `closest_n` corpus sentences with the smallest
# cosine distance to it (embeddings come from the MiniLM `embedder`).
closest_n = 5
for query, query_embedding_2 in zip(queries, query_embeddings_2):
    # cdist is called with a single query row, so row 0 holds the cosine
    # distance (1 - cosine similarity) from this query to every corpus sentence.
    distances = scipy.spatial.distance.cdist([query_embedding_2], corpus_embeddings_2, "cosine")[0]

    # Indices ordered by ascending distance; the stable sort keeps tied
    # sentences in corpus order, matching a sort over (index, distance) pairs.
    nearest = np.argsort(distances, kind="stable")[:closest_n]

    print("\n======================\n")
    print("Query:", query)
    # The header follows closest_n instead of a hard-coded "5".
    print(f"\nTop {closest_n} most similar sentences in corpus:\n")

    for idx in nearest:
        # Report similarity (1 - distance) rather than the raw distance.
        print(filtered_queries[idx].strip(), "(Score: %.4f)" % (1 - distances[idx]))



Query: why my subscription still not activated

Top 5 most similar sentences in corpus:

what happened my subscription is still not activated? (Score: 0.9410)
my subscription is not showed.....what's problem ?? (Score: 0.8021)
how can i get to know that i have got my subscription or not sir ?? (Score: 0.6325)
even after subscription. i got the above error. (Score: 0.6313)
how to activate my account (Score: 0.6016)


Query: portfolio app ma import garnai milena

Top 5 most similar sentences in corpus:

portfolio import garnaw paro (Score: 0.7245)
i cannot import my portfolio in this app (Score: 0.6896)
portfolio import (Score: 0.6566)
portfolio import garda kati stock matrai available ho ? (Score: 0.6502)
how to import me portfolio (Score: 0.6371)


## Clustering

In [13]:
# Clustering

# Rescale every embedding row to unit L2 norm before clustering
corpus_embeddings_2 = corpus_embeddings_2 / np.linalg.norm(corpus_embeddings_2, axis=1, keepdims=True)

# Agglomerative clustering: the number of clusters is left open (n_clusters=None)
# and is determined by distance_threshold instead.
# Commented-out alternative kept from the original cell:
#   affinity='cosine', linkage='average', distance_threshold=0.4
clustering_model = AgglomerativeClustering(distance_threshold=2, n_clusters=None)
clustering_model.fit(corpus_embeddings_2)
cluster_assignment = clustering_model.labels_

# Group the corpus sentences by the cluster label assigned to each of them
clustered_sentences = {}
for sentence, cluster_id in zip(filtered_queries, cluster_assignment):
    clustered_sentences.setdefault(cluster_id, []).append(sentence)

In [14]:
# One embedding matrix per cluster, in the current iteration order of
# clustered_sentences (each cluster's sentences are encoded together).
cluster_embeddings = [embedder.encode(members) for members in clustered_sentences.values()]

In [15]:
# Rebuild the mapping with cluster ids in ascending order, then show it.
clustered_sentences = {label: clustered_sentences[label] for label in sorted(clustered_sentences)}
print(clustered_sentences)

{0: ['already exists with this number and email vancha', 'i opened my mobil but not opend in my laptop', "mero share maa allotted 10 in ur app nothing... what's wrong in this...do you have any idea", 'wht cant i use mega bargain option nin basket although i have taken subscription', 'i have entered wrong mobile no', 'the problem is what i subscribe jumbo combo course & want to watch in laptop', "it require otp but i don't have my mobile number because i have been in qatar", 'i can’t open this app', 'can you delete my account i want to do a subscription with conplete new', "it did't work", 'too slow app', 'invalid referral code vanxa ta', "hello sir/ madam, i am trying to buy your course but i couldn't sent  money through esewa", 'hello sir , i am living in foreign country and i want yearly subscription but i don’t have esewa and ips connection', 'facing problems sir', 'is it possible to transfer the subscription fee through himb mobile banking app?', 'why this app dont work', 'how i ca

## Encoding and Clustering

In [26]:
# Query sentences:
queries = ['i have not got any otp yet', 'portfolio app ma import garnai milena']
# Encoded with the MiniLM `embedder`, the same encoder used for corpus_embeddings_2.
query_embeddings_2 = embedder.encode(queries)

# Find the closest 10 sentences of the corpus for each query sentence based on cosine similarity
# (these neighbours are then used to pick the query's cluster by majority vote).
closest_n = 10

def most_frequent(List):
    """Return the value that occurs most often in ``List``.

    The distinct values and their counts are printed as a diagnostic.
    ``np.unique`` returns the distinct values sorted, and ``argmax`` picks the
    first maximum, so a tie is resolved in favour of the smallest value.
    """
    values, tallies = np.unique(List, return_counts=True)
    print(values, tallies)
    return values[np.argmax(tallies)]

# For every query: take its closest_n nearest corpus sentences (cosine distance),
# look up the cluster each of them was assigned to, and report the cluster that
# occurs most often among those neighbours.
for query, query_embedding_2 in zip(queries, query_embeddings_2):
    # cdist is called with a single query row, so row 0 holds the cosine
    # distance from this query to every corpus sentence.
    distances = scipy.spatial.distance.cdist([query_embedding_2], corpus_embeddings_2, "cosine")[0]

    # Indices of the nearest sentences; the stable sort keeps ties in corpus order.
    nearest = np.argsort(distances, kind="stable")[:closest_n]

    print("\n======================\n")
    print("Query:", query)
    print("\nMost Similar Cluster:\n")

    # cluster_assignment[idx] is the label of corpus sentence idx, i.e. the key
    # under which that sentence is stored in clustered_sentences. Indexing it
    # directly avoids scanning every cluster for a matching string.
    cluster_number = [cluster_assignment[idx] for idx in nearest]

    # Determines the most frequently occurring cluster number
    most_frequent_cluster = most_frequent(cluster_number)
    # Labels are 0-based; display them 1-based.
    print(f"Cluster {most_frequent_cluster + 1}")
    print(clustered_sentences[most_frequent_cluster])





Query: i have not got any otp yet

Most Similar Cluster:

[ 0 15 20 24] [1 1 1 7]
Cluster 25
["i'm unable to get otp while singup", 'did not get any otp??', 'the window where i should put my otp dissapearwd', "mistakely i didn't put otp so my new account is not verified so please further information", "mistakly i didn't put otp so my new account is not verified so please how to activate my account", 'now i got the email with otp but can find a place to put in', 'did not get any otp']


Query: portfolio app ma import garnai milena

Most Similar Cluster:

[ 3 10 17] [7 1 2]
Cluster 4
['portfolio not update', 'i am not able to import portfolio its showing error', "yes i have already download and after uploading file is sowing internal server error, i am facing this problem with my second portfolio as well. i can't refresh", 'hello portfolio refresh issue', 'i am unable to attached my portfolio in the app', "i'm not being able to import my portfolio.", 'my portfolio is not imported', 'i 

In [16]:
# Print every cluster with a 1-based number followed by its member sentences.
for cluster_id in clustered_sentences:
    print("Cluster ", cluster_id + 1)
    print(clustered_sentences[cluster_id])
    print("")

Cluster  1
['already exists with this number and email vancha', 'i opened my mobil but not opend in my laptop', "mero share maa allotted 10 in ur app nothing... what's wrong in this...do you have any idea", 'wht cant i use mega bargain option nin basket although i have taken subscription', 'i have entered wrong mobile no', 'the problem is what i subscribe jumbo combo course & want to watch in laptop', "it require otp but i don't have my mobile number because i have been in qatar", 'i can’t open this app', 'can you delete my account i want to do a subscription with conplete new', "it did't work", 'too slow app', 'invalid referral code vanxa ta', "hello sir/ madam, i am trying to buy your course but i couldn't sent  money through esewa", 'hello sir , i am living in foreign country and i want yearly subscription but i don’t have esewa and ips connection', 'facing problems sir', 'is it possible to transfer the subscription fee through himb mobile banking app?', 'why this app dont work', 'h