In [13]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('all-MiniLM-L6-v2')

def query_engine(corpus,queries):
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)


    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in queries:
        query_embedding = embedder.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar tickets matching your Query:")

        for score, idx in zip(top_results[0], top_results[1]):
            print(int(idx), "(Score: {:.4f})".format(score))

In [60]:
embedder2 = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

def query_engine_ml(corpus,queries):
    corpus_embeddings = embedder2.encode(corpus, convert_to_tensor=True)


    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in queries:
        query_embedding = embedder2.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar tickets matching your Query:")

        for score, idx in zip(top_results[0], top_results[1]):
            print(int(idx), "(Score: {:.4f})".format(score))

In [3]:
corpus=['this car came to us with a flashing urea lamp. message on display "starting not possible within 400km" car had error code P20EE in the ECU.so I ran TSB B1HW01KTQ0. the programming of the engine ECU did not go well, see attached photo. then carried out a manual telecoding, which also gave an error code, see photo I also have no communication with the Ad-bleu tank in this car. I also tried to n Ad-bleu tank via the CAN interface, but it immediately gave 5 green check marks, so it was already up to date. but communication with the tank via the CAN interface. tank is full, self-filled. but after the error code P20EE was no longer in the ECU, not even after a test drive.I can therefore not implement the last 2 points of the TSB.resetting the denox system fault codes as i have no communication guided diagnosis because I no longer have a fault P20EE',
           'client goes to the workshop again, with urea and faults on, we downloaded it a few days ago but it has turned on again',
           'eml on urea light on starting inhibited in 100 miles on display',
           'the customer is complaining of a urea indicator light with message start impossible in 100kms + engine warning light',
           'The customer complain that the urea system warning lamp has been lit up',
           'engine light on with urea and service key on just ad blue made by the customer and without driving problems',
           'voyant motor + adblue + breakdown',
           'urea warning light is on',
           'Vehicle is back in with engine light and urea lightson ,P20EE stored again, vehicle got a  urea injector last time due to visual sign of dried urea fluid coming from injector',
           'p20ee in my ccm diag light on']

In [4]:
queries = ["i have problem with ECU"]

In [5]:
query_engine(corpus,queries)





Query: i have problem with ECU

Top 5 most similar tickets matching your Query:
0 (Score: 0.4072)
2 (Score: 0.2310)
3 (Score: 0.2258)
9 (Score: 0.2027)
7 (Score: 0.1851)


In [6]:
sentences_ml=[
    'deze auto is bij ons binnen gekomen met een knipperde urea lamp. melding op display "starten niet mogelijk binnen 400km" auto had storing code P20EE in de ECU staan.dus ik heb TSB B1HW01KTQ0 uitgevoerd. bij  de van de motor ECU ging de programmering niet goed, zie bijgevoegde foto. daarna handmatig een telecodering uitgevoerd, deze gaf ook een foutcode, zie foto ook heb ik bij deze auto geen communicatie met de Ad-bleu tank. ik heb ook geprobeerd Ad-bleu tank via de CAN interface te n , maar deze gaf direct 5 groene vinkjes, dus hij was al up to date. maar wel communicatie met de tank via de CAN interface. tank is vol, zelf afgevuld. maar na de  was foutcode P20EE niet meer in de ECU ook niet na een proefrit. de laatste 2 punten van de TSB kan ik dus niet uitvoeren. resetten van de storingcodes van het denox-systeem, omdat ik geen communicatie heb geleide diagnose omdat ik geen storing P20EE meer heb',
    'cliente acude de nuevo ataller , con urea y fallos encendiddos , hicimos hace unos dias telecarga pero ha vuelto a encender',
    'eml on urea light on starting inhibited in 100 miles on display',
    "le client se plaitn d'un allumage voyant d'uree avec message demarrage impossible dans 100kms + temoin moteur",
    'the customer complain that the urea system warning lamp has been lit up',
    'voyant moteur allume avec uree et cles de service allume pein ad blue fait par le client  et sans probleme de conduite',
    'voyant moteur + adblue + decompte km',
    '尿素警示燈亮',
    'Vehicle is back in with engine light and urea lightson ,P20EE stored again, vehicle got a  urea injector last time due to visual sign of dried urea fluid coming from injector',
    'p20ee dans mon ccm voyant diag allume'
]

In [7]:
queries2=['i have problem with urea light',"j'ai un problème avec la lumière d'urée"]

In [8]:
query_engine_ml(sentences_ml,queries2)





Query: i have problem with urea light

Top 5 most similar tickets matching your Query:
2 (Score: 0.5697)
4 (Score: 0.4817)
9 (Score: 0.4774)
8 (Score: 0.4364)
7 (Score: 0.4362)




Query: j'ai un problème avec la lumière d'urée

Top 5 most similar tickets matching your Query:
9 (Score: 0.5568)
7 (Score: 0.5082)
3 (Score: 0.4554)
0 (Score: 0.4242)
4 (Score: 0.4093)


In [2]:
from datasets import load_dataset

In [10]:
dataset = load_dataset("embedding-data/flickr30k-captions")

Found cached dataset json (C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/embedding-data___json/embedding-data--flickr30k-captions-2712c8cf1b16a4cb/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['set'],
        num_rows: 31783
    })
})

In [12]:
dataset["train"][2]

{'set': ['A child in a pink dress is climbing up a set of stairs in an entry way.',
  'A girl going into a wooden building.',
  'A little girl climbing into a wooden playhouse',
  'A little girl in a pink dress going into a wooden cabin.',
  'A little girl climbing the stairs to her playhouse.']}

In [13]:
dataset["train"][1]

{'set': ['Four men on top of a tall structure.',
  'Workers look down from up above on a piece of equipment.',
  'Several men in hard hats are operating a giant pulley system.',
  'Three men on a large rig.',
  'Two men working on a machine wearing hard hats.']}

In [14]:
dataset["train"][31782]

{'set': ['A man riding a small boat in a harbor, with fog and mountains in the background.',
  'A man in shorts and a Hawaiian shirt leans over the rail of a pilot boat, with fog and mountains in the background.',
  'A young man hanging over the side of a boat, which is in a like with fog rolling over a hill behind it.',
  'A man is leaning off of the side of a blue and white boat as it sits in a body of water.',
  'A man on a moored blue and white boat with hills and mist in the background.']}

In [15]:
dataset2 = load_dataset("embedding-data/coco_captions")

Found cached dataset json (C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/embedding-data___json/embedding-data--coco_captions-18033e4d0db7f137/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
dataset2["train"][0]

{'set': ['A clock that blends in with the wall hangs in a bathroom. ',
  'A very clean and well decorated empty bathroom',
  'A bathroom with a border of butterflies and blue paint on the walls above it.',
  'An angled view of a beautifully decorated bathroom.',
  'A blue and white bathroom with butterfly themed wall tiles.']}

In [17]:
type(dataset2)

datasets.dataset_dict.DatasetDict

In [18]:
dataset2["train"][82782]

{'set': ['A couple of women with some stuffed animals.',
  'Two women smile for the camea while posing iwth some suffed animals ',
  'Fans pose with stuffed animals at an ice rink.',
  'Two women smiling together, one holds a stuffed animal the other has a stuffed animal on her shoulder.',
  'Two women sit and pose with stuffed animals.']}

In [32]:
from sentence_transformers import SentenceTransformer, util

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("./final cleaning/Cleaned_text_of_all_languages.csv")
data.shape

(347081, 6)

In [5]:
data['ID_HISTORICAL_TICKET_ID'].value_counts()

21939    7708
20060    6373
24037    6256
17210    3859
3837     3675
         ... 
30392       1
36639       1
35848       1
23669       1
19757       1
Name: ID_HISTORICAL_TICKET_ID, Length: 8877, dtype: int64

In [20]:
morethan50 = data['ID_HISTORICAL_TICKET_ID'].value_counts()[data['ID_HISTORICAL_TICKET_ID'].value_counts()>49].index

In [21]:
data_50 = data[data['ID_HISTORICAL_TICKET_ID'].isin(morethan50[0:10])]

In [22]:
data_50['ID_HISTORICAL_TICKET_ID'].value_counts()

21939    7708
20060    6373
24037    6256
17210    3859
3837     3675
16729    3315
19010    2403
22353    2147
22737    2051
19951    1944
Name: ID_HISTORICAL_TICKET_ID, dtype: int64

In [23]:
top_50 = data_50.groupby('ID_HISTORICAL_TICKET_ID').head(50)

In [24]:
top_50['ID_HISTORICAL_TICKET_ID'].value_counts()

16729    50
22353    50
20060    50
21939    50
17210    50
24037    50
19010    50
19951    50
3837     50
22737    50
Name: ID_HISTORICAL_TICKET_ID, dtype: int64

In [25]:
import datasets

In [26]:
tktlst=[]
for hist_tkt in top_50['ID_HISTORICAL_TICKET_ID'].value_counts().index:
    tkkts=[]
    tkkts = top_50[top_50['ID_HISTORICAL_TICKET_ID']==hist_tkt]['clean_text_3'].to_list()
    tkss_strip = [t.strip() for t in tkkts]
    tktlst.append(tkss_strip)

d1 = datasets.Dataset.from_dict({'set':tktlst})
dd = datasets.DatasetDict({'train':d1})  

In [27]:
dd

DatasetDict({
    train: Dataset({
        features: ['set'],
        num_rows: 10
    })
})

In [28]:
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [29]:
from sentence_transformers import InputExample

In [30]:
train_examples = []
train_data = dd['train']['set']
n_examples = dd['train'].num_rows

for i in range(n_examples):
  example = train_data[i]
  train_examples.append(InputExample(texts=example[0:50]))

In [31]:
from sentence_transformers import losses
from torch.utils.data import DataLoader

In [32]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

In [33]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3 [00:00<?, ?it/s]

In [34]:
model.save("./ticket_similarity_model/ticketsimilarity_xlm-r-distilroberta-base-paraphrase-v1")

In [34]:
import torch

In [53]:
embedder_custom = SentenceTransformer('./ticket_similarity_model/ticketsimilarity_xlm-r-distilroberta-base-paraphrase-v1/')

def query_engine_custom_ml(corpus,queries):
    corpus_embeddings = embedder_custom.encode(corpus, convert_to_tensor=True)


    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in queries:
        query_embedding = embedder_custom.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar tickets matching your Query:")

        for score, idx in zip(top_results[0], top_results[1]):
            print(int(idx), "(Score: {:.4f})".format(score))

In [41]:
data[data['ID_HISTORICAL_TICKET_ID']==16729].iloc[51].DS_REPAIRER_LANGUAGE_TICKET_ADDITIONAL_DESC    

"I have changed the Airbag computer because of internal defect in the old one . Now when i have changed it i can't configurate or download software to the airbag or the BSI . Fault-code from BSI is B10AA. The confugure in BSI is the same as in Airbag computer . My diagboix version is 09.103 . the airbag fault-lamp is activated permanent with no fault code is Airbag"

In [None]:
the airbag fault lamp is activated permanent with no fault code is airbag

In [29]:
sent = data[data['ID_HISTORICAL_TICKET_ID']==16729].iloc[52:72]['clean_text_3'].to_list()

In [30]:
quer = ["the airbag fault lamp is activated permanent with no fault code in airbag","le voyant de défaut de l'airbag est activé en permanence sans code d'erreur dans l'airbag","die airbag-fehlerlampe leuchtet dauerhaft und es liegt kein fehlercode im airbag vor"]

In [76]:
query_engine_custom_ml(sent,quer)





Query: the airbag fault lamp is activated permanent with no fault code in airbag

Top 5 most similar tickets matching your Query:
15 (Score: 0.6357)
12 (Score: 0.4984)
2 (Score: 0.4800)
6 (Score: 0.4789)
10 (Score: 0.4743)




Query: le voyant de défaut de l'airbag est activé en permanence sans code d'erreur dans l'airbag

Top 5 most similar tickets matching your Query:
15 (Score: 0.7179)
12 (Score: 0.5496)
2 (Score: 0.5273)
17 (Score: 0.5271)
16 (Score: 0.5165)




Query: die airbag-fehlerlampe leuchtet dauerhaft und es liegt kein fehlercode im airbag vor

Top 5 most similar tickets matching your Query:
15 (Score: 0.6701)
12 (Score: 0.5427)
6 (Score: 0.5339)
2 (Score: 0.5192)
10 (Score: 0.5189)


In [77]:
query_engine_ml(sent,quer)





Query: the airbag fault lamp is activated permanent with no fault code in airbag

Top 5 most similar tickets matching your Query:
15 (Score: 0.5345)
2 (Score: 0.3718)
10 (Score: 0.3427)
17 (Score: 0.3422)
3 (Score: 0.3289)




Query: le voyant de défaut de l'airbag est activé en permanence sans code d'erreur dans l'airbag

Top 5 most similar tickets matching your Query:
15 (Score: 0.6302)
17 (Score: 0.4352)
13 (Score: 0.4330)
2 (Score: 0.4053)
12 (Score: 0.3901)




Query: die airbag-fehlerlampe leuchtet dauerhaft und es liegt kein fehlercode im airbag vor

Top 5 most similar tickets matching your Query:
15 (Score: 0.5963)
13 (Score: 0.4142)
2 (Score: 0.4119)
16 (Score: 0.4088)
12 (Score: 0.4025)


In [74]:
"Die Airbag-Fehlerlampe leuchtet dauerhaft und es liegt kein Fehlercode im Airbag vor".lower()

'die airbag-fehlerlampe leuchtet dauerhaft und es liegt kein fehlercode im airbag vor'

In [70]:
data[data['ID_HISTORICAL_TICKET_ID']==16729].iloc[0:50]['CD_REPAIRER_LANGUAGE_ISO_CODE'].value_counts()

FR    22
DE    11
ES     6
PT     3
NL     2
EN     2
JA     1
TR     1
HE     1
IT     1
Name: CD_REPAIRER_LANGUAGE_ISO_CODE, dtype: int64

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("./final cleaning/Cleaned_text_of_all_languages.csv")
data.shape

(347081, 6)

# training on all historical tickets(347K) with 105 sentences per ticket(balanced basis on language)

In [3]:
langs=['FR', 'ES', 'DE', 'EN', 'PT', 'IT'] #first 6
other_langs=['NL', 'DA', 'PL', 'NO', 'SV', 'HE',
       'CS', 'TR', 'FI', 'JA', 'HR', 'HU', 'RO', 'SL', 'SK', 'EL', 'GB', 'RU',
       'ZH', 'UK', 'AR', 'ET', 'MS', 'LV', 'LT', 'BG', 'TN', 'SR', 'MK', 'VN',
       'DK', 'SE', 'KO', 'BE', 'BR', 'TH', 'PF', 'BS', 'JP', 'CN', 'SI', 'CH',
       'EE', 'IL', 'CL', 'UA', 'IS', 'AD', 'AT', 'CZ', 'EC', 'GP', 'MG', 'DS',
       'DZ', 'MQ', 'ZN', 'GR', 'N', 'NE']

In [4]:
data['length_clean_text']=data['clean_text_3'].apply(lambda x: len(str(x)))

In [5]:
data_50chars = data[data['length_clean_text']>=50]

In [6]:
hist_tickets = data_50chars['ID_HISTORICAL_TICKET_ID'].value_counts()[data_50chars['ID_HISTORICAL_TICKET_ID'].value_counts()>=105].index #histtickets with atleast 105 sentences

In [7]:
data_n_sentence_tickets = data_50chars[data_50chars['ID_HISTORICAL_TICKET_ID'].isin(hist_tickets)]

In [8]:
df_balanced = pd.DataFrame()

In [9]:
for hst_tkt in hist_tickets:
    df_hist_tkt = data_n_sentence_tickets[data_n_sentence_tickets['ID_HISTORICAL_TICKET_ID']==hst_tkt]
    df_hist_tkt_balanced_toplangs = df_hist_tkt[df_hist_tkt['CD_REPAIRER_LANGUAGE_ISO_CODE'].isin(langs)].groupby(["CD_REPAIRER_LANGUAGE_ISO_CODE"]).head(15)
    df_hist_tkt_balanced_otherlangs = df_hist_tkt[df_hist_tkt['CD_REPAIRER_LANGUAGE_ISO_CODE'].isin(other_langs)].head(15)
    df_tkt_balanced = pd.concat([df_hist_tkt_balanced_toplangs,df_hist_tkt_balanced_otherlangs], ignore_index=True, axis=0)
    df_balanced = pd.concat([df_balanced,df_tkt_balanced], ignore_index=True, axis=0)

In [10]:
df_balanced.shape

(34365, 7)

In [11]:
final_hist_tikts = df_balanced['ID_HISTORICAL_TICKET_ID'].value_counts()[df_balanced['ID_HISTORICAL_TICKET_ID'].value_counts()==105].index

In [12]:
df_balanced_final = df_balanced[df_balanced['ID_HISTORICAL_TICKET_ID'].isin(final_hist_tikts)]

In [13]:
len(final_hist_tikts)*105

5040

In [14]:
df_balanced_final.shape

(5040, 7)

In [15]:
len(final_hist_tikts)

48

#### 48 historical tickets left which has 105 sentences with atleast 50 characters

In [16]:
import datasets

In [17]:
def prepare_data():
    tktlst=[]
    
    for hist_tkt in df_balanced_final['ID_HISTORICAL_TICKET_ID'].value_counts().index:
        tkkts=[]
        tkkts = df_balanced_final[df_balanced_final['ID_HISTORICAL_TICKET_ID']==hist_tkt]['clean_text_3'].to_list()
        tkss_strip = [t.strip() for t in tkkts]
        tktlst.append(tkss_strip)
        
    d1 = datasets.Dataset.from_dict({'set':tktlst})
    dd = datasets.DatasetDict({'train':d1})  

    return dd

In [18]:
datas = prepare_data()

In [19]:
from sentence_transformers import InputExample

In [20]:
def prepare_input_example(dd):
    train_examples = []
    train_data = dd['train']['set']
    n_examples = dd['train'].num_rows

    for i in range(n_examples):
      example = train_data[i]
      train_examples.append(InputExample(texts=example[0:105]))
    
    return train_examples

In [21]:
train_examples = prepare_input_example(datas)

In [22]:
from sentence_transformers import SentenceTransformer

In [23]:
model = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

In [24]:
from sentence_transformers import losses
from torch.utils.data import DataLoader

In [25]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 2
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

In [26]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,show_progress_bar=True,
          checkpoint_path = "./ticket_similarity_model/",
          checkpoint_save_steps = 6      
     )

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12 [00:00<?, ?it/s]

Iteration:   0%|          | 0/12 [00:00<?, ?it/s]

In [27]:
model.save("./ticket_similarity_model/ticketsimilarity_xlm-r-distilroberta-base-paraphrase-v2")

In [28]:
embedder_custom = SentenceTransformer('./ticket_similarity_model/ticketsimilarity_xlm-r-distilroberta-base-paraphrase-v2/')

def query_engine_custom_ml2(corpus,queries):
    corpus_embeddings = embedder_custom.encode(corpus, convert_to_tensor=True)


    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in queries:
        query_embedding = embedder_custom.encode(query, convert_to_tensor=True)

        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)

        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar tickets matching your Query:")

        for score, idx in zip(top_results[0], top_results[1]):
            print(int(idx), "(Score: {:.4f})".format(score))

In [50]:
query_engine_custom_ml2(sent,["download of engine ecu failed now engine wont start"])





Query: download of engine ecu failed now engine wont start

Top 5 most similar tickets matching your Query:
3 (Score: 0.7488)
11 (Score: 0.6018)
17 (Score: 0.5992)
2 (Score: 0.5905)
18 (Score: 0.5897)


In [55]:
sent[18]

' pm27 logiciel diagbox  de  p22   erentiel rpo majim deion du    a e  de telement moteur le   demarre  avec e  de   404pendant le telement cdlt'

In [2]:
pip install mteb

Collecting mteb
  Downloading mteb-1.0.2-py3-none-any.whl (88 kB)
                                              0.0/88.1 kB ? eta -:--:--
     ---------------------------------------- 88.1/88.1 kB 2.5 MB/s eta 0:00:00
Collecting jsonlines (from mteb)
  Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)
Collecting rich (from mteb)
  Downloading rich-13.4.2-py3-none-any.whl (239 kB)
                                              0.0/239.4 kB ? eta -:--:--
     -------------------------------------  235.5/239.4 kB 4.8 MB/s eta 0:00:01
     -------------------------------------  235.5/239.4 kB 4.8 MB/s eta 0:00:01
     -------------------------------------- 239.4/239.4 kB 2.4 MB/s eta 0:00:00
Collecting markdown-it-py>=2.2.0 (from rich->mteb)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
                                              0.0/87.5 kB ? eta -:--:--
     -------------------------------------    81.9/87.5 kB ? eta -:--:--
     ---------------------------------------

In [3]:
from mteb import MTEB
from sentence_transformers import SentenceTransformer

In [4]:
model_name = "./ticket_similarity_model/ticketsimilarity_xlm-r-distilroberta-base-paraphrase-v2/"
model = SentenceTransformer(model_name)

In [6]:
evaluation = MTEB(tasks=["STS22"])

In [7]:
results = evaluation.run(model, output_folder=f"./ticket_similarity_model/MTEBresults/{model_name}")

Downloading and preparing dataset sts22-crosslingual-sts/en to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/en/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1...
Dataset sts22-crosslingual-sts downloaded and prepared to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/en/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1. Subsequent calls will reuse this data.
Downloading and preparing dataset sts22-crosslingual-sts/de to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/de/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1...
Dataset sts22-crosslingual-sts downloaded and prepared to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/de/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1. Subsequent calls will reuse this data.
Downloading and preparing da

Downloading and preparing dataset sts22-crosslingual-sts/fr-pl to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/fr-pl/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1...
Dataset sts22-crosslingual-sts downloaded and prepared to C:/Users/v.sai.teja.kukunuri/.cache/huggingface/datasets/mteb___sts22-crosslingual-sts/fr-pl/1.0.0/563d7d9067b4162f5e964eb988aaa492b59e7ed47a03f16ec94e19b0e60ee8c1. Subsequent calls will reuse this data.

Task: STS22, split: test, language: en. Running...

Task: STS22, split: test, language: de. Running...

Task: STS22, split: test, language: es. Running...

Task: STS22, split: test, language: pl. Running...

Task: STS22, split: test, language: tr. Running...

Task: STS22, split: test, language: ar. Running...

Task: STS22, split: test, language: ru. Running...

Task: STS22, split: test, language: zh. Running...

Task: STS22, split: test, language: fr. Running...

Task: STS22, split: test, language: 