In [1]:
# !pip install sentence-transformers

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [3]:
df = pd.read_excel('DLT-Test-Clean.xlsx')

In [4]:
df.head()

Unnamed: 0,Question,Actual Answer,Generated Answer
0,What programming languages or frameworks does ...,Lamini currently has support in python and a R...,Lamini supports a variety of programming langu...
1,Does Lamini AI provide any features for genera...,"No, Lamini AI does not provide any features fo...","Yes, Lamini AI provides features for generatin..."
2,What are the possible causes of Internal Serve...,Internal server errors are usually caused by a...,Internal Server 500 errors in Lamini Python pa...
3,Are there any hyperparameter tuning options av...,Lamini is a powerful engine used to fine-tunin...,"Yes, Lamini provides several hyperparameter tu..."
4,How does the performance of LLMs trained using...,"According to the information provided, Lamini ...",The performance of LLMs trained using Lamini c...


### TF-IDF Similarity

In [5]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Question'])
cosine_sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

### BERT Embeddings Similarity

In [6]:
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(df['Question'], convert_to_tensor=True)
cosine_sim_bert = util.cos_sim(embeddings, embeddings)

In [7]:
tfidf_threshold = 0.3
bert_threshold = 0.75

similar_pairs = []
num_questions = len(df)
for i in range(num_questions):
    for j in range(i+1, num_questions):  # Avoid comparing a question with itself and duplicate pairs
        tfidf_sim = cosine_sim_tfidf[i, j]
        bert_sim = cosine_sim_bert[i, j]
        if tfidf_sim >= tfidf_threshold and bert_sim >= bert_threshold:
            pair_info = {
                "Question 1": df['Question'][i],
                "Answer 1": df['Actual Answer'][i],
                "Question 2": df['Question'][j],
                "Answer 2": df['Actual Answer'][j],
                "TF-IDF Similarity": tfidf_sim,
                "BERT Similarity": bert_sim
            }
            similar_pairs.append(pair_info)

# Convert to DataFrame for easy viewing
similar_pairs_df = pd.DataFrame(similar_pairs)

In [8]:
similar_pairs_df.head()

Unnamed: 0,Question 1,Answer 1,Question 2,Answer 2,TF-IDF Similarity,BERT Similarity
0,What programming languages or frameworks does ...,Lamini currently has support in python and a R...,What programming languages does the Lamini lib...,The Lamini library extends its support to mult...,0.458054,tensor(0.8511)
1,What are the possible causes of Internal Serve...,Internal server errors are usually caused by a...,How do I handle network connection errors in L...,Network connection errors can be handled by ma...,0.355729,tensor(0.8294)
2,Is it possible to fine-tune Lamini on a specif...,"Yes, it is possible to fine-tune Lamini on a s...",Is it possible to fine-tune an openAI model us...,"Yes, it is possible to fine-tune an OpenAI mod...",0.324722,tensor(0.7812)
3,Is it possible to fine-tune Lamini on a specif...,"Yes, it is possible to fine-tune Lamini on a s...",Can I fine-tune Lamini on my own dataset or sp...,"Absolutely, you can train your custom Language...",0.484205,tensor(0.7665)
4,"Can Lamini be used for transfer learning, wher...","Yes, Lamini can be used for transfer learning....",Can the Lamini documentation help me understan...,"Yes, the Lamini documentation provides guidanc...",0.44948,tensor(0.8506)


In [9]:
similar_pairs_df.to_clipboard()

In [10]:
similar_pairs_df['BERT Similarity']

0     tensor(0.8511)
1     tensor(0.8294)
2     tensor(0.7812)
3     tensor(0.7665)
4     tensor(0.8506)
5     tensor(0.8362)
6     tensor(0.8038)
7     tensor(0.8981)
8     tensor(0.8521)
9     tensor(0.9143)
10    tensor(0.8927)
11    tensor(0.7931)
12    tensor(0.7605)
13    tensor(0.8375)
14    tensor(0.7563)
15    tensor(0.8561)
16    tensor(0.8276)
17    tensor(0.8528)
18    tensor(0.8313)
19    tensor(0.7961)
20    tensor(0.8609)
Name: BERT Similarity, dtype: object

### Between train and test datasets

In [11]:
file_path = 'train-00000-of-00001-5cdebbc48da41394.parquet'
train_df = pd.read_parquet(file_path)

In [12]:
train_df.head()

Unnamed: 0,question,answer,input_ids,attention_mask,labels
0,How can I evaluate the performance and quality...,There are several metrics that can be used to ...,"[2347, 476, 309, 7472, 253, 3045, 285, 3290, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2347, 476, 309, 7472, 253, 3045, 285, 3290, 2..."
1,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ...","[5804, 309, 1089, 1491, 670, 253, 2127, 434, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5804, 309, 1089, 1491, 670, 253, 2127, 434, 2..."
2,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...,"[2347, 1057, 418, 4988, 74, 14980, 6016, 9762,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2347, 1057, 418, 4988, 74, 14980, 6016, 9762,..."
3,Does the `submit_job()` function expose any ad...,It is unclear which `submit_job()` function is...,"[10795, 253, 2634, 21399, 64, 17455, 42702, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[10795, 253, 2634, 21399, 64, 17455, 42702, 11..."
4,Does the `add_data()` function support differe...,"No, the `add_data()` function does not support...","[10795, 253, 2634, 1911, 64, 2203, 42702, 1159...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[10795, 253, 2634, 1911, 64, 2203, 42702, 1159..."


In [13]:
train_df.rename(columns={'question': 'Question', 'answer': 'Actual Answer'}, inplace=True)
train_df['Source'] = 'Train'
df['Source'] = 'Test'

combined_df = pd.concat([train_df[['Question', 'Actual Answer', 'Source']], 
                         df[['Question', 'Actual Answer', 'Source']]], ignore_index=True)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(combined_df['Question'])
tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(combined_df['Question'], convert_to_tensor=True)
bert_sim = util.cos_sim(embeddings, embeddings).numpy()

similar_pairs = []
num_questions = len(combined_df)
for i in range(num_questions):
    for j in range(i+1, num_questions):
        if tfidf_sim[i, j] >= tfidf_threshold and bert_sim[i, j] >= bert_threshold:
            pair_info = {
                "Question 1": combined_df['Question'][i],
                "Source 1": combined_df['Source'][i],
                "Answer 1": combined_df['Actual Answer'][i],
                "Question 2": combined_df['Question'][j],
                "Source 2": combined_df['Source'][j],
                "Answer 2": combined_df['Actual Answer'][j],
                "TF-IDF Similarity": tfidf_sim[i, j],
                "BERT Similarity": bert_sim[i, j]
            }
            similar_pairs.append(pair_info)


similar_pairs_df = pd.DataFrame(similar_pairs)

In [14]:
print(similar_pairs_df.describe())
display(similar_pairs_df.head())

       TF-IDF Similarity  BERT Similarity
count        3468.000000      3468.000000
mean            0.439932         0.841794
std             0.134908         0.056815
min             0.300027         0.750009
25%             0.338656         0.795067
50%             0.395240         0.836562
75%             0.501078         0.882860
max             1.000000         1.000000


Unnamed: 0,Question 1,Source 1,Answer 1,Question 2,Source 2,Answer 2,TF-IDF Similarity,BERT Similarity
0,How can I evaluate the performance and quality...,Train,There are several metrics that can be used to ...,How can I handle bias or sensitive content in ...,Train,To handle bias or sensitive content in the gen...,0.414624,0.845823
1,How can I evaluate the performance and quality...,Train,There are several metrics that can be used to ...,What are the recommended approaches for evalua...,Train,There are several approaches for evaluating th...,0.377987,0.805797
2,How can I evaluate the performance and quality...,Train,There are several metrics that can be used to ...,How can I evaluate the performance of a custom...,Train,"Yes, Lamini provides various evaluation metric...",0.338285,0.846808
3,How can I evaluate the performance and quality...,Train,There are several metrics that can be used to ...,Can the generated data be customized or filter...,Train,"Yes, the generated data can be customized or f...",0.319889,0.753353
4,How can I evaluate the performance and quality...,Train,There are several metrics that can be used to ...,How can I add output scores to compare the con...,Train,One way to add output scores to compare the co...,0.320752,0.840277


In [15]:
similar_pairs_df.to_clipboard()

In [17]:
filtered_pairs_df = similar_pairs_df[similar_pairs_df['Source 1'] != similar_pairs_df['Source 2']]

comparison_data = []

for index, row in filtered_pairs_df.iterrows():
    question_2 = row['Question 2']
    answer_1 = row['Answer 1']
    answer_2 = row['Answer 2']
    generated_answer = df[df['Question'] == question_2]['Generated Answer'].values[0]

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([answer_1, answer_2, generated_answer])
    tfidf_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    bert_embeddings = model.encode([answer_1, answer_2, generated_answer], convert_to_tensor=True)
    bert_sim = util.cos_sim(bert_embeddings, bert_embeddings).numpy()

    answer_1_terms = set(answer_1.split())
    answer_2_terms = set(answer_2.split())
    generated_answer_terms = set(generated_answer.split())
    overlap_1 = len(answer_1_terms.intersection(generated_answer_terms)) / len(answer_1_terms.union(generated_answer_terms))
    overlap_2 = len(answer_2_terms.intersection(generated_answer_terms)) / len(answer_2_terms.union(generated_answer_terms))

    comparison_data.append({
        'Question 2': question_2,
        'Answer 1': answer_1,
        'Answer 2': answer_2,
        'Generated Answer': generated_answer,
        'TF-IDF Similarity (Answer 1)': tfidf_sim[0, 2],
        'TF-IDF Similarity (Answer 2)': tfidf_sim[1, 2],
        'BERT Similarity (Answer 1)': bert_sim[0, 2],
        'BERT Similarity (Answer 2)': bert_sim[1, 2],
        'Term Overlap (Answer 1)': overlap_1,
        'Term Overlap (Answer 2)': overlap_2
    })

comparison_df = pd.DataFrame(comparison_data)

comparison_df.reset_index(drop=True, inplace=True)


In [18]:
comparison_df.head()

Unnamed: 0,Question 2,Answer 1,Answer 2,Generated Answer,TF-IDF Similarity (Answer 1),TF-IDF Similarity (Answer 2),BERT Similarity (Answer 1),BERT Similarity (Answer 2),Term Overlap (Answer 1),Term Overlap (Answer 2)
0,Can I find information about the code's approa...,"Yes, the code includes methods for submitting ...","Yes, the code includes classes for handling se...",There is no information in Lamini’s python lib...,0.147719,0.247572,0.410386,0.407161,0.080645,0.112676
1,How does Lamini AI handle requests for generat...,Lamini AI offers features for generating text ...,Lamini AI offers features for generating text ...,Lamini AI does not offer any features for gene...,0.052086,0.061547,0.508511,0.50711,0.173913,0.171053
2,How does the `add_data()` function work in Lam...,"No, the `add_data()` function does not support...",The `add_data()` function in Lamini is used to...,The `add_data()` function in Lamini allows for...,0.43246,0.616555,0.47516,0.787118,0.117647,0.278481
3,What is the process involved when using the `s...,The `submit_job()` function in Lamini is used ...,When using the `submit_job()` function in Lami...,The `submit_job()` function in Lamini is used ...,0.678742,0.815125,0.89438,0.94377,0.352113,0.363636
4,What programming languages does the Lamini lib...,"Yes, Lamini supports generating code through i...",The Lamini library extends its support to mult...,The Lamini library supports multiple programmi...,0.122683,0.535277,0.779657,0.902348,0.030303,0.31068


In [19]:
comparison_df.to_clipboard()

#### ONE TO ONE MAPPING

In [22]:
best_pairs_df = (filtered_pairs_df
                 .sort_values('BERT Similarity', ascending=False) )
best_pairs_df.head()
print(best_pairs_df['Question 2'].nunique())

78


In [27]:
best_pairs_df = (filtered_pairs_df
                 .sort_values('BERT Similarity', ascending=False) 
                 .drop_duplicates('Question 2')) 

print(best_pairs_df['Question 2'].nunique())
print(best_pairs_df.shape)
print(best_pairs_df['BERT Similarity'].describe())

78
(78, 8)
count    78.000000
mean      0.885720
std       0.060220
min       0.756090
25%       0.848270
50%       0.888325
75%       0.936068
max       0.995183
Name: BERT Similarity, dtype: float64


In [24]:
best_pairs_df.head()

Unnamed: 0,Question 1,Source 1,Answer 1,Question 2,Source 2,Answer 2,TF-IDF Similarity,BERT Similarity
3411,How do I instantiate the LLM engine in the Lam...,Train,You can instantiate the LLM engine in the Lami...,How do I instantiate the LLM engine using the ...,Test,You can instantiate the LLM engine using the l...,0.95662,0.995183
782,Is it possible to fine-tune Lamini on a specif...,Train,"Yes, it is possible to fine-tune Lamini on a s...",Is it possible to fine-tune Lamini on a specif...,Test,"Yes, it is possible to fine-tune Lamini on a s...",0.814437,0.988153
3119,Can I fine-tune the pre-trained models provide...,Train,"Yes, Lamini allows for fine-tuning of pre-trai...",Can I fine-tune the pre-trained models provide...,Test,"Yes, you can fine-tune the pre-trained models ...",0.753866,0.982559
3403,How can I integrate Lamini into my software de...,Train,Lamini can be integrated into software develop...,How can I integrate Lamini into my existing so...,Test,Integrating Lamini into your existing software...,0.935194,0.979599
1383,Does Lamini support generating code for speech...,Train,"Yes, Lamini supports generating code for speec...",Does Lamini support generating code for speech...,Test,"Yes, Lamini supports generating code for speec...",0.712742,0.976268


In [28]:
final_comparison_df = pd.merge(best_pairs_df[['Question 1', 'Question 2', 'Answer 1', 'Answer 2', 'BERT Similarity']], 
                               comparison_df, 
                               left_on=['Question 2', 'Answer 1', 'Answer 2'], 
                               right_on=['Question 2', 'Answer 1', 'Answer 2'])

final_comparison_df.reset_index(drop=True, inplace=True)

In [30]:
final_comparison_df.to_clipboard()

In [31]:
# Define a function to tokenize a text into terms
def tokenize(text):
    return set(text.lower().split())

# Initialize new columns
final_comparison_df['Unique Terms (Answer 2)'] = 0
final_comparison_df['Unique Terms (Generated Answer)'] = 0
final_comparison_df['Terms in Answer 2 not in Generated Answer'] = None
final_comparison_df['Terms in Generated Answer not in Answer 2'] = None

# Iterate through each row
for index, row in final_comparison_df.iterrows():
    # Tokenize Answer 2 and Generated Answer
    answer_2_terms = tokenize(row['Answer 2'])
    generated_answer_terms = tokenize(row['Generated Answer'])

    # Count unique terms
    final_comparison_df.at[index, 'Unique Terms (Answer 2)'] = len(answer_2_terms)
    final_comparison_df.at[index, 'Unique Terms (Generated Answer)'] = len(generated_answer_terms)

    # Find terms difference
    final_comparison_df.at[index, 'Terms in Answer 2 not in Generated Answer'] = list(answer_2_terms - generated_answer_terms)
    final_comparison_df.at[index, 'Terms in Generated Answer not in Answer 2'] = list(generated_answer_terms - answer_2_terms)

In [32]:
final_comparison_df.to_clipboard()