In [None]:
# Required libraries

# Embed the intents and compare the ground truth against the Gemini, Sarvam and GPT outputs

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import euclidean

# Multilingual Sentence-BERT encoder used for every intent list in this cell
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Read the reference workbook first, then the three model-output workbooks
ground_truth_df, gpt_df, sarvam_df, gemini_df = (
    pd.read_excel(workbook)
    for workbook in (
        "unseen_data1xlsx.xlsx",
        "Gpt_Unseen.xlsx",
        "Sarvam_Unseen.xlsx",
        "Gemini_Unseen.xlsx",
    )
)

# Pull the intent text out of each sheet as plain Python strings.
# The reference sheet keeps its intents in '002_pa'; the model sheets use 'Intent'.
ground_truth_intents = ground_truth_df['002_pa'].astype(str).tolist()
gpt_intents, sarvam_intents, gemini_intents = (
    frame['Intent'].astype(str).tolist()
    for frame in (gpt_df, sarvam_df, gemini_df)
)

# Embed each list of intents (one progress bar per list)
print("Encoding intents...")

ground_truth_embeddings, gpt_embeddings, sarvam_embeddings, gemini_embeddings = (
    model.encode(intent_texts, show_progress_bar=True)
    for intent_texts in (ground_truth_intents, gpt_intents, sarvam_intents, gemini_intents)
)

# Compute Euclidean distances
print("Computing distances...")


def _rowwise_euclidean(reference, candidate, label):
    """Return the Euclidean distance between matching rows of two embedding matrices.

    Args:
        reference: 2-D array-like of ground-truth embeddings, one row per intent.
        candidate: 2-D array-like of embeddings to compare, in the same row order.
        label: Name used in the error message when the shapes disagree.

    Returns:
        1-D float64 numpy array holding one distance per row.

    Raises:
        ValueError: If the two matrices do not have the same shape. Distances are
            taken row by row, so a different row count means the files are not
            aligned and any result would compare unrelated intents.
    """
    reference = np.asarray(reference, dtype=float)
    candidate = np.asarray(candidate, dtype=float)
    if reference.shape != candidate.shape:
        raise ValueError(
            f"{label} embeddings have shape {candidate.shape}, "
            f"but the ground truth has shape {reference.shape}"
        )
    # One vectorised norm over the feature axis instead of a Python loop per row
    return np.linalg.norm(reference - candidate, axis=1)


# Per-intent distances (numpy arrays, one entry per row of the ground truth)
gpt_distances = _rowwise_euclidean(ground_truth_embeddings, gpt_embeddings, "GPT")
sarvam_distances = _rowwise_euclidean(ground_truth_embeddings, sarvam_embeddings, "Sarvam")
gemini_distances = _rowwise_euclidean(ground_truth_embeddings, gemini_embeddings, "Gemini")

# Aggregate total and average distances
gpt_total_distance = np.sum(gpt_distances)
sarvam_total_distance = np.sum(sarvam_distances)
gemini_total_distance = np.sum(gemini_distances)

gpt_avg_distance = np.mean(gpt_distances)
sarvam_avg_distance = np.mean(sarvam_distances)
gemini_avg_distance = np.mean(gemini_distances)

# Prepare summary table
summary_df = pd.DataFrame({
    'Model': ['GPT', 'Sarvam', 'Gemini'],
    'Total Distance': [gpt_total_distance, sarvam_total_distance, gemini_total_distance],
    'Average Distance': [gpt_avg_distance, sarvam_avg_distance, gemini_avg_distance]
})

# Add Ranking (1 = best): the smallest average distance ranks first, ties share the lower rank
summary_df['Rank'] = summary_df['Average Distance'].rank(method='min')

# Save to Excel / CSV
summary_df.to_excel("Comparision_Intents.xlsx", index=False)
summary_df.to_csv("Comparision_Intents.csv", index=False)

print("✅ Comparison complete! Saved to Comparision_Intents.xlsx and Comparision_Intents.csv.")


Encoding intents...


Batches: 100%|██████████| 7/7 [00:01<00:00,  4.16it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.32it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.23it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  3.81it/s]

Computing distances...
✅ Comparison complete! Saved to Comparision_Intents.xlsx and Comparision_Intents.csv.





In [4]:
# Required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity  # ← new import

# Multilingual Sentence-BERT encoder for the cosine-similarity run
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Workbooks are read in a fixed order: ground truth, GPT, Sarvam, Gemini
ground_truth_df, gpt_df, sarvam_df, gemini_df = (
    pd.read_excel(workbook)
    for workbook in (
        "unseen_data1xlsx.xlsx",
        "Gpt_Unseen.xlsx",
        "Sarvam_Unseen.xlsx",
        "Gemini_Unseen.xlsx",
    )
)

# Intent text as lists of str: '002_pa' in the reference sheet, 'Intent' in the model sheets
ground_truth_intents = ground_truth_df['002_pa'].astype(str).tolist()
gpt_intents, sarvam_intents, gemini_intents = (
    frame['Intent'].astype(str).tolist()
    for frame in (gpt_df, sarvam_df, gemini_df)
)

# Embed every intent list with the same encoder
print("Encoding intents...")
ground_truth_embeddings, gpt_embeddings, sarvam_embeddings, gemini_embeddings = (
    model.encode(intent_texts, show_progress_bar=True)
    for intent_texts in (ground_truth_intents, gpt_intents, sarvam_intents, gemini_intents)
)

# Compute Cosine similarities
print("Computing cosine similarities...")


def _rowwise_cosine(reference, candidate, label):
    """Return the cosine similarity between matching rows of two embedding matrices.

    Args:
        reference: 2-D array-like of ground-truth embeddings, one row per intent.
        candidate: 2-D array-like of embeddings to compare, in the same row order.
        label: Name used in the error message when the shapes disagree.

    Returns:
        1-D float64 numpy array holding one similarity per row. A row in which
        either vector has zero length gets similarity 0.

    Raises:
        ValueError: If the two matrices do not have the same shape. Similarities
            are taken row by row, so a different row count means the files are
            not aligned.
    """
    reference = np.asarray(reference, dtype=float)
    candidate = np.asarray(candidate, dtype=float)
    if reference.shape != candidate.shape:
        raise ValueError(
            f"{label} embeddings have shape {candidate.shape}, "
            f"but the ground truth has shape {reference.shape}"
        )
    dots = np.sum(reference * candidate, axis=1)
    scale = np.linalg.norm(reference, axis=1) * np.linalg.norm(candidate, axis=1)
    # A zero-length vector has no direction: report 0 instead of dividing by zero
    return np.divide(dots, scale, out=np.zeros_like(dots), where=scale != 0)


# Per-intent similarities (numpy arrays, one entry per row of the ground truth).
# Note: 1 - similarity gives the equivalent cosine distance if a distance is preferred.
gpt_sims = _rowwise_cosine(ground_truth_embeddings, gpt_embeddings, "GPT")
sarvam_sims = _rowwise_cosine(ground_truth_embeddings, sarvam_embeddings, "Sarvam")
gemini_sims = _rowwise_cosine(ground_truth_embeddings, gemini_embeddings, "Gemini")

# Aggregate total and average similarities
gpt_total_sim    = np.sum(gpt_sims)
sarvam_total_sim = np.sum(sarvam_sims)
gemini_total_sim = np.sum(gemini_sims)

gpt_avg_sim    = np.mean(gpt_sims)
sarvam_avg_sim = np.mean(sarvam_sims)
gemini_avg_sim = np.mean(gemini_sims)

# Prepare summary table
summary_df = pd.DataFrame({
    'Model':             ['GPT', 'Sarvam', 'Gemini'],
    'Total Similarity':  [gpt_total_sim, sarvam_total_sim, gemini_total_sim],
    'Average Similarity':[gpt_avg_sim, sarvam_avg_sim, gemini_avg_sim]
})

# Add Ranking (1 = best, i.e. highest average similarity); negating turns the
# ascending rank into a descending one
summary_df['Rank'] = (-summary_df['Average Similarity']).rank(method='min').astype(int)

# Save to Excel / CSV
summary_df.to_excel("Comparison_Intents_Cosine.xlsx", index=False)
summary_df.to_csv("Comparison_Intents_Cosine.csv", index=False)

print("✅ Cosine-based comparison complete! Saved to Comparison_Intents_Cosine.*")


Encoding intents...


Batches: 100%|██████████| 7/7 [00:04<00:00,  1.46it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  3.56it/s]
Batches: 100%|██████████| 7/7 [00:01<00:00,  4.26it/s]
Batches: 100%|██████████| 7/7 [00:03<00:00,  1.91it/s]


Computing cosine similarities...
✅ Cosine-based comparison complete! Saved to Comparison_Intents_Cosine.*


In [None]:
## Code using the LaBSE model for embedding and comparisons

# Required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util

# LaBSE sentence encoder
model = SentenceTransformer('sentence-transformers/LaBSE')

# Workbooks are read in a fixed order: ground truth, GPT, Sarvam, Gemini
ground_truth_df, gpt_df, sarvam_df, gemini_df = (
    pd.read_excel(workbook)
    for workbook in (
        "unseen_data1xlsx.xlsx",
        "Gpt_Unseen.xlsx",
        "Sarvam_Unseen.xlsx",
        "Gemini_Unseen.xlsx",
    )
)

# Intent text as lists of str: '002_pa' in the reference sheet, 'Intent' in the model sheets
ground_truth_intents = ground_truth_df['002_pa'].astype(str).tolist()
gpt_intents, sarvam_intents, gemini_intents = (
    frame['Intent'].astype(str).tolist()
    for frame in (gpt_df, sarvam_df, gemini_df)
)

# Embed every intent list; convert_to_tensor=True asks encode() for tensors
print("Encoding intents...")

ground_truth_embeddings, gpt_embeddings, sarvam_embeddings, gemini_embeddings = (
    model.encode(intent_texts, convert_to_tensor=True, show_progress_bar=True)
    for intent_texts in (ground_truth_intents, gpt_intents, sarvam_intents, gemini_intents)
)

# Announce the similarity stage that the rest of this cell carries out
print("Computing cosine similarities...")

def compute_avg_similarity(pred_embeddings, true_embeddings):
    """Return the mean cosine similarity between row-aligned embedding pairs.

    Row i of ``pred_embeddings`` is compared only with row i of
    ``true_embeddings``. The similarity is computed row by row, so no
    all-pairs matrix is built.

    Args:
        pred_embeddings: Predicted-intent embeddings, one row per intent
            (a single 1-D vector is treated as one row).
        true_embeddings: Ground-truth embeddings in the same row order.

    Returns:
        float: Mean of the per-row cosine similarities.

    Raises:
        ValueError: If the two inputs do not have the same shape. Taking the
            diagonal of an all-pairs matrix would instead silently drop the
            surplus rows of the longer input.
    """
    import torch  # local import: this cell's import block does not bring torch into scope

    pred = torch.as_tensor(pred_embeddings)
    true = torch.as_tensor(true_embeddings)
    if pred.dim() == 1:
        pred = pred.unsqueeze(0)
    if true.dim() == 1:
        true = true.unsqueeze(0)
    if pred.shape != true.shape:
        raise ValueError(
            f"Embedding shapes differ: predictions {tuple(pred.shape)} "
            f"vs ground truth {tuple(true.shape)}"
        )
    similarities = torch.nn.functional.cosine_similarity(true, pred, dim=1)
    return similarities.mean().item()

# Mean cosine similarity of each model's intents against the ground truth
gpt_avg_sim, sarvam_avg_sim, gemini_avg_sim = (
    compute_avg_similarity(candidate_embeddings, ground_truth_embeddings)
    for candidate_embeddings in (gpt_embeddings, sarvam_embeddings, gemini_embeddings)
)

# Summary table: one row per model
summary_df = pd.DataFrame(
    [
        ('GPT', gpt_avg_sim),
        ('Sarvam', sarvam_avg_sim),
        ('Gemini', gemini_avg_sim),
    ],
    columns=['Model', 'Average Similarity'],
)

# Highest average similarity gets rank 1; tied models share the lower rank
summary_df = summary_df.assign(
    Rank=summary_df['Average Similarity'].rank(method='min', ascending=False)
)

# Write the table out in both formats
summary_df.to_excel("Intent_Similarity_Comparison_LaBSE.xlsx", index=False)
summary_df.to_csv("Intent_Similarity_Comparison_LaBSE.csv", index=False)

print("✅ Done! Comparison saved to Intent_Similarity_Comparison_LaBSE.xlsx and .csv")


Encoding intents...


Batches: 100%|██████████| 7/7 [00:07<00:00,  1.00s/it]
Batches: 100%|██████████| 7/7 [00:04<00:00,  1.73it/s]
Batches: 100%|██████████| 7/7 [00:03<00:00,  2.06it/s]
Batches: 100%|██████████| 7/7 [00:04<00:00,  1.60it/s]

Computing cosine similarities...
✅ Done! Comparison saved to Intent_Similarity_Comparison_LaBSE.xlsx and .csv





In [6]:
# Dot-product comparison using the paraphrase model, which does not normalise its embeddings internally
# (LaBSE, used in the previous cell, normalises its embeddings internally)

# Required libraries
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

# Encoder expected NOT to L2-normalise its output; the norm printout below is the check.
# NOTE(review): this is 'paraphrase-MiniLM-L12-v2', not the
# 'paraphrase-multilingual-MiniLM-L12-v2' checkpoint used in the first two cells —
# confirm that the switch is intended.
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

# Workbooks are read in a fixed order: ground truth, GPT, Sarvam, Gemini
ground_truth_df, gpt_df, sarvam_df, gemini_df = (
    pd.read_excel(workbook)
    for workbook in (
        "unseen_data1xlsx.xlsx",
        "Gpt_Unseen.xlsx",
        "Sarvam_Unseen.xlsx",
        "Gemini_Unseen.xlsx",
    )
)

# Intent text as lists of str: '002_pa' in the reference sheet, 'Intent' in the model sheets
ground_truth_intents = ground_truth_df['002_pa'].astype(str).tolist()
gpt_intents, sarvam_intents, gemini_intents = (
    frame['Intent'].astype(str).tolist()
    for frame in (gpt_df, sarvam_df, gemini_df)
)

# Embed every list, explicitly asking encode() not to normalise the vectors
print("Encoding embeddings without normalization...")
ground_truth_embeddings, gpt_embeddings, sarvam_embeddings, gemini_embeddings = (
    model.encode(
        intent_texts,
        convert_to_tensor=True,
        normalize_embeddings=False,
        show_progress_bar=True,
    )
    for intent_texts in (ground_truth_intents, gpt_intents, sarvam_intents, gemini_intents)
)

# 🔍 Sanity check: a norm different from 1 means the vectors were not normalised.
# Only the first row of each set is inspected.
print("\nSample vector norms:")
print("Ground Truth norm:", ground_truth_embeddings[0].norm().item())
print("GPT norm:", gpt_embeddings[0].norm().item())
print("Sarvam norm:", sarvam_embeddings[0].norm().item())
print("Gemini norm:", gemini_embeddings[0].norm().item())

# 🔢 Compute dot product only
def compute_avg_dot(pred_embeddings, true_embeddings):
    """Average the row-wise dot products of two embedding tensors.

    Args:
        pred_embeddings: Tensor of predicted-intent embeddings, one row per intent.
        true_embeddings: Tensor of ground-truth embeddings in the same row order.

    Returns:
        float: Mean over rows of the dot product between row i of
        ``true_embeddings`` and row i of ``pred_embeddings``.
    """
    # Multiply element by element, then collapse the feature axis: one dot product per row
    per_row_dots = true_embeddings.mul(pred_embeddings).sum(dim=1)
    average = per_row_dots.mean()
    return average.item()

# Mean dot product of each model's intents against the ground truth
gpt_avg_dot, sarvam_avg_dot, gemini_avg_dot = (
    compute_avg_dot(candidate_embeddings, ground_truth_embeddings)
    for candidate_embeddings in (gpt_embeddings, sarvam_embeddings, gemini_embeddings)
)

# 📊 Summary table: one row per model
summary_df = pd.DataFrame(
    [
        ('GPT', gpt_avg_dot),
        ('Sarvam', sarvam_avg_dot),
        ('Gemini', gemini_avg_dot),
    ],
    columns=['Model', 'Average Dot Product'],
)

# Write the table out in both formats
summary_df.to_excel("DotProduct_ParaphraseModel.xlsx", index=False)
summary_df.to_csv("DotProduct_ParaphraseModel.csv", index=False)

print("\n✅ Done! Dot products saved to DotProduct_ParaphraseModel.xlsx and .csv")


Encoding embeddings without normalization...


Batches: 100%|██████████| 7/7 [00:00<00:00,  7.85it/s]
Batches: 100%|██████████| 7/7 [00:02<00:00,  2.96it/s]
Batches: 100%|██████████| 7/7 [00:02<00:00,  2.50it/s]
Batches: 100%|██████████| 7/7 [00:03<00:00,  1.88it/s]


Sample vector norms:
Ground Truth norm: 6.227839946746826
GPT norm: 6.227839946746826
Sarvam norm: 6.227839946746826
Gemini norm: 6.227839946746826

✅ Done! Dot products saved to DotProduct_ParaphraseModel.xlsx and .csv





In [None]:
# Paired t-test checking whether the LaBSE and paraphrase-model results differ significantly

import pandas as pd
from scipy.stats import ttest_rel, wilcoxon

# 1. Load the two summary tables: the LaBSE results and the cosine results
#    written by the multilingual-SBERT cell
labse_df = pd.read_csv("Intent_Similarity_Comparison_LaBSE.csv")
sbert_df = pd.read_csv("Comparison_Intents_Cosine.csv")

# 2. Pair the rows by model name. ttest_rel and wilcoxon pair observations by
#    position, so the pairing must not depend on the row order of the two files.
paired_df = labse_df.merge(
    sbert_df,
    on="Model",
    suffixes=("_labse", "_sbert"),
    validate="one_to_one",  # a repeated model name would make the pairing ambiguous
)
if len(paired_df) != len(labse_df) or len(paired_df) != len(sbert_df):
    raise ValueError(
        "The two result files do not list the same models; cannot pair them: "
        f"{sorted(labse_df['Model'])} vs {sorted(sbert_df['Model'])}"
    )

labse_avgs = paired_df["Average Similarity_labse"]
sbert_avgs = paired_df["Average Similarity_sbert"]
n_pairs = len(paired_df)
print(f"Paired observations: n = {n_pairs} ({', '.join(paired_df['Model'].astype(str))})")

# 3. Paired t-test
t_stat, p_value = ttest_rel(labse_avgs, sbert_avgs)
print(f"Paired t-test:   t = {t_stat:.4f},  p = {p_value:.4e}")

# 4. (Optional) Non-parametric alternative: Wilcoxon signed-rank
w_stat, w_p = wilcoxon(labse_avgs, sbert_avgs)
print(f"Wilcoxon test:   W = {w_stat:.4f},  p = {w_p:.4e}")

# 5. Interpretation
# Each observation here is one model-level average, not one intent. With fewer
# than six pairs the exact two-sided Wilcoxon p-value cannot fall below 0.05
# (its minimum is 2 / 2**n), and the t-test rests on a normality assumption
# that so few points cannot support, so the verdict below is only indicative.
if n_pairs < 6:
    print(
        f"⚠ Only {n_pairs} paired averages are being compared; treat the result below "
        "as indicative. Testing per-intent similarities would give a meaningful sample size."
    )

if p_value < 0.05:
    print("⇒ The difference in average similarities is statistically significant (p < 0.05).")
else:
    print("⇒ No statistically significant difference detected (p ≥ 0.05).")


Paired t-test:   t = -13.0224,  p = 5.8452e-03
Wilcoxon test:   W = 0.0000,  p = 2.5000e-01
⇒ The difference in average similarities is statistically significant (p < 0.05).
