<a href="https://colab.research.google.com/github/SuvitKumar003/Pre_trained_Model_comparison_For_text_Similarity/blob/main/Model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import tensorflow_hub as hub
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MinMaxScaler
import pandas as pd  # For table formatting

# Load Pre-trained Models
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')  # SBERT
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")  # USE
mpnet_model = SentenceTransformer('all-mpnet-base-v2')  # MPNet
bert_base_model = SentenceTransformer('bert-base-nli-mean-tokens')  # BERT-NLI
roberta_model = SentenceTransformer('all-roberta-large-v1')  # RoBERTa
xlm_roberta_model = SentenceTransformer('xlm-roberta-base')  # XLM-RoBERTa
bert_large_model = SentenceTransformer('bert-large-nli-mean-tokens')  # BERT-Large

# Sample Sentences
sentences = [
    ("Thapar Institute offers a course on Predictive Statistics.", "Predictive Statistics is part of the curriculum at Thapar Institute."),
    ("Machine learning is an important component of Predictive Statistics.", "Predictive Statistics includes machine learning techniques."),
    ("Students learn regression analysis in Predictive Statistics.", "Regression is a fundamental topic in Predictive Statistics at Thapar."),
    ("The Predictive Statistics course covers time-series forecasting.", "Students explore time-series analysis as part of Predictive Statistics."),
    ("Data preprocessing is crucial for accurate predictions.", "Cleaning and preprocessing data improves prediction accuracy."),
    ("Thapar Institute provides hands-on projects in Predictive Statistics.", "Project-based learning enhances understanding of Predictive Statistics."),
    ("Predictive models help in decision making.", "Statistical models support informed decision-making processes."),
    ("Deep learning techniques are sometimes used in Predictive Statistics.", "Neural networks enhance the predictive capabilities of statistical models."),
    ("Students use Python for data analysis in Predictive Statistics.", "Python is widely used for statistical computing and analysis."),
    ("The final project in Predictive Statistics involves real-world data.", "Students analyze real datasets as part of their final project.")
]

# Compute Similarity Scores
def compute_similarity(sentence1, sentence2, model):
    emb1 = model.encode([sentence1])[0] if isinstance(model, SentenceTransformer) else model([sentence1])[0].numpy()
    emb2 = model.encode([sentence2])[0] if isinstance(model, SentenceTransformer) else model([sentence2])[0].numpy()
    return 1 - cosine(emb1, emb2)

# Build Decision Matrix
similarity_matrix = []
for s1, s2 in sentences:
    similarity_matrix.append([
        compute_similarity(s1, s2, sbert_model),
        compute_similarity(s1, s2, use_model),
        compute_similarity(s1, s2, mpnet_model),
        compute_similarity(s1, s2, bert_base_model),
        compute_similarity(s1, s2, roberta_model),
        compute_similarity(s1, s2, xlm_roberta_model),
        compute_similarity(s1, s2, bert_large_model)
    ])

similarity_matrix = np.array(similarity_matrix)

# Normalize Decision Matrix
scaler = MinMaxScaler()
normalized_matrix = scaler.fit_transform(similarity_matrix)

# Function to Apply TOPSIS
def apply_topsis(matrix, weights):
    weighted_matrix = matrix * weights
    ideal_solution = np.max(weighted_matrix, axis=0)
    anti_ideal_solution = np.min(weighted_matrix, axis=0)
    distance_from_ideal = np.sqrt(np.sum((weighted_matrix - ideal_solution) ** 2, axis=1))
    distance_from_anti_ideal = np.sqrt(np.sum((weighted_matrix - anti_ideal_solution) ** 2, axis=1))
    closeness = distance_from_anti_ideal / (distance_from_ideal + distance_from_anti_ideal)
    ranking = np.argsort(closeness)[::-1]  # Sort in descending order
    return closeness, ranking

# Define Weights
equal_weights = np.array([1/7] * 7)  # Equal weights for all models
custom_weights = np.array([0.1, 0.2, 0.2, 0.15, 0.15, 0.1, 0.1])  # Custom importance

# Apply TOPSIS for Both Weight Configurations
closeness_equal, ranking_equal = apply_topsis(normalized_matrix, equal_weights)
closeness_custom, ranking_custom = apply_topsis(normalized_matrix, custom_weights)

# Get Model Names
model_names = ["SBERT", "USE", "MPNet", "BERT-NLI", "RoBERTa", "XLM-RoBERTa", "BERT-Large"]

# Identify the Best Performing Model in Each Case
best_model_equal = model_names[np.argmax(equal_weights)]
best_model_custom = model_names[np.argmax(custom_weights)]

# Print Results as Tables
df_equal = pd.DataFrame({
    "Rank": range(1, len(sentences) + 1),
    "S1": [f"S{i+1}" for i in ranking_equal],
    "S2": [f"S{i+1}" for i in ranking_equal],
    "TOPSIS Score": [f"{closeness_equal[i]:.4f}" for i in ranking_equal]
})

df_custom = pd.DataFrame({
    "Rank": range(1, len(sentences) + 1),
    "S1": [f"S{i+1}" for i in ranking_custom],
    "S2": [f"S{i+1}" for i in ranking_custom],
    "TOPSIS Score": [f"{closeness_custom[i]:.4f}" for i in ranking_custom]
})

print("\nRanking of Sentence Pairs (Equal Weights, Higher is Better):")
print(df_equal.to_string(index=False))

print("\nRanking of Sentence Pairs (Custom Weights, Higher is Better):")
print(df_custom.to_string(index=False))

print(f"\nBest Performing Model (Equal Weights): {best_model_equal}")
print(f"Best Performing Model (Custom Weights): {best_model_custom}")





Ranking of Sentence Pairs (Equal Weights, Higher is Better):
 Rank  S1  S2 TOPSIS Score
    1  S1  S1       0.9189
    2  S2  S2       0.6549
    3  S4  S4       0.5255
    4  S7  S7       0.4650
    5  S9  S9       0.4646
    6  S3  S3       0.4196
    7  S5  S5       0.3927
    8  S6  S6       0.3496
    9  S8  S8       0.2197
   10 S10 S10       0.1829

Ranking of Sentence Pairs (Custom Weights, Higher is Better):
 Rank  S1  S2 TOPSIS Score
    1  S1  S1       0.9413
    2  S2  S2       0.7321
    3  S4  S4       0.5010
    4  S9  S9       0.4997
    5  S3  S3       0.4087
    6  S7  S7       0.4052
    7  S5  S5       0.3481
    8  S6  S6       0.2948
    9  S8  S8       0.2271
   10 S10 S10       0.1351

Best Performing Model (Equal Weights): SBERT
Best Performing Model (Custom Weights): USE
