In [None]:

!pip install gensim
!pip install nltk
!pip install numpy==1.25.2
!pip install datasets



import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from gensim.downloader import load as gensim_load
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import ttest_ind
import nltk
# Download the NLTK tokenizer data packages used by this notebook.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)
# ---- Load SICK Dataset ----
print("Loading SICK dataset...")
# The "sick" repository ships a custom loading script. Without
# `trust_remote_code=True`, `load_dataset` stops at an interactive [y/N]
# prompt, which blocks unattended "Restart & Run All" executions.
dataset = load_dataset("sick", trust_remote_code=True)
sick = dataset["train"]  # only the train split is used below
# Materialise the columns as plain lists so that list operations downstream
# (e.g. `sentences1 + sentences2`) behave as concatenation regardless of the
# column container type returned by the installed `datasets` version.
sentences1 = list(sick["sentence_A"])
sentences2 = list(sick["sentence_B"])
relatedness_scores = list(sick["relatedness_score"])

# ---- Convert Relatedness Scores to Binary Labels ----
# A pair is labelled 1 when its relatedness score reaches the cut-off, else 0.
RELATEDNESS_THRESHOLD = 3.5
labels = [1 if score >= RELATEDNESS_THRESHOLD else 0 for score in relatedness_scores]

# ---- TF-IDF + Cosine Similarity ----
def compute_tfidf_similarity(sentences1, sentences2):
    """Score each aligned sentence pair by the cosine similarity of its TF-IDF vectors.

    One unigram+bigram TF-IDF vocabulary is fitted on both sentence
    collections together; pair ``i`` compares ``sentences1[i]`` with
    ``sentences2[i]``.

    Args:
        sentences1: Iterable of strings (first sentence of each pair).
        sentences2: Iterable of strings (second sentence of each pair). Must
            contain as many items as ``sentences1``.

    Returns:
        list[float]: One similarity per pair, in input order. An empty list
        when both inputs are empty.

    Raises:
        ValueError: If the two collections differ in length (previously the
            extra items were silently dropped by ``zip``).
    """
    # Materialise as lists so that `+` below always means concatenation;
    # tuples, pandas Series, NumPy arrays or dataset columns are accepted too.
    first = list(sentences1)
    second = list(sentences2)
    if len(first) != len(second):
        raise ValueError(
            "sentences1 and sentences2 must have the same length "
            f"(got {len(first)} and {len(second)})"
        )
    if not first:
        # Nothing to score; avoids fitting a vectorizer on an empty corpus.
        return []

    # norm="l2" is the TfidfVectorizer default; it is spelled out because the
    # dot-product shortcut below depends on unit-length rows.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), norm="l2")
    vectorizer.fit(first + second)
    s1 = vectorizer.transform(first)
    s2 = vectorizer.transform(second)

    # With L2-normalised rows, cosine similarity of a pair equals the dot
    # product of its two rows. A single sparse element-wise product replaces
    # one `cosine_similarity` call per pair. All-zero rows yield 0.
    pair_scores = s1.multiply(s2).sum(axis=1)
    return np.asarray(pair_scores).ravel().tolist()

# ---- Word2Vec + Cosine Similarity ----
def compute_word2vec_similarity(sentences1, sentences2, w2v_model=None):
    """Score each aligned sentence pair by cosine similarity of averaged word vectors.

    Each sentence is lower-cased, tokenized with ``word_tokenize`` and
    represented by the mean of the vectors of the tokens found in the model.
    A sentence with no known token is represented by an all-zero vector.

    Args:
        sentences1: Iterable of strings (first sentence of each pair).
        sentences2: Iterable of strings (second sentence of each pair).
        w2v_model: Optional preloaded word-vector model. It must support
            ``token in model``, ``model[token]`` and expose ``vector_size``.
            When omitted, ``'word2vec-google-news-300'`` is loaded through
            the gensim downloader, as before. Passing a model avoids
            reloading it on every call.

    Returns:
        list: One cosine similarity per pair, in input order.
    """
    if w2v_model is None:
        w2v_model = gensim_load('word2vec-google-news-300')

    # Size the fallback vector from the model instead of hard-coding 300, so
    # the function stays correct for models of other dimensionality.
    dim = w2v_model.vector_size

    def sentence_vector(sentence):
        tokens = word_tokenize(sentence.lower())
        vectors = [w2v_model[w] for w in tokens if w in w2v_model]
        return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

    s1 = [sentence_vector(sent) for sent in sentences1]
    s2 = [sentence_vector(sent) for sent in sentences2]
    # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape; the
    # result is a 1x1 matrix whose single entry is the pair's score.
    return [
        cosine_similarity(a.reshape(1, -1), b.reshape(1, -1))[0][0]
        for a, b in zip(s1, s2)
    ]

# ---- BERT + Cosine Similarity ----
def compute_bert_similarity(sentences1, sentences2):
    """Score each aligned sentence pair by cosine similarity of sentence embeddings.

    Both collections are encoded with the ``'paraphrase-MiniLM-L6-v2'``
    SentenceTransformer model, then embedding ``i`` of the first collection
    is compared with embedding ``i`` of the second.

    Returns:
        list: One cosine similarity per pair, in input order.
    """
    encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings_a = encoder.encode(sentences1, convert_to_numpy=True)
    embeddings_b = encoder.encode(sentences2, convert_to_numpy=True)

    scores = []
    for vec_a, vec_b in zip(embeddings_a, embeddings_b):
        # cosine_similarity works on 2-D arrays and returns a 1x1 matrix here.
        pair_matrix = cosine_similarity(vec_a.reshape(1, -1), vec_b.reshape(1, -1))
        scores.append(pair_matrix[0][0])
    return scores

# ---- Similarity to Binary Predictions ----
def to_predictions(similarities, threshold=0.75):
    """Turn similarity scores into binary predictions.

    Args:
        similarities: Iterable of numeric similarity scores.
        threshold: Cut-off; scores greater than or equal to it map to 1.

    Returns:
        list[int]: 1 for each score at or above ``threshold``, otherwise 0.
    """
    predictions = []
    for score in similarities:
        predictions.append(1 if score >= threshold else 0)
    return predictions

# ---- Classification Evaluation ----
def evaluate_classification(name, preds, labels):
    """Print the accuracy and F1 score of ``preds`` measured against ``labels``.

    Args:
        name: Label shown at the start of the printed line.
        preds: Predicted class labels.
        labels: Reference class labels.
    """
    accuracy = accuracy_score(labels, preds)
    f1_value = f1_score(labels, preds)
    print(f"{name} ➤ Accuracy: {accuracy:.4f} | F1 Score: {f1_value:.4f}")

# ---- Welch’s t-test ----
def run_welchs_test(scores_a, scores_b, label_a="Model A", label_b="Model B"):
    """Run an unequal-variance (Welch) t-test on two score samples and print the outcome.

    Prints the t statistic and p-value, followed by a verdict line that
    reports a significant difference when the p-value is below 0.05.

    Args:
        scores_a: First sample of scores.
        scores_b: Second sample of scores.
        label_a: Display name for the first sample.
        label_b: Display name for the second sample.
    """
    # equal_var=False selects Welch's variant of the independent t-test.
    t_value, p_value = ttest_ind(scores_a, scores_b, equal_var=False)
    print(f"{label_a} vs {label_b} ➤ t = {t_value:.4f}, p = {p_value:.4f}")

    verdict = (
        "→ Statistically significant difference ✅\n"
        if p_value < 0.05
        else "→ No statistically significant difference ❌\n"
    )
    print(verdict)

# ---- Run All ----
print("Computing similarities...")

# Similarity scores for every sentence pair, one list per method
tfidf_sim = compute_tfidf_similarity(sentences1, sentences2)
word2vec_sim = compute_word2vec_similarity(sentences1, sentences2)
bert_sim = compute_bert_similarity(sentences1, sentences2)

# Threshold the scores into 0/1 predictions (default threshold of to_predictions)
tfidf_preds, word2vec_preds, bert_preds = [
    to_predictions(model_scores)
    for model_scores in (tfidf_sim, word2vec_sim, bert_sim)
]

# Classification metrics against the binary labels
print("\nModel Evaluation on SICK dataset:")
for model_name, model_preds in (
    ("TF-IDF", tfidf_preds),
    ("Word2Vec", word2vec_preds),
    ("BERT", bert_preds),
):
    evaluate_classification(model_name, model_preds, labels)

# Pairwise Welch's t-tests on the raw similarity scores; the order of the
# pairs is listed explicitly so the printed report keeps its sequence.
print("\nWelch’s t-test between models (similarity scores):")
for (name_a, sims_a), (name_b, sims_b) in (
    (("TF-IDF", tfidf_sim), ("Word2Vec", word2vec_sim)),
    (("Word2Vec", word2vec_sim), ("BERT", bert_sim)),
    (("TF-IDF", tfidf_sim), ("BERT", bert_sim)),
):
    run_welchs_test(sims_a, sims_b, name_a, name_b)


Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Using cached dill-0.3.8-py3-none-any.whl (116 kB)
Using cached fsspec-2024.12.0-py3-none-any.whl (183 kB)
Using cached multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
Installing collected packages: xxhash, fsspec, dill, multiprocess, datasets
 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading SICK dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.88k [00:00<?, ?B/s]

sick.py:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

The repository for sick contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/sick.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/218k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4439 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/495 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4906 [00:00<?, ? examples/s]

Computing similarities...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Model Evaluation on SICK dataset:
TF-IDF ➤ Accuracy: 0.5271 | F1 Score: 0.3770
Word2Vec ➤ Accuracy: 0.7526 | F1 Score: 0.8110
BERT ➤ Accuracy: 0.7432 | F1 Score: 0.7507

Welch’s t-test between models (similarity scores):
TF-IDF vs Word2Vec ➤ t = -87.9466, p = 0.0000
→ Statistically significant difference ✅

Word2Vec vs BERT ➤ t = 34.1257, p = 0.0000
→ Statistically significant difference ✅

TF-IDF vs BERT ➤ t = -48.7416, p = 0.0000
→ Statistically significant difference ✅



In [None]:
# ---- Compute Mean, Standard Deviation, and Median ----
def compute_stats(similarities, name):
    """Print and return summary statistics of a collection of similarity scores.

    Args:
        similarities: Numeric scores to summarise.
        name: Label shown at the start of the printed line.

    Returns:
        tuple: ``(mean, standard deviation, median)``, computed with
        ``np.mean``, ``np.std`` and ``np.median`` using their default
        arguments.
    """
    center, spread, middle = [
        stat(similarities) for stat in (np.mean, np.std, np.median)
    ]
    print(f"{name} ➤ Mean: {center:.4f} | SD: {spread:.4f} | Median: {middle:.4f}")
    return center, spread, middle

# ---- Print Mean, SD, Median for All Models ----
print("\nSimilarity Score Statistics:")
# One compute_stats call per model, in the order TF-IDF, Word2Vec, BERT;
# each call prints its own summary line and returns (mean, sd, median).
(
    (tfidf_mean, tfidf_sd, tfidf_median),
    (word2vec_mean, word2vec_sd, word2vec_median),
    (bert_mean, bert_sd, bert_median),
) = [
    compute_stats(model_scores, model_label)
    for model_scores, model_label in (
        (tfidf_sim, "TF-IDF"),
        (word2vec_sim, "Word2Vec"),
        (bert_sim, "BERT"),
    )
]



Similarity Score Statistics:
TF-IDF ➤ Mean: 0.3770 | SD: 0.3007 | Median: 0.3027
Word2Vec ➤ Mean: 0.8173 | SD: 0.1444 | Median: 0.8480
BERT ➤ Mean: 0.6663 | SD: 0.2569 | Median: 0.7143
