<a href="https://colab.research.google.com/github/NikhilDendeti/-Trasformation-Scales-/blob/main/Task1_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from datasets import load_dataset
import gensim.downloader as api
from transformers import BertTokenizer, BertModel
import torch


class OneHotEncodingSimilarity:
    """Cosine similarity between sentences built from one-hot word vectors.

    Each whitespace-separated word is one-hot encoded against a fixed
    vocabulary; the per-word rows are summed into one count vector per
    sentence and the two count vectors are compared with cosine similarity.
    """

    def __init__(self, vocabulary):
        """Fit the one-hot encoder on ``vocabulary``.

        Args:
            vocabulary: Sequence of word strings defining the encoder categories.
        """
        # handle_unknown="ignore" makes out-of-vocabulary words encode as an
        # all-zero row instead of raising ValueError during transform().
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        self.vocabulary = vocabulary
        reshaped_vocab = np.array(self.vocabulary).reshape(-1, 1)
        self.encoder.fit(reshaped_vocab)

    def encode_sentence(self, sentence):
        """One-hot encode every whitespace-separated word of ``sentence``.

        Args:
            sentence: Input text; it is split with ``str.split()``.

        Returns:
            A 2-D array with one row per word and one column per encoder
            category. A sentence with no words yields an array with zero rows.
        """
        words = sentence.split()
        if not words:
            # transform() cannot be called with zero samples, so build the
            # empty result directly with the encoder's column count.
            n_features = len(self.encoder.categories_[0])
            return np.zeros((0, n_features))
        reshaped_words = np.array(words).reshape(-1, 1)
        return self.encoder.transform(reshaped_words)

    def calculate_cosine_similarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' word-count vectors.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity as a scalar taken from the 1x1 result matrix.
        """
        # Summing the one-hot rows collapses each sentence to a single count vector.
        counts1 = self.encode_sentence(sentence1).sum(axis=0).reshape(1, -1)
        counts2 = self.encode_sentence(sentence2).sum(axis=0).reshape(1, -1)
        similarity = cosine_similarity(counts1, counts2)
        return similarity[0][0]


class BagOfWordsSimilarity:
    """Cosine similarity between bag-of-words count vectors over a fixed vocabulary."""

    def __init__(self, vocabulary):
        """Create a count vectorizer restricted to ``vocabulary``.

        Args:
            vocabulary: Iterable of word strings. The entries are passed through
                the vectorizer's own analyzer first, so that they have the same
                form as the tokens the vectorizer later extracts from sentences.
        """
        # CountVectorizer preprocesses and tokenizes the sentences before the
        # vocabulary lookup, so raw entries that differ from the analyzer's
        # output could never be matched. Normalizing the vocabulary with the
        # same analyzer keeps both sides consistent.
        analyzer = CountVectorizer().build_analyzer()
        normalized_vocabulary = sorted(
            {token for word in vocabulary for token in analyzer(word)}
        )
        self.vectorizer = CountVectorizer(vocabulary=normalized_vocabulary)

    def calculate_cosine_similarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' count vectors.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity as a scalar taken from the 1x1 result matrix.
        """
        # The vocabulary is fixed, so nothing has to be learned from the pair.
        vectorized_data = self.vectorizer.transform([sentence1, sentence2])
        similarity = cosine_similarity(vectorized_data[:1], vectorized_data[1:2])
        return similarity[0][0]


class TfidfSimilarity:
    """Cosine similarity between the TF-IDF vectors of two sentences."""

    def __init__(self):
        """Create the TF-IDF vectorizer with its default settings."""
        self.vectorizer = TfidfVectorizer()

    def calculate_cosine_similarity(self, sentence1, sentence2):
        """Fit TF-IDF on just the two sentences and compare their vectors.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity as a scalar taken from the 1x1 result matrix.
        """
        # The vectorizer is re-fitted on every call, so the weights are
        # derived from this sentence pair only.
        pair_matrix = self.vectorizer.fit_transform([sentence1, sentence2])
        first_row = pair_matrix[:1]
        second_row = pair_matrix[1:2]
        return cosine_similarity(first_row, second_row)[0][0]


class FastTextSimilarity:
    """Cosine similarity between mean-pooled pretrained word vectors."""

    def __init__(self):
        """Load the ``fasttext-wiki-news-subwords-300`` vectors via gensim's downloader."""
        self.fasttext_model = api.load("fasttext-wiki-news-subwords-300")

    def get_word2vec_embedding(self, sentence):
        """Return the average word vector of ``sentence``.

        Despite the method name, the vectors come from the model loaded in
        ``__init__``.

        Args:
            sentence: Input text; it is split with ``str.split()``.

        Returns:
            A 1-D array of length ``self.fasttext_model.vector_size``. Words the
            model does not know contribute a zero vector to the average. A
            sentence with no words yields an all-zero vector.
        """
        words = sentence.split()
        if not words:
            # np.mean of an empty list is NaN, which cosine_similarity rejects;
            # return a well-formed zero vector instead.
            return np.zeros(self.fasttext_model.vector_size)

        embeddings = []
        for word in words:
            try:
                embeddings.append(self.fasttext_model[word])
            except KeyError:
                # Unknown word: keep its slot in the average as a zero vector.
                embeddings.append(np.zeros(self.fasttext_model.vector_size))
        return np.mean(embeddings, axis=0)

    def calculate_cosine_similarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' average vectors.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity as a scalar taken from the 1x1 result matrix.
        """
        embedding1 = self.get_word2vec_embedding(sentence1)
        embedding2 = self.get_word2vec_embedding(sentence2)
        similarity = cosine_similarity([embedding1], [embedding2])[0][0]
        return similarity


class BertSimilarity:
    """Cosine similarity between BERT sentence embeddings."""

    def __init__(self):
        """Load the ``bert-base-uncased`` tokenizer and model."""
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

    def get_bert_embedding(self, sentence):
        """Encode ``sentence`` and return the model's ``pooler_output`` as a NumPy array.

        Args:
            sentence: Input text; tokenized with truncation at 512 tokens.

        Returns:
            The pooled output with size-1 dimensions squeezed away.
        """
        encoded = self.bert_tokenizer(
            sentence,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=512,
        )
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            model_output = self.bert_model(**encoded)
        pooled = model_output.pooler_output
        return pooled.squeeze().numpy()

    def calculate_cosine_similarity(self, sentence1, sentence2):
        """Return the cosine similarity of the two sentences' embeddings.

        Args:
            sentence1: First sentence.
            sentence2: Second sentence.

        Returns:
            The similarity as a scalar taken from the 1x1 result matrix.
        """
        first_vec = self.get_bert_embedding(sentence1)
        second_vec = self.get_bert_embedding(sentence2)
        return cosine_similarity([first_vec], [second_vec])[0][0]


class SentenceTransformerPipeline:
    """Loads a sentence-pair dataset and derives a whitespace-token vocabulary.

    Intended call order: ``load_dataset`` -> ``preprocess_data`` ->
    ``build_vocabulary``; each step reads the attribute set by the previous one.
    """

    def __init__(self, dataset_name):
        """Remember the dataset name and start with empty state.

        Args:
            dataset_name: Name handed to ``datasets.load_dataset``.
        """
        self.dataset_name = dataset_name
        self.dataset = None       # set by load_dataset()
        self.train_split = None   # set by preprocess_data()
        self.vocabulary = []      # set by build_vocabulary()

    def load_dataset(self):
        """Load the dataset named at construction, store it, and return it."""
        print(f"Loading dataset: {self.dataset_name}")
        loaded = load_dataset(self.dataset_name)
        self.dataset = loaded
        return loaded

    def combine_sentences(self, example):
        """Join ``sentence1`` and ``sentence2`` of ``example`` with a single space.

        Returns:
            A dict with the single key ``"combined_sentence"``.
        """
        joined = example["sentence1"] + " " + example["sentence2"]
        return {"combined_sentence": joined}

    def preprocess_data(self):
        """Map ``combine_sentences`` over the ``"train"`` split and keep the result."""
        train = self.dataset["train"]
        self.train_split = train.map(self.combine_sentences)
        print("Preprocessed data with combined sentences.")

    def build_vocabulary(self):
        """Collect the sorted unique whitespace tokens of all combined sentences.

        Returns:
            The sorted vocabulary list, which is also stored on ``self.vocabulary``.
        """
        unique_words = set()
        for sentence in self.train_split["combined_sentence"]:
            unique_words.update(sentence.split())
        self.vocabulary = sorted(unique_words)
        print(f"Vocabulary built with {len(self.vocabulary)} unique words.")
        return self.vocabulary


# Example usage: load the STS-B dataset, add the combined-sentence column to the
# train split, and build the whitespace-token vocabulary from it. The three calls
# must run in this order because each one reads state set by the previous one.
pipeline = SentenceTransformerPipeline("sentence-transformers/stsb")
pipeline.load_dataset()
pipeline.preprocess_data()
vocabulary = pipeline.build_vocabulary()

# Create instances for each similarity technique.
# The one-hot and bag-of-words variants share the vocabulary built above;
# FastTextSimilarity and BertSimilarity load their pretrained models on construction.
one_hot_sim = OneHotEncodingSimilarity(vocabulary)
bow_sim = BagOfWordsSimilarity(vocabulary)
tfidf_sim = TfidfSimilarity()
fasttext_sim = FastTextSimilarity()
bert_sim = BertSimilarity()


Loading dataset: sentence-transformers/stsb
Preprocessed data with combined sentences.
Vocabulary built with 18348 unique words.


In [None]:
import pandas as pd

# Evaluate on a small sample: slice the first 100 rows of the STS-B train split
# and put the result into a DataFrame.
train_data = load_dataset("sentence-transformers/stsb")
train_split = train_data["train"][:100]
df = pd.DataFrame(data=train_split)

In [None]:
# Pre-create one result column per similarity method.
# They are initialized with the float 0.0 (not the integer 0) because the scoring
# loop later writes float similarities into them cell by cell; writing floats into
# integer columns triggers pandas' incompatible-dtype warning.
SCORE_COLUMNS = [
    "one_hot_encoding_score",
    "bag_of_words_score",
    "TF_IDF_score",
    "word2VecScore",
    "bertScore",
]
for score_column in SCORE_COLUMNS:
    df[score_column] = 0.0
df

Unnamed: 0,sentence1,sentence2,score,one_hot_encoding_score,bag_of_words_score,TF_IDF_score,word2VecScore,bertScore
0,A plane is taking off.,An air plane is taking off.,1.00,0,0,0,0,0
1,A man is playing a large flute.,A man is playing a flute.,0.76,0,0,0,0,0
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,0.76,0,0,0,0,0
3,Three men are playing chess.,Two men are playing chess.,0.52,0,0,0,0,0
4,A man is playing the cello.,A man seated is playing the cello.,0.85,0,0,0,0,0
...,...,...,...,...,...,...,...,...
95,Two boys are driving.,Two bays are dancing.,0.12,0,0,0,0,0
96,A man is riding on a horse.,A girl is riding a horse.,0.52,0,0,0,0,0
97,A man is riding a bicycle.,A monkey is riding a bike.,0.40,0,0,0,0,0
98,A man is slicing potatoes.,A woman is peeling potato.,0.44,0,0,0,0,0


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-BERT encoder whose similarities fill the 'bertScore' column.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Score every sentence pair with each technique and store the result in the
# matching column of df.
for idx, row in df.iterrows():
    sentence1 = row["sentence1"]
    sentence2 = row["sentence2"]

    # TF-IDF similarity
    TF_IDF_score = tfidf_sim.calculate_cosine_similarity(sentence1, sentence2)
    df.at[idx, 'TF_IDF_score'] = TF_IDF_score

    # FastText similarity (stored in the 'word2VecScore' column).
    # Computed once per row; a second identical computation was removed.
    word2VecScore = fasttext_sim.calculate_cosine_similarity(sentence1, sentence2)
    df.at[idx, 'word2VecScore'] = word2VecScore

    # Bag of Words similarity
    bag_of_word_score = bow_sim.calculate_cosine_similarity(sentence1, sentence2)
    df.at[idx, 'bag_of_words_score'] = bag_of_word_score

    # One-Hot Encoding similarity
    one_hot_encoding_score = one_hot_sim.calculate_cosine_similarity(sentence1, sentence2)
    df.at[idx, 'one_hot_encoding_score'] = one_hot_encoding_score

    # Sentence-BERT similarity
    embedding1 = model.encode([sentence1])
    embedding2 = model.encode([sentence2])
    similarity = cosine_similarity(embedding1, embedding2)
    df.at[idx, 'bertScore'] = similarity[0][0]

df



Unnamed: 0,sentence1,sentence2,score,one_hot_encoding_score,bag_of_words_score,TF_IDF_score,word2VecScore,bertScore
0,A plane is taking off.,An air plane is taking off.,1.00,0.730297,0.816497,0.709297,0.752670,0.939304
1,A man is playing a large flute.,A man is playing a flute.,0.76,0.925820,0.894427,0.818180,0.984209,0.902032
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,0.76,0.737865,0.755929,0.615219,0.877584,0.892001
3,Three men are playing chess.,Two men are playing chess.,0.52,0.800000,0.750000,0.669419,0.986445,0.794561
4,A man is playing the cello.,A man seated is playing the cello.,0.85,0.925820,0.894427,0.846647,0.977316,0.928653
...,...,...,...,...,...,...,...,...
95,Two boys are driving.,Two bays are dancing.,0.12,0.500000,0.500000,0.336097,0.869906,0.090816
96,A man is riding on a horse.,A girl is riding a horse.,0.52,0.771517,0.670820,0.510149,0.941969,0.574164
97,A man is riding a bicycle.,A monkey is riding a bike.,0.40,0.666667,0.500000,0.336097,0.967445,0.499099
98,A man is slicing potatoes.,A woman is peeling potato.,0.44,0.400000,0.250000,0.144384,0.941475,0.549897


In [None]:
from sklearn.metrics import mean_squared_error

# Result key -> DataFrame column holding that method's similarity scores.
method_columns = {
    'one_hot_encoding': 'one_hot_encoding_score',
    'bag_of_words': 'bag_of_words_score',
    'TF_IDF': 'TF_IDF_score',
    'word2Vec': 'word2VecScore',
    'bertScore': 'bertScore',
}

# Mean squared error of each method's scores against the reference 'score' column.
mse_results = {
    method: mean_squared_error(df['score'], df[column])
    for method, column in method_columns.items()
}

# Pick the method with the smallest error (the first one wins on a tie).
lowest_mse_method, lowest_mse_value = min(mse_results.items(), key=lambda item: item[1])

print(mse_results)
print(f"The method with the lowest MSE is '{lowest_mse_method}' with an MSE of {lowest_mse_value:.4f}")

{'one_hot_encoding': 0.08205187789630401, 'bag_of_words': 0.08292169308295656, 'TF_IDF': 0.09152227235203608, 'word2Vec': 0.16270237169277693, 'bertScore': 0.02002811572235229}
The method with the lowest MSE is 'bertScore' with an MSE of 0.0200
