<a href="https://colab.research.google.com/github/Reverse-Rain/Nexus/blob/main/base_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import math
import spacy
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import FastText
from laserembeddings import Laser
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('stopwords')

class SimilarityModels:
    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.model_paraphrase = SentenceTransformer('paraphrase-distilroberta-base-v1')
        self.module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
        self.use_model = hub.load(self.module_url)
        self.model_sbert = SentenceTransformer('paraphrase-mpnet-base-v2')
        self.model_distilbert = SentenceTransformer('distilbert-base-nli-mean-tokens')
        self.laser_model = Laser()

    def preprocess_text(self, sentence):
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(sentence.lower())
        filtered_sentence = [word for word in word_tokens if word not in stop_words]
        return filtered_sentence

    def get_cosine_similarity(self, sentence1, sentence2):
        sentence1 = self.preprocess_text(sentence1)
        sentence2 = self.preprocess_text(sentence2)

        vector1 = Counter(sentence1)
        vector2 = Counter(sentence2)

        all_words = set(vector1.keys()).union(set(vector2.keys()))

        dot_product = sum(vector1.get(word, 0) * vector2.get(word, 0) for word in all_words)

        magnitude1 = math.sqrt(sum(vector1.get(word, 0)**2 for word in all_words))
        magnitude2 = math.sqrt(sum(vector2.get(word, 0)**2 for word in all_words))

        if magnitude1 == 0 or magnitude2 == 0:
            return 0
        else:
            return dot_product / (magnitude1 * magnitude2)

    def jaccard_similarity(self, sentence1, sentence2):
        stop_words = set(stopwords.words('english'))

        words1 = [word.lower() for word in word_tokenize(sentence1) if word.isalnum() and word.lower() not in stop_words]
        words2 = [word.lower() for word in word_tokenize(sentence2) if word.isalnum() and word.lower() not in stop_words]

        intersection = len(set(words1).intersection(set(words2)))
        union = len(set(words1).union(set(words2)))

        if union == 0:
            return 0
        else:
            return intersection / union

    def word_movers_distance(self, sentence1, sentence2):
        doc1 = self.nlp(sentence1)
        doc2 = self.nlp(sentence2)

        wmd = doc1.similarity(doc2)

        return wmd

    def sentence_similarity_transformers(self, sentence1, sentence2):
        embeddings1 = self.model_paraphrase.encode(sentence1, convert_to_tensor=True)
        embeddings2 = self.model_paraphrase.encode(sentence2, convert_to_tensor=True)

        similarity_score = util.pytorch_cos_sim(embeddings1, embeddings2)

        return similarity_score.item()

    def sentence_similarity_USE(self, sentence1, sentence2):
        embeddings = self.use_model([sentence1, sentence2])

        similarity_score = tf.keras.losses.cosine_similarity(embeddings[0], embeddings[1]).numpy()

        return similarity_score

    def create_doc2vec_model(self, sentences):
        documents = [TaggedDocument(self.preprocess_text(sentence), [i]) for i, sentence in enumerate(sentences)]
        model = Doc2Vec(documents, vector_size=100, window=5, min_count=1, workers=4)
        return model

    def sentence_similarity_doc2vec(self, sentence1, sentence2, model):
        vec1 = model.infer_vector(self.preprocess_text(sentence1))
        vec2 = model.infer_vector(self.preprocess_text(sentence2))

        similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

        return similarity

    def create_fasttext_model(self, sentences):
        model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
        return model

    def sentence_similarity_fasttext(self, sentence1, sentence2, model):
        vec1 = model.wv[sentence1]
        vec2 = model.wv[sentence2]

        similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

        return similarity

    def sentence_similarity_sbert(self, sentence1, sentence2):
        embeddings1 = self.model_sbert.encode([sentence1])
        embeddings2 = self.model_sbert.encode([sentence2])

        similarity = np.dot(embeddings1[0], embeddings2[0]) / (np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0]))

        return similarity

    def sentence_similarity_distilbert(self, sentence1, sentence2):
        embeddings1 = self.model_distilbert.encode([sentence1])
        embeddings2 = self.model_distilbert.encode([sentence2])

        similarity = np.dot(embeddings1[0], embeddings2[0]) / (np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0]))

        return similarity

    def sentence_similarity_laser(self, sentence1, sentence2):
        embeddings1 = self.laser_model.embed_sentences([sentence1], lang='en')
        embeddings2 = self.laser_model.embed_sentences([sentence2], lang='en')

        similarity = np.dot(embeddings1[0], embeddings2[0]) / (np.linalg.norm(embeddings1[0]) * np.linalg.norm(embeddings2[0]))

        return similarity

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

def calculate_sentence_similarity(reference_sentence, student_answer):
    def get_synonyms_antonyms(word):
        synonyms = []
        antonyms = []

        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.append(lemma.name())
                if lemma.antonyms():
                    antonyms.append(lemma.antonyms()[0].name())

        return synonyms, antonyms

    def get_synonyms_antonyms_for_sentence(sentence):
        words = nltk.word_tokenize(sentence)

        synonyms_list = []
        antonyms_list = []

        for word in words:
            synonyms, antonyms = get_synonyms_antonyms(word)
            synonyms_list.append(synonyms)
            antonyms_list.append(antonyms)

        result = {
            "sentence": sentence,
            "words": words,
            "synonyms": dict(zip(words, synonyms_list)),
            "antonyms": dict(zip(words, antonyms_list))
        }

        return result

    result1 = get_synonyms_antonyms_for_sentence(reference_sentence)
    result2 = get_synonyms_antonyms_for_sentence(student_answer)

    synonyms1 = result1["synonyms"]
    synonyms2 = result2["synonyms"]

    antonyms1 = result1["antonyms"]
    antonyms2 = result2["antonyms"]

    synonyms1_set = set([syn for synonyms in synonyms1.values() for syn in synonyms])
    synonyms2_set = set([syn for synonyms in synonyms2.values() for syn in synonyms])

    antonyms1_set = set([ant for antonyms in antonyms1.values() for ant in antonyms])
    antonyms2_set = set([ant for antonyms in antonyms2.values() for ant in antonyms])

    set1 = synonyms1_set.union(antonyms1_set)
    set2 = synonyms2_set.union(antonyms2_set)

    doc1 = nlp(" ".join(set1))
    doc2 = nlp(" ".join(set2))

    similarity_score = doc1.similarity(doc2)

    return similarity_score





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:

# Example usage
similarity_models = SimilarityModels()

sentence1 = "happy to hear that"
sentence2 = "sad to hear that"

sentences = [
    "happy to hear that",
    "sad to hear that"
]

reference_sentence = "happy to hear that"
student_answer = "sad to hear that"




similarity1 = similarity_models.get_cosine_similarity(sentence1, sentence2)
print("\n Cosine Similarity:", similarity1)

similarity2 = similarity_models.jaccard_similarity(sentence1, sentence2)
print("\n Jaccard Similarity:", similarity2)

wmd_similarity = similarity_models.word_movers_distance(sentence1, sentence2)
print("\n Word Mover's Distance Similarity:", wmd_similarity)

similarity4 = similarity_models.sentence_similarity_transformers(sentence1, sentence2)
print("\n Sentence Transformers Similarity:", similarity4)

similarity5 = similarity_models.sentence_similarity_USE(sentence1, sentence2)
print("\n Universal Sentence Encoder Similarity:", similarity5)


model1 = similarity_models.create_doc2vec_model(sentences)
similarity7 = similarity_models.sentence_similarity_doc2vec(sentence1, sentence2, model1)
print("\n Doc2Vec Cosine Similarity:", similarity7)

model2 = similarity_models.create_fasttext_model(sentences)
similarity8 = similarity_models.sentence_similarity_fasttext(sentence1, sentence2, model2)
print("\n FastText Cosine Similarity:", similarity8)

similarity9 = similarity_models.sentence_similarity_sbert(sentence1, sentence2)
print("\n SBERT Similarity:", similarity9)

similarity10 = similarity_models.sentence_similarity_distilbert(sentence1, sentence2)
print("\n DistilBERT Similarity:", similarity10)

similarity11 = similarity_models.sentence_similarity_laser(sentence1, sentence2)
print("\n LASER Similarity:", similarity11)

# Example usage:


similarity_score = calculate_sentence_similarity(reference_sentence, student_answer)
print("\n Similarity Score:", similarity_score)

# Create a dictionary to store method names and their scores
similarity_scores = {
    "Cosine Similarity": similarity1,
    "Jaccard Similarity": similarity2,
    "Word Mover's Distance Similarity": wmd_similarity,
    "Sentence Transformers Similarity": similarity4,
    "Universal Sentence Encoder Similarity": similarity5,
    "Doc2Vec Cosine Similarity": similarity7,
    "FastText Cosine Similarity": similarity8,
    "SBERT Similarity": similarity9,
    "DistilBERT Similarity": similarity10,
    "LASER Similarity": similarity11,
    "anonyms_score":similarity_score
}

# Sort the dictionary by values (scores) in ascending order
sorted_scores = {k: v for k, v in sorted(similarity_scores.items(), key=lambda item: item[1])}

# Print the sorted scores
print("\n Sorted Similarity Scores:")
for method, score in sorted_scores.items():
    print(f"{method}: {score}")


 Cosine Similarity: 0.4999999999999999

 Jaccard Similarity: 0.3333333333333333

 Word Mover's Distance Similarity: 0.9300998605625794


  wmd = doc1.similarity(doc2)



 Sentence Transformers Similarity: 0.7601169347763062





 Universal Sentence Encoder Similarity: -0.6496285

 Doc2Vec Cosine Similarity: 0.1918851

 FastText Cosine Similarity: 0.5376856

 SBERT Similarity: 0.42703322

 DistilBERT Similarity: 0.5009988

 LASER Similarity: 0.8913571

 Similarity Score: 0.880781536207257

 Sorted Similarity Scores:
Universal Sentence Encoder Similarity: -0.6496285200119019
Doc2Vec Cosine Similarity: 0.1918850988149643
Jaccard Similarity: 0.3333333333333333
SBERT Similarity: 0.4270332157611847
Cosine Similarity: 0.4999999999999999
DistilBERT Similarity: 0.5009987950325012
FastText Cosine Similarity: 0.5376855731010437
Sentence Transformers Similarity: 0.7601169347763062
anonyms_score: 0.880781536207257
LASER Similarity: 0.8913571238517761
Word Mover's Distance Similarity: 0.9300998605625794


  similarity_score = doc1.similarity(doc2)


In [8]:
# Calculate the average similarity score
total_scores = len(sorted_scores)
avg_similarity = sum(sorted_scores.values()) / total_scores
print("\n Average Similarity Score:", avg_similarity)



 Average Similarity Score: 0.48215117740264035
