In [1]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# Impactful words and their weights
negative_impactful_words = {
    "not": 1.5, "never": 1.5, "but": 1.2, "although": 1.2, "however": 1.2,
    "instead": 1.2, "unless": 1.2, "despite": 1.2, "yet": 1.2, "except": 1.2,
    "only": 1.2, "even": 1.2, "rather": 1.2, "without": 1.2, "before": 1.1,
    "after": 1.1, "until": 1.1, "because": 1.1, "since": 1.1, "though": 1.1
}
positive_impactful_words = {
    "also": 0.9, "additionally": 0.9, "moreover": 0.9, "and": 0.9,
    "furthermore": 0.9, "similarly": 0.9, "likewise": 0.9, "especially": 0.9
}
# Function to adjust scores based on impactful words
def adjust_score_for_impactful_words(doc1, doc2, score):
    doc1_words = doc1.lower().split()
    doc2_words = doc2.lower().split()
    adjustment_factor = 1.0

    for word in negative_impactful_words:
        if word in doc1_words or word in doc2_words:
            adjustment_factor *= -(negative_impactful_words[word])

    for word in positive_impactful_words:
        if word in doc1_words or word in doc2_words:
            adjustment_factor *= positive_impactful_words[word]

    return score + adjustment_factor

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return adjust_score_for_impactful_words(doc1, doc2, score)

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return adjust_score_for_impactful_words(doc1, doc2, score)

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return adjust_score_for_impactful_words(doc1, doc2, score)

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return adjust_score_for_impactful_words(doc1, doc2, score)

# SBERT Similarity
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return adjust_score_for_impactful_words(doc1, doc2, score)

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]
    high_threshold = 0.8
    low_threshold = 0.2

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    bert = bert_similarity(doc1, doc2)[0][0]
    sbert = sbert_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    if all(score > high_threshold for score in normalized_scores):
        highest_two_avg = np.mean(sorted(normalized_scores)[-2:])
        final_score = highest_two_avg
        weights = [0.05, 0.05, 0.1, 0.4, 0.4]
    elif all(score < low_threshold for score in normalized_scores):
        lowest_two_avg = np.mean(sorted(normalized_scores)[:2])
        final_score = lowest_two_avg
        weights = [0.4, 0.3, 0.2, 0.05, 0.05]
    else:
        weighted_scores = np.array(normalized_scores) * np.array(weights)
        final_score = np.sum(weighted_scores)

    if bert < 0.4 or sbert < 0.4:
        final_score *= 0.5

    return final_score

# Calculate Similarity for Paragraph Pairs
def paragraph_similarity(paragraph1, paragraph2):
    sentences1 = sent_tokenize(paragraph1)
    sentences2 = sent_tokenize(paragraph2)

    scores = []
    for sentence1 in sentences1:
        for sentence2 in sentences2:
            score = hybrid_similarity(sentence1, sentence2)
            scores.append(score)

    # Calculate the average score
    average_score = np.mean(scores)
    return average_score

# Test with paragraph pairs
paragraph_pairs = [
    ("An operating system (OS) is system software that manages computer hardware and software resources, and provides common services for computer programsTime-sharing operating systems schedule tasks for efficient use of the system and may also include accounting software for cost allocation of processor time, mass storage, peripherals, and other resources.For hardware functions such as input and output and memory allocation, the operating system acts as an intermediary between programs and the computer hardware,[1][2] although the application code is usually executed directly by the hardware and frequently makes system calls to an OS function or is interrupted by it. Operating systems are found on many devices that contain a computer – from cellular phones and video game consoles to web servers and supercomputers. ",
     "An operating system (OS) is a type of system software that controls the hardware and software resources of a computer and offers standard functions to software applications.In addition to scheduling activities for optimal system use, time-sharing operating systems may incorporate accounting software for the cost allocation of peripherals, mass storage, CPU time, and other resources.The operating system serves as a bridge between programs and computer hardware for hardware functions like input and output and memory allocation, even though the application code is typically executed directly by the hardware and frequently calls an OS function or is interrupted by it. Operating systems are present in a wide range of computing devices, including web servers, supercomputers, mobile phones, and gaming consoles. ")]
# Calculate and print the similarity for each paragraph pair
for i, (paragraph1, paragraph2) in enumerate(paragraph_pairs):
    similarity_score = paragraph_similarity(paragraph1, paragraph2)
    print(f"Paragraph Pair {i + 1}:")
    print(f"  Paragraph 1: {paragraph1}")
    print(f"  Paragraph 2: {paragraph2}")
    print(f"  Average Similarity Score: {similarity_score}\n")


Paragraph Pair 1:
  Paragraph 1: An operating system (OS) is system software that manages computer hardware and software resources, and provides common services for computer programsTime-sharing operating systems schedule tasks for efficient use of the system and may also include accounting software for cost allocation of processor time, mass storage, peripherals, and other resources.For hardware functions such as input and output and memory allocation, the operating system acts as an intermediary between programs and the computer hardware,[1][2] although the application code is usually executed directly by the hardware and frequently makes system calls to an OS function or is interrupted by it. Operating systems are found on many devices that contain a computer – from cellular phones and video game consoles to web servers and supercomputers. 
  Paragraph 2: An operating system (OS) is a type of system software that controls the hardware and software resources of a computer and offers 

In [None]:
!pip install nltk




In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import nltk
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to remove the first negatively impactful word from a sentence
def remove_first_negative_impactful_word(doc):
    doc_words = doc.lower().split()
    for word in negative_impactful_words:
        if word in doc_words:
            doc_words.remove(word)
            break
    return ' '.join(doc_words)

# Function to adjust scores based on negatively impactful words
def adjust_score_for_impactful_words(doc):
    doc_words = doc.lower().split()
    total_negative_score = 0.0
    negative_count = 0

    for word, weight in negative_impactful_words.items():
        if word in doc_words:
            total_negative_score += weight
            negative_count += 1

    # Adjust the score based on the parity of the count of negative words
    adjustment_factor = (-1) ** negative_count * total_negative_score

    return adjustment_factor

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]

    # Remove the first negatively impactful word from doc1
    modified_doc1 = remove_first_negative_impactful_word(doc1)

    jaccard = jaccard_similarity(modified_doc1, doc2)
    tfidf = tfidf_similarity(modified_doc1, doc2)
    w2v = word2vec_similarity(modified_doc1, doc2)
    bert = bert_similarity(modified_doc1, doc2)[0][0]
    sbert = sbert_similarity(modified_doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    # Calculate the negative adjustment based on the original doc1
    negative_adjustment = adjust_score_for_impactful_words(doc1)
    final_score = total_similarity + negative_adjustment

    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        hybrid_score = hybrid_similarity(doc1, doc2)
        results.append({
            'pair': (doc1, doc2),
            'score': hybrid_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("I am not going to Delhi",
     "Not that I am going to Delhi")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Hybrid Similarity Score: {result['score']}\n")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Pair:
  Sentence 1: I am not going to Delhi
  Sentence 2: Not that I am going to Delhi
Hybrid Similarity Score: 0.2818344918997375



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('all-mpnet-base-v2')  # Updated model

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to remove the first negatively impactful word from a sentence
def remove_first_negative_impactful_word(doc):
    doc_words = doc.lower().split()
    for word in negative_impactful_words:
        if word in doc_words:
            doc_words.remove(word)
            break
    return ' '.join(doc_words)

# Function to adjust scores based on negatively impactful words
def adjust_score_for_impactful_words(doc):
    doc_words = doc.lower().split()
    total_negative_score = 0.0
    negative_count = 0

    for word, weight in negative_impactful_words.items():
        if word in doc_words:
            total_negative_score += weight
            negative_count += 1

    # Adjust the score based on the parity of the count of negative words
    adjustment_factor = (-1) ** negative_count * total_negative_score

    return adjustment_factor

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]

    # Remove the first negatively impactful word from doc1
    modified_doc1 = remove_first_negative_impactful_word(doc1)

    jaccard = jaccard_similarity(modified_doc1, doc2)
    tfidf = tfidf_similarity(modified_doc1, doc2)
    w2v = word2vec_similarity(modified_doc1, doc2)
    bert = bert_similarity(modified_doc1, doc2)[0][0]
    sbert = sbert_similarity(modified_doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    # Calculate the negative adjustment based on the original doc1
    negative_adjustment = adjust_score_for_impactful_words(doc1)
    final_score = total_similarity + negative_adjustment

    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        hybrid_score = hybrid_similarity(doc1, doc2)
        results.append({
            'pair': (doc1, doc2),
            'score': hybrid_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("I am not going to Delhi",
     "Not that I am going to Delhi")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Hybrid Similarity Score: {result['score']}\n")


Sentence Pair:
  Sentence 1: I am not going to Delhi
  Sentence 2: Not that I am going to Delhi
Hybrid Similarity Score: 0.130680463993453



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import re

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('paraphrase-mpnet-base-v2')  # Updated model

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to remove the first negatively impactful word from a sentence
def remove_first_negative_impactful_word(doc):
    doc_words = doc.lower().split()
    for word in negative_impactful_words:
        if word in doc_words:
            doc_words.remove(word)
            break
    return ' '.join(doc_words)

# Function to handle negations
def handle_negations(text):
    # Handle common negation forms
    negation_patterns = [
        (r"not\s+([a-zA-Z]+)", r"\1"),
        (r"n't\s+([a-zA-Z]+)", r"not \1")
    ]
    for pattern, replacement in negation_patterns:
        text = re.sub(pattern, replacement, text)
    return text

# Function to adjust scores based on negatively impactful words
def adjust_score_for_impactful_words(doc):
    doc_words = doc.lower().split()
    total_negative_score = 0.0
    negative_count = 0

    for word, weight in negative_impactful_words.items():
        if word in doc_words:
            total_negative_score += weight
            negative_count += 1

    # Adjust the score based on the parity of the count of negative words
    adjustment_factor = (-1) ** negative_count * total_negative_score

    return adjustment_factor

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]

    # Handle negations in doc1
    modified_doc1 = handle_negations(doc1)

    jaccard = jaccard_similarity(modified_doc1, doc2)
    tfidf = tfidf_similarity(modified_doc1, doc2)
    w2v = word2vec_similarity(modified_doc1, doc2)
    bert = bert_similarity(modified_doc1, doc2)[0][0]
    sbert = sbert_similarity(modified_doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    # Calculate the negative adjustment based on the original doc1
    negative_adjustment = adjust_score_for_impactful_words(doc1)
    final_score = total_similarity + negative_adjustment

    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        hybrid_score = hybrid_similarity(doc1, doc2)
        results.append({
            'pair': (doc1, doc2),
            'score': hybrid_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("not that I am not going to Delhi",
     "I am going to Delhi")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Hybrid Similarity Score: {result['score']}\n")


Sentence Pair:
  Sentence 1: not that I am not going to Delhi
  Sentence 2: I am going to Delhi
Hybrid Similarity Score: 0.3019795970168533



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import re

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('all-mpnet-base-v2')  # Updated model

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to remove the first negatively impactful word from a sentence
def remove_first_negative_impactful_word(doc):
    doc_words = doc.lower().split()
    for word in negative_impactful_words:
        if word in doc_words:
            doc_words.remove(word)
            break
    return ' '.join(doc_words)

# Function to handle negations
def handle_negations(text):
    # Handle common negation forms
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    clean_words = [word for word in words if word not in negations]
    return ' '.join(clean_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Function to adjust scores based on negatively impactful words
def adjust_score_for_impactful_words(doc):
    doc_words = doc.lower().split()
    total_negative_score = 0.0
    negative_count = 0

    for word, weight in negative_impactful_words.items():
        if word in doc_words:
            total_negative_score += weight
            negative_count += 1

    # Adjust the score based on the parity of the count of negative words
    adjustment_factor = (-1) ** negative_count * total_negative_score

    return adjustment_factor

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    bert = bert_similarity(doc1, doc2)[0][0]
    sbert = sbert_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity(modified_doc1, doc2)

    # Remove negation from sentence 2
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity(doc1, modified_doc2)

    # Remove negations from both and compare negation counts
    modified_both1 = handle_negations(doc1)
    modified_both2 = handle_negations(doc2)
    similarity_no_neg_both = hybrid_similarity(modified_both1, modified_both2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1 % 2 == negation_count2 % 2):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = 0

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        sim_no_neg1, sim_no_neg2, sim_adjusted = negation_aware_similarity(doc1, doc2)
        results.append({
            'pair': (doc1, doc2),
            'similarity_no_neg1': sim_no_neg1,
            'similarity_no_neg2': sim_no_neg2,
            'similarity_adjusted': sim_adjusted
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("I am  going to Delhi",
     "Not that I am going to Delhi")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Similarity without negations (sentence 1): {result['similarity_no_neg1']}")
    print(f"Similarity without negations (sentence 2): {result['similarity_no_neg2']}")
    print(f"Adjusted Similarity based on negation count parity: {result['similarity_adjusted']}\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Sentence Pair:
  Sentence 1: I am  going to Delhi
  Sentence 2: Not that I am going to Delhi
Similarity without negations (sentence 1): 0.505680463993453
Similarity without negations (sentence 2): 0.4817295464276516
Adjusted Similarity based on negation count parity: 0



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import re

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')
model_sbert = SentenceTransformer('all-mpnet-base-v2')  # Updated model

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to handle negations
def handle_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    new_words = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negations:
            if i + 1 < len(words):
                antonyms = []
                for syn in wn.synsets(words[i + 1]):
                    for l in syn.lemmas():
                        if l.antonyms():
                            antonyms.append(l.antonyms()[0].name())
                if antonyms:
                    new_words.append(antonyms[0])
                    skip_next = True
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        elif not skip_next:
            new_words.append(word)
        else:
            skip_next = False
    return ' '.join(new_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# BERT Similarity
def bert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2.T) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.1, 0.15, 0.35, 0.35]

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    bert = bert_similarity(doc1, doc2)[0][0]
    sbert = sbert_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, bert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1 and calculate similarity
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity(modified_doc1, doc2)

    # Remove negation from sentence 2 and calculate similarity
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity(doc1, modified_doc2)

    # Remove negations from both and calculate similarity
    similarity_no_neg_both = hybrid_similarity(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1 % 2 == negation_count2 % 2):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.5  # Reduce score if negation parities differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Final Similarity Score Calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted):
    # Weight the three similarity scores
    final_score = max(sim_no_neg1, sim_no_neg2, sim_adjusted)
    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        sim_no_neg1, sim_no_neg2, sim_adjusted = negation_aware_similarity(doc1, doc2)
        final_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted)
        results.append({
            'pair': (doc1, doc2),
            'similarity_no_neg1': sim_no_neg1,
            'similarity_no_neg2': sim_no_neg2,
            'similarity_adjusted': sim_adjusted,
            'final_similarity': final_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("I am not going to Delhi",
     "Not that I am going to Delhi")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Similarity without negations (sentence 1): {result['similarity_no_neg1']}")
    print(f"Similarity without negations (sentence 2): {result['similarity_no_neg2']}")
    print(f"Adjusted Similarity based on negation count parity: {result['similarity_adjusted']}")
    print(f"Final Similarity Score: {result['final_similarity']}\n")


Sentence Pair:
  Sentence 1: I am not going to Delhi
  Sentence 2: Not that I am going to Delhi
Similarity without negations (sentence 1): 0.5138911491547965
Similarity without negations (sentence 2): 0.18936591063914004
Adjusted Similarity based on negation count parity: 0.5138911491547965
Final Similarity Score: 0.5138911491547965



In [None]:
!pip install transformers torch




In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
import re
from statistics import mean

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_bert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')  # NegBERT model
model_sbert = SentenceTransformer('all-mpnet-base-v2')

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to handle negations
def handle_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    new_words = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negations:
            if i + 1 < len(words):
                antonyms = []
                for syn in wn.synsets(words[i + 1]):
                    for l in syn.lemmas():
                        if l.antonyms():
                            antonyms.append(l.antonyms()[0].name())
                if antonyms:
                    new_words.append(antonyms[0])
                    skip_next = True
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        elif not skip_next:
            new_words.append(word)
        else:
            skip_next = False
    return ' '.join(new_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# NegBERT Similarity
def negbert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity
def hybrid_similarity(doc1, doc2):
    weights = [0.05, 0.05, 0.1, 0.4, 0.4]  # Adjusted weights, increased for NegBERT and SBERT

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    negbert = negbert_similarity(doc1, doc2)
    sbert = sbert_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, negbert, sbert]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1 and calculate similarity
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity(modified_doc1, doc2)

    # Remove negation from sentence 2 and calculate similarity
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity(doc1, modified_doc2)

    # Remove negations from both and calculate similarity
    similarity_no_neg_both = hybrid_similarity(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1 % 2 == negation_count2 % 2):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.5  # Reduce score if negation parities differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Final Similarity Score Calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted):
    # Weight the three similarity scores
    final_score = mean([sim_no_neg1, sim_no_neg2, sim_adjusted])
    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        sim_no_neg1, sim_no_neg2, sim_adjusted = negation_aware_similarity(doc1, doc2)
        final_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted)
        results.append({
            'pair': (doc1, doc2),
            'similarity_no_neg1': sim_no_neg1,
            'similarity_no_neg2': sim_no_neg2,
            'similarity_adjusted': sim_adjusted,
            'final_similarity': final_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("World is not that weak and not that bad ",
     "World is strong and good")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Similarity without negations (sentence 1): {result['similarity_no_neg1']}")
    print(f"Similarity without negations (sentence 2): {result['similarity_no_neg2']}")
    print(f"Adjusted Similarity based on negation count parity: {result['similarity_adjusted']}")
    print(f"Final Similarity Score: {result['final_similarity']}\n")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence Pair:
  Sentence 1: World is not that weak and not that bad 
  Sentence 2: World is strong and good
Similarity without negations (sentence 1): 0.8261225066158095
Similarity without negations (sentence 2): 0.8261225066158095
Adjusted Similarity based on negation count parity: 0.8261225066158095
Final Similarity Score: 0.8261225066158095



In [None]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
import re
from statistics import mean

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_bert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')  # NegBERT model
model_sbert = SentenceTransformer('all-mpnet-base-v2')
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaModel.from_pretrained('roberta-base')

# Load Spacy model for NER and syntactic parsing
nlp = spacy.load("en_core_web_sm")

# Load Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to retrieve negatively impactful words from SentiWordNet
def get_negative_impactful_words_from_sentiwordnet():
    negative_impactful_words = {}
    for synset in list(wn.all_synsets()):
        if synset.pos() in ['a', 'r']:
            senti_synset = swn.senti_synset(synset.name())
            if senti_synset.neg_score() > 0.5:
                weight = 1 - senti_synset.neg_score()
                negative_impactful_words[synset.lemmas()[0].name()] = weight
    return negative_impactful_words

# Retrieve impactful words
negative_impactful_words = get_negative_impactful_words_from_sentiwordnet()

# Function to handle negations
def handle_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    new_words = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negations:
            if i + 1 < len(words):
                antonyms = []
                for syn in wn.synsets(words[i + 1]):
                    for l in syn.lemmas():
                        if l.antonyms():
                            antonyms.append(l.antonyms()[0].name())
                if antonyms:
                    new_words.append(antonyms[0])
                    skip_next = True
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        elif not skip_next:
            new_words.append(word)
        else:
            skip_next = False
    return ' '.join(new_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    score = jaccard_score(X.toarray()[0], X.toarray()[1])
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return score

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# NegBERT Similarity
def negbert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return score

# RoBERTa Similarity
def roberta_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_roberta(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_roberta(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# Named Entity Recognition (NER) Similarity
def named_entity_similarity(doc1, doc2):
    doc1_ents = set([ent.text for ent in nlp(doc1).ents])
    doc2_ents = set([ent.text for ent in nlp(doc2).ents])
    common_ents = doc1_ents.intersection(doc2_ents)
    total_ents = doc1_ents.union(doc2_ents)
    if not total_ents:
        return 0.0
    return len(common_ents) / len(total_ents)

# Sentiment Similarity
def sentiment_similarity(doc1, doc2):
    sentiment1 = sia.polarity_scores(doc1)['compound']
    sentiment2 = sia.polarity_scores(doc2)['compound']
    return 1 - abs(sentiment1 - sentiment2)  # Closer sentiment scores result in higher similarity

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity with New Features
def hybrid_similarity_with_new_features(doc1, doc2):
    weights = [0.05, 0.05, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1]  # Adjust weights accordingly

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    negbert = negbert_similarity(doc1, doc2)
    sbert = sbert_similarity(doc1, doc2)
    roberta = roberta_similarity(doc1, doc2)
    named_entity = named_entity_similarity(doc1, doc2)
    sentiment = sentiment_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, negbert, sbert, roberta, named_entity, sentiment]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1 and calculate similarity
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity_with_new_features(modified_doc1, doc2)

    # Remove negation from sentence 2 and calculate similarity
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity_with_new_features(doc1, modified_doc2)

    # Remove negations from both and calculate similarity
    similarity_no_neg_both = hybrid_similarity_with_new_features(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1  == negation_count2 ):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.5  # Reduce score if negation parities differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Final Similarity Score Calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted):
    # Weight the three similarity scores
    final_score = mean([sim_no_neg1, sim_no_neg2, sim_adjusted])
    return final_score

# Calculate Similarity for Sentence Pairs
def calculate_similarity_for_pairs(sentence_pairs):
    results = []
    for i, (doc1, doc2) in enumerate(sentence_pairs):
        sim_no_neg1, sim_no_neg2, sim_adjusted = negation_aware_similarity(doc1, doc2)
        final_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted)
        results.append({
            'pair': (doc1, doc2),
            'similarity_no_neg1': sim_no_neg1,
            'similarity_no_neg2': sim_no_neg2,
            'similarity_adjusted': sim_adjusted,
            'final_similarity': final_score
        })
    return results

# Sentence pairs
sentence_pairs = [
    ("I will eat apple and oranges daily",
     "I will not only eat apple but also oranges daily")]

similarity_results = calculate_similarity_for_pairs(sentence_pairs)
for result in similarity_results:
    print(f"Sentence Pair:\n  Sentence 1: {result['pair'][0]}\n  Sentence 2: {result['pair'][1]}")
    print(f"Similarity without negations (sentence 1): {result['similarity_no_neg1']}")
    print(f"Similarity without negations (sentence 2): {result['similarity_no_neg2']}")
    print(f"Adjusted Similarity based on negation count parity: {result['similarity_adjusted']}")
    print(f"Final Similarity Score: {result['final_similarity']}\n")




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentence Pair:
  Sentence 1: I will eat apple and oranges daily
  Sentence 2: I will not only eat apple but also oranges daily
Similarity without negations (sentence 1): 0.7030918405928834
Similarity without negations (sentence 2): 0.7035442756095154
Adjusted Similarity based on negation count parity: 0.3518951915938965
Final Similarity Score: 0.5861771025987651



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
import re
from statistics import mean

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_bert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')  # NegBERT model
model_sbert = SentenceTransformer('all-mpnet-base-v2')
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaModel.from_pretrained('roberta-base')

# Load Spacy model for NER and syntactic parsing
nlp = spacy.load("en_core_web_sm")

# Load Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to handle negations
def handle_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    new_words = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negations:
            if i + 1 < len(words):
                antonyms = []
                for syn in wn.synsets(words[i + 1]):
                    for l in syn.lemmas():
                        if l.antonyms():
                            antonyms.append(l.antonyms()[0].name())
                if antonyms:
                    new_words.append(antonyms[0])
                    skip_next = True
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        elif not skip_next:
            new_words.append(word)
        else:
            skip_next = False
    return ' '.join(new_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    intersection = X.toarray()[0] & X.toarray()[1]
    union = X.toarray()[0] | X.toarray()[1]
    score = sum(intersection) / sum(union) if sum(union) > 0 else 0.0
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# NegBERT Similarity
def negbert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    return cosine_similarity([vec1], [vec2])[0][0]

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# RoBERTa Similarity
def roberta_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_roberta(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_roberta(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    return cosine_similarity([vec1], [vec2])[0][0]

# Named Entity Recognition (NER) Similarity
def named_entity_similarity(doc1, doc2):
    doc1_ents = set([ent.text for ent in nlp(doc1).ents])
    doc2_ents = set([ent.text for ent in nlp(doc2).ents])
    common_ents = doc1_ents.intersection(doc2_ents)
    total_ents = doc1_ents.union(doc2_ents)
    return len(common_ents) / len(total_ents) if total_ents else 0.0

# Sentiment Similarity
def sentiment_similarity(doc1, doc2):
    sentiment1 = sia.polarity_scores(doc1)['compound']
    sentiment2 = sia.polarity_scores(doc2)['compound']
    return 1 - abs(sentiment1 - sentiment2)  # Closer sentiment scores result in higher similarity

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity with New Features
def hybrid_similarity_with_new_features(doc1, doc2):
    weights = [0.05, 0.05, 0.1, 0.2, 0.2, 0.1, 0.1, 0.1]  # Adjust weights accordingly

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    negbert = negbert_similarity(doc1, doc2)
    sbert = sbert_similarity(doc1, doc2)
    roberta = roberta_similarity(doc1, doc2)
    named_entity = named_entity_similarity(doc1, doc2)
    sentiment = sentiment_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, negbert, sbert, roberta, named_entity, sentiment]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1 and calculate similarity
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity_with_new_features(modified_doc1, doc2)

    # Remove negation from sentence 2 and calculate similarity
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity_with_new_features(doc1, modified_doc2)

    # Remove negations from both and calculate similarity
    similarity_no_neg_both = hybrid_similarity_with_new_features(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1 == negation_count2):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.5  # Reduce score if negation parities differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Final Similarity Score Calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted):
    # Weight the three similarity scores
    final_score = mean([sim_no_neg1, sim_no_neg2, sim_adjusted])
    return final_score

# Function to calculate similarity scores from an Excel file
def calculate_similarity_from_excel(file_path):
    # Read the Excel file
    df = pd.read_excel(file_path)

    # Ensure there are columns 'Sentence 1' and 'Sentence 2' in the Excel file
    if 'Sentence 1' not in df.columns or 'Sentence 2' not in df.columns:
        raise ValueError("The Excel file must contain 'Sentence 1' and 'Sentence 2' columns.")

    # Process the file and calculate similarity scores
    df['Generated Similarity Score'] = df.apply(
        lambda row: final_similarity_score(
            *negation_aware_similarity(row['Sentence 1'], row['Sentence 2'])
        ), axis=1
    )

    # Save the updated DataFrame back to Excel
    output_file_path = file_path.replace('.xlsx', '_with_similarity_scores.xlsx')
    df.to_excel(output_file_path, index=False)
    return output_file_path

# Example usage of the function
file_path = '/sentence_pairs_with_predefined_scores_with_similarity_scores.xlsx'  # Replace with the path to your uploaded Excel file
output_file = calculate_similarity_from_excel(file_path)
print(f'Similarity scores saved to {output_file}')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import numpy as np
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy
import re
from statistics import mean

# Load models
model_w2v = api.load('word2vec-google-news-300')
tokenizer_bert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_bert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')  # NegBERT model
model_sbert = SentenceTransformer('all-mpnet-base-v2')
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaModel.from_pretrained('roberta-base')

# Load Spacy model for NER and syntactic parsing
nlp = spacy.load("en_core_web_sm")

# Load Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Function to handle negations
def handle_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    new_words = []
    skip_next = False
    for i, word in enumerate(words):
        if word in negations:
            if i + 1 < len(words):
                antonyms = []
                for syn in wn.synsets(words[i + 1]):
                    for l in syn.lemmas():
                        if l.antonyms():
                            antonyms.append(l.antonyms()[0].name())
                if antonyms:
                    new_words.append(antonyms[0])
                    skip_next = True
                else:
                    new_words.append(word)
            else:
                new_words.append(word)
        elif not skip_next:
            new_words.append(word)
        else:
            skip_next = False
    return ' '.join(new_words)

# Function to count negation words
def count_negations(text):
    negations = ["not", "no", "never", "n't"]
    words = text.lower().split()
    count = sum(1 for word in words if word in negations)
    return count

# Jaccard Similarity
def jaccard_similarity(doc1, doc2):
    vectorizer = CountVectorizer(binary=True)
    X = vectorizer.fit_transform([doc1, doc2])
    intersection = X.toarray()[0] & X.toarray()[1]
    union = X.toarray()[0] | X.toarray()[1]
    score = sum(intersection) / sum(union) if sum(union) > 0 else 0.0
    return score

# TF-IDF Similarity
def tfidf_similarity(doc1, doc2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([doc1, doc2])
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Word2Vec Similarity
def word2vec_similarity(doc1, doc2):
    def document_vector(document):
        words = document.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model_w2v.vector_size)

    vec1 = document_vector(doc1)
    vec2 = document_vector(doc2)
    score = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)) if np.linalg.norm(vec1) != 0 and np.linalg.norm(vec2) != 0 else 0.0
    return score

# NegBERT Similarity
def negbert_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_bert(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_bert(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    return cosine_similarity([vec1], [vec2])[0][0]

# SBERT Similarity using the new model
def sbert_similarity(doc1, doc2):
    embeddings = model_sbert.encode([doc1, doc2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# RoBERTa Similarity
def roberta_similarity(doc1, doc2):
    def document_embedding(document):
        inputs = tokenizer_roberta(document, return_tensors='pt', truncation=True, padding=True, max_length=512)
        outputs = model_roberta(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

    vec1 = document_embedding(doc1)
    vec2 = document_embedding(doc2)
    return cosine_similarity([vec1], [vec2])[0][0]

# Named Entity Recognition (NER) Similarity
def named_entity_similarity(doc1, doc2):
    doc1_ents = set([ent.text for ent in nlp(doc1).ents])
    doc2_ents = set([ent.text for ent in nlp(doc2).ents])
    common_ents = doc1_ents.intersection(doc2_ents)
    total_ents = doc1_ents.union(doc2_ents)
    return len(common_ents) / len(total_ents) if total_ents else 0.0

# Sentiment Similarity
def sentiment_similarity(doc1, doc2):
    sentiment1 = sia.polarity_scores(doc1)['compound']
    sentiment2 = sia.polarity_scores(doc2)['compound']
    return 1 - abs(sentiment1 - sentiment2)  # Closer sentiment scores result in higher similarity

# Normalize Scores
def normalize_scores(scores):
    scaler = MinMaxScaler()
    return scaler.fit_transform(np.array(scores).reshape(-1, 1)).flatten()

# Hybrid Similarity with New Features
def hybrid_similarity_with_new_features(doc1, doc2):
    # Further tuned weights for improved accuracy
    weights = [0.09, 0.07, 0.13, 0.2, 0.2, 0.11, 0.1, 0.1]  # Adjusted weights

    jaccard = jaccard_similarity(doc1, doc2)
    tfidf = tfidf_similarity(doc1, doc2)
    w2v = word2vec_similarity(doc1, doc2)
    negbert = negbert_similarity(doc1, doc2)
    sbert = sbert_similarity(doc1, doc2)
    roberta = roberta_similarity(doc1, doc2)
    named_entity = named_entity_similarity(doc1, doc2)
    sentiment = sentiment_similarity(doc1, doc2)

    scores = [jaccard, tfidf, w2v, negbert, sbert, roberta, named_entity, sentiment]
    normalized_scores = normalize_scores(scores)

    weighted_scores = np.array(normalized_scores) * np.array(weights)
    total_similarity = np.sum(weighted_scores)

    return total_similarity

# Negation-aware Similarity
def negation_aware_similarity(doc1, doc2):
    # Remove negation from sentence 1 and calculate similarity
    modified_doc1 = handle_negations(doc1)
    similarity_no_neg1 = hybrid_similarity_with_new_features(modified_doc1, doc2)

    # Remove negation from sentence 2 and calculate similarity
    modified_doc2 = handle_negations(doc2)
    similarity_no_neg2 = hybrid_similarity_with_new_features(doc1, modified_doc2)

    # Remove negations from both and calculate similarity
    similarity_no_neg_both = hybrid_similarity_with_new_features(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if (negation_count1 == negation_count2):
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.5  # Reduce score if negation parities differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted

# Final Similarity Score Calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted):
    # Weight the three similarity scores
    final_score = mean([sim_no_neg1, sim_no_neg2, sim_adjusted])
    return final_score

# Function to calculate similarity scores from an Excel file
def calculate_similarity_from_excel(file_path):
    # Read the Excel file
    df = pd.read_excel(file_path)

    # Ensure there are columns 'Sentence 1' and 'Sentence 2' in the Excel file
    if 'Sentence 1' not in df.columns or 'Sentence 2' not in df.columns:
        raise ValueError("The Excel file must contain 'Sentence 1' and 'Sentence 2' columns.")

    # Process the file and calculate similarity scores
    df['Generated Similarity Score'] = df.apply(
        lambda row: final_similarity_score(
            *negation_aware_similarity(row['Sentence 1'], row['Sentence 2'])
        ), axis=1
    )

    # Save the updated DataFrame back to Excel
    output_file_path = file_path.replace('.xlsx', '_with_similarity_scores.xlsx')
    df.to_excel(output_file_path, index=False)
    return output_file_path

# Example usage of the function
file_path = '/sentence_pairs_with_predefined_scores.xlsx'  # Replace with the path to your uploaded Excel file
output_file = calculate_similarity_from_excel(file_path)
print(f'Similarity scores saved to {output_file}')



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Similarity scores saved to /sentence_pairs_with_predefined_scores_with_similarity_scores.xlsx


In [3]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [4]:
from datasets import load_dataset

ds = load_dataset("mteb/stsbenchmark-sts")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/278k [00:00<?, ?B/s]

validation.jsonl.gz:   0%|          | 0.00/86.4k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/63.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [5]:
# Access the train split and print a few examples
train_data = ds['train']
print(train_data[0])


{'split': 'train', 'genre': 'main-captions', 'dataset': 'MSRvid', 'year': '2012test', 'sid': '0001', 'score': 5.0, 'sentence1': 'A plane is taking off.', 'sentence2': 'An air plane is taking off.'}


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
import gensim.downloader as api
import numpy as np
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd

# Load pre-trained models
tokenizer_negbert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_negbert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')

tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base', clean_up_tokenization_spaces=False)
model_roberta = RobertaModel.from_pretrained('roberta-base')

# Freeze RoBERTa pooler layers to prevent the warning and avoid untrained parts of the model
for param in model_roberta.pooler.parameters():
    param.requires_grad = False

model_sbert = SentenceTransformer('all-mpnet-base-v2')

# Load Word2Vec model
model_w2v = api.load('word2vec-google-news-300')

# Function to batch process embeddings from different models
def get_negbert_embeddings_batch(sentences):
    inputs = tokenizer_negbert(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_negbert(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def get_roberta_embeddings_batch(sentences):
    inputs = tokenizer_roberta(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_roberta(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def get_sbert_embeddings_batch(sentences):
    embeddings = model_sbert.encode(sentences, batch_size=len(sentences))
    return torch.tensor(embeddings)

def get_word2vec_embeddings_batch(sentences):
    word_vectors_batch = []
    for sentence in sentences:
        words = sentence.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        if len(word_vectors) > 0:
            word_vectors_batch.append(torch.tensor(np.mean(word_vectors, axis=0)))
        else:
            word_vectors_batch.append(torch.zeros(300))  # Word2Vec has 300 dimensions
    return torch.stack(word_vectors_batch)

# Prepare embeddings and dataset
def prepare_embeddings_batch(sentences1, sentences2):
    # Get embeddings from all the models in batch
    negbert_emb1 = get_negbert_embeddings_batch(sentences1)
    roberta_emb1 = get_roberta_embeddings_batch(sentences1)
    sbert_emb1 = get_sbert_embeddings_batch(sentences1)
    w2v_emb1 = get_word2vec_embeddings_batch(sentences1)

    negbert_emb2 = get_negbert_embeddings_batch(sentences2)
    roberta_emb2 = get_roberta_embeddings_batch(sentences2)
    sbert_emb2 = get_sbert_embeddings_batch(sentences2)
    w2v_emb2 = get_word2vec_embeddings_batch(sentences2)

    # Concatenate embeddings from all models
    emb1 = torch.cat([negbert_emb1, roberta_emb1, sbert_emb1, w2v_emb1], dim=-1)
    emb2 = torch.cat([negbert_emb2, roberta_emb2, sbert_emb2, w2v_emb2], dim=-1)

    return emb1, emb2

# MSE loss function
def mse_loss(y_true, y_pred):
    return nn.MSELoss()(y_true, y_pred)

# Siamese Network with Self-Attention layer and Sigmoid output
class SiameseNetworkWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(SiameseNetworkWithAttention, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=4, batch_first=True)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.fc1 = nn.Linear(hidden_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward_one(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))
        lstm_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        lstm_out = lstm_out[:, -1, :]
        lstm_out = self.layer_norm(lstm_out)
        x = self.relu(self.fc1(lstm_out))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        return x

    def forward(self, x1, x2):
        output1 = self.forward_one(x1)
        output2 = self.forward_one(x2)
        distance = torch.abs(output1 - output2)
        score = self.fc3(distance)
        return torch.sigmoid(score)  # Constrain output between 0 and 1

# Reduce dataset to manage GPU exhaustion
file_path = '/content/SRIP-Dataset.csv'
custom_dataset = pd.read_csv(file_path)
def sample_dataset(custom_dataset, num_samples):
    sampled_dataset = custom_dataset.sample(n=num_samples).reset_index(drop=True)
    return {
        'sentence1': sampled_dataset['Sentence 1'].tolist(),
        'sentence2': sampled_dataset['Sentence 2'].tolist(),
        'score': (sampled_dataset['Similarity Score'] / 100).tolist()  # Normalize similarity scores to 0-1
    }

# Load dataset and reduce it for training and testing

train_data = sample_dataset(custom_dataset, 800)  # Reduce training data size
test_data = sample_dataset(custom_dataset, 200)  # Reduce test data size

# Custom Dataset class
class SentencePairDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset['sentence1'])

    def __getitem__(self, idx):
        sentence1 = self.dataset['sentence1'][idx]
        sentence2 = self.dataset['sentence2'][idx]
        score = self.dataset['score'][idx]
        return sentence1, sentence2, score

# Instantiate the Siamese network
input_dim = get_negbert_embeddings_batch(["dummy sentence"]).shape[1] + \
            get_roberta_embeddings_batch(["dummy sentence"]).shape[1] + \
            get_sbert_embeddings_batch(["dummy sentence"]).shape[1] + \
            300  # Word2Vec has 300 dimensions
hidden_dim = 128  # Reduced hidden dimension for LSTM

model = SiameseNetworkWithAttention(input_dim, hidden_dim)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loss and optimizer
optimizer = optim.AdamW(model.parameters(), lr=0.0001)  # Lower learning rate for better convergence
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# DataLoader setup with reduced batch size
batch_size = 8  # Reduce batch size to avoid GPU exhaustion
train_loader = DataLoader(SentencePairDataset(train_data), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(SentencePairDataset(test_data), batch_size=batch_size, shuffle=False)

# Training loop with gradient accumulation
def train_model(num_epochs=15, accumulation_steps=6):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        optimizer.zero_grad()  # Zero the gradients at the start
        for i, (sentence1, sentence2, predefined_similarity) in enumerate(train_loader):
            # Prepare embeddings
            emb1, emb2 = prepare_embeddings_batch(sentence1, sentence2)
            emb1, emb2 = emb1.to(device), emb2.to(device)
            predefined_similarity = predefined_similarity.float().to(device)

            # Forward pass
            output = model(emb1, emb2).squeeze()
            loss = mse_loss(predefined_similarity, output)
            loss = loss / accumulation_steps  # Normalize loss

            # Backward pass
            loss.backward()

            # Gradient accumulation
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            total_loss += loss.item()

        scheduler.step()
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss:.4f}")

train_model(num_epochs=15)

def evaluate_model(loader):
    model.eval()
    predefined_scores = []
    calculated_scores = []
    with torch.no_grad():
        for sentence1, sentence2, predefined_similarity in loader:
            emb1, emb2 = prepare_embeddings_batch(sentence1, sentence2)
            emb1, emb2 = emb1.to(device), emb2.to(device)
            predefined_similarity = predefined_similarity.float().to(device)
            output = model(emb1, emb2).squeeze()
            predefined_scores.extend(predefined_similarity.cpu().tolist())
            calculated_scores.extend(output.cpu().tolist())
    # Convert predefined scores and calculated scores to binary labels based on a threshold
    threshold = 0.5  # Example threshold for converting to binary (adjust as needed)
    actual_labels = [1 if score >= threshold else 0 for score in predefined_scores]
    predicted_labels = [1 if score >= threshold else 0 for score in calculated_scores]
    # Calculate AUC-ROC score
    auc = roc_auc_score(actual_labels, calculated_scores)
    print(f"AUC-ROC: {auc:.4f}")
    # Calculate binary classification accuracy
    accuracy = accuracy_score(actual_labels, predicted_labels)
    print(f"Binary Classification Accuracy: {accuracy:.4f}")
    # Optionally, plot the ROC curve
    fpr, tpr, _ = roc_curve(actual_labels, calculated_scores)
    plt.plot(fpr, tpr, label=f"AUC-ROC: {auc:.4f}")
    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal for random guessing
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()
    mae = mean_absolute_error(predefined_scores, calculated_scores)
    mse = mean_squared_error(predefined_scores, calculated_scores)
    r2 = r2_score(predefined_scores, calculated_scores)
    print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R²: {r2:.4f}")
# Evaluate model on test data
evaluate_model(test_loader)


def test_with_custom_input(sentence1, sentence2):
    model.eval()

    # Prepare embeddings for custom input
    emb1, emb2 = prepare_embeddings_batch([sentence1], [sentence2])
    emb1, emb2 = emb1.to(device), emb2.to(device)

    # Forward pass
    with torch.no_grad():
        output = model(emb1, emb2).squeeze()

    # Predicted similarity score
    similarity_score = output.item()
    print(f"Predicted similarity score: {similarity_score:.4f}")

    # Optionally, classify based on a threshold
    threshold = 0.5  # Example threshold (adjust as needed)
    predicted_label = 1 if similarity_score >= threshold else 0
    print(f"Predicted label (binary): {predicted_label}")

# Example of testing with custom sentences
custom_sentence1 = "I am going to delhi"
custom_sentence2 = "I am not going to delhi"

test_with_custom_input(custom_sentence1, custom_sentence2)
torch.save(model.state_dict(), "/content/pretrained_siamese_model.pth")

In [None]:
import torch
import torch.nn as nn
import re
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
import gensim.downloader as api
import spacy
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.stats import zscore

# Load SpaCy model for NER and syntactic parsing
nlp = spacy.load("en_core_web_sm")

# Load Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Extract negative sentiment words from SentiWordNet
negative_sentiment_words = set()
for synset in list(swn.all_senti_synsets()):
    if synset.neg_score() > 0.5:  # Threshold for significant negativity
        for lemma in synset.synset.lemmas():
            negative_sentiment_words.add(lemma.name())

# Function to handle negations using SentiWordNet's negative sentiment words
def handle_negations(text):
    words = text.lower().split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in negative_sentiment_words and i + 1 < len(words):
            i += 1
        else:
            new_words.append(words[i])
        i += 1
    return ' '.join(new_words)

# Function to count negation words using SentiWordNet's negative sentiment words
def count_negations(text):
    words = text.lower().split()
    return sum(1 for word in words if word in negative_sentiment_words)

# Define the Siamese network model class
class SiameseNetworkWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(SiameseNetworkWithAttention, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=4, batch_first=True)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.fc1 = nn.Linear(hidden_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward_one(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))
        lstm_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        lstm_out = lstm_out[:, -1, :]
        lstm_out = self.layer_norm(lstm_out)
        x = self.relu(self.fc1(lstm_out))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        return x

    def forward(self, x1, x2):
        output1 = self.forward_one(x1)
        output2 = self.forward_one(x2)
        distance = torch.abs(output1 - output2)
        score = self.fc3(distance)
        return torch.sigmoid(score)  # Constrain output between 0 and 1

# Load your pre-trained Siamese model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dim = 2604  # Assuming combined input dimension from the embeddings
hidden_dim = 128

# Instantiate the model and load the weights
model = SiameseNetworkWithAttention(input_dim, hidden_dim).to(device)
model.load_state_dict(torch.load("/content/pretrained_siamese_model.pth", map_location=device))
model.eval()  # Set the model to evaluation mode

# Load pre-trained models
tokenizer_negbert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_negbert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')

tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaModel.from_pretrained('roberta-base')

model_sbert = SentenceTransformer('all-mpnet-base-v2')
model_w2v = api.load('word2vec-google-news-300')

# Function to get NegBERT embeddings
def get_negbert_embeddings_batch(sentences):
    inputs = tokenizer_negbert(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_negbert(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Function to get RoBERTa embeddings
def get_roberta_embeddings_batch(sentences):
    inputs = tokenizer_roberta(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_roberta(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Function to get SBERT embeddings
def get_sbert_embeddings_batch(sentences):
    embeddings = model_sbert.encode(sentences, batch_size=len(sentences))
    return torch.tensor(embeddings)

# Function to get Word2Vec embeddings
def get_word2vec_embeddings_batch(sentences):
    word_vectors_batch = []
    for sentence in sentences:
        words = sentence.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        if len(word_vectors) > 0:
            word_vectors_batch.append(torch.tensor(np.mean(word_vectors, axis=0)))
        else:
            word_vectors_batch.append(torch.zeros(300))  # Word2Vec has 300 dimensions
    return torch.stack(word_vectors_batch)

# Function to prepare embeddings for similarity calculation
def prepare_embeddings_batch(sentences1, sentences2):
    negbert_emb1 = get_negbert_embeddings_batch(sentences1)
    roberta_emb1 = get_roberta_embeddings_batch(sentences1)
    sbert_emb1 = get_sbert_embeddings_batch(sentences1)
    w2v_emb1 = get_word2vec_embeddings_batch(sentences1)

    negbert_emb2 = get_negbert_embeddings_batch(sentences2)
    roberta_emb2 = get_roberta_embeddings_batch(sentences2)
    sbert_emb2 = get_sbert_embeddings_batch(sentences2)
    w2v_emb2 = get_word2vec_embeddings_batch(sentences2)

    emb1 = torch.cat([negbert_emb1, roberta_emb1, sbert_emb1, w2v_emb1], dim=-1)
    emb2 = torch.cat([negbert_emb2, roberta_emb2, sbert_emb2, w2v_emb2], dim=-1)

    return emb1, emb2

# Function to calculate similarity using the pre-trained Siamese network
def siamese_network_similarity(sentence1, sentence2):
    emb1, emb2 = prepare_embeddings_batch([sentence1], [sentence2])
    emb1, emb2 = emb1.to(device), emb2.to(device)

    with torch.no_grad():
        output = model(emb1, emb2).squeeze()
    return output.item()

# Refactored negation-aware similarity using the pre-trained Siamese Network similarity
def negation_aware_similarity(doc1, doc2):
    modified_doc1 = handle_negations(doc1)
    modified_doc2 = handle_negations(doc2)

    similarity_no_neg1 = siamese_network_similarity(modified_doc1, doc2)
    similarity_no_neg2 = siamese_network_similarity(doc1, modified_doc2)
    similarity_no_neg_both = siamese_network_similarity(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if negation_count1 == negation_count2:
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.7  # Lower score if negation counts differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted, negation_count1, negation_count2

# Final similarity score calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2):
    negation_even_odd_match = (neg_count1 % 2 == neg_count2 % 2)
    if negation_even_odd_match:
        combined_score = max(sim_no_neg1, sim_no_neg2, sim_adjusted)
    else:
        combined_score = min(sim_no_neg1, sim_no_neg2, sim_adjusted)
    return combined_score
# Function to split sentences using conjunctions
def split_using_conjunctions(sentence):
    doc = nlp(sentence)
    conjunctions = [token.text for token in doc if token.dep_ in ('cc', 'mark')]
    pattern = r'\b(' + '|'.join(conjunctions) + r')\b'
    parts = re.split(pattern, sentence)
    parts = [' '.join(parts[i:i + 2]).strip() for i in range(0, len(parts), 2)]
    clean_parts = [part.strip() for part in parts if part.strip()]
    return clean_parts, len(conjunctions) > 0
# Function to calculate similarities for each specific pair of parts
def calculate_specific_similarities(pairs):
    results = []
    for i, (part1, part2) in enumerate(pairs, 1):
        sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2 = negation_aware_similarity(part1, part2)
        final_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2)
        results.append((f'Pair {i}: "{part1}" vs. "{part2}"', sim_no_neg1, sim_no_neg2, sim_adjusted, final_score))
    return results
# Main function to decide similarity calculation approach based on conjunction presence
def calculate_combined_similarity(sentence1, sentence2):
    parts1, has_conjunctions1 = split_using_conjunctions(sentence1)
    parts2, has_conjunctions2 = split_using_conjunctions(sentence2)
    if has_conjunctions1 or has_conjunctions2:
        desired_pairs = []
        for part1 in parts1:
            for part2 in parts2:
                desired_pairs.append((part1, part2))
        similarities = calculate_specific_similarities(desired_pairs)
        final_scores = [result[-1] for result in similarities]
        z_scores = zscore(final_scores)
        z_score_threshold = 1.28
        if max(z_scores) > z_score_threshold:
            combined_score = max(final_scores)
        else:
            combined_score = min(final_scores)
        print(f"Combined Similarity Score (Z-Score Based Decision): {combined_score}")
    else:
        sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2 = negation_aware_similarity(sentence1, sentence2)
        combined_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2)
        print(f"Combined Similarity Score (Standard Approach): {combined_score}")
    return combined_score
# Example sentences to test integration
sentence1 = "Virtual memory expands available memory space."
sentence2 = "Virtual memory does not expand on available memory space"
# Calculate combined similarity score based on conjunction presence
combined_similarity = calculate_combined_similarity(sentence1, sentence2)
# Save the trained Siamese model
torch.save(model.state_dict(), "siamese_model.pth")
print("Model saved successfully as 'siamese_model.pth'")



In [None]:
import torch
import torch.nn as nn
import re
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from sentence_transformers import SentenceTransformer
import gensim.downloader as api
import spacy
from nltk.corpus import sentiwordnet as swn
from nltk.sentiment import SentimentIntensityAnalyzer
from scipy.stats import zscore

# Load SpaCy model for NER and syntactic parsing
nlp = spacy.load("en_core_web_sm")

# Load Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# Extract negative sentiment words from SentiWordNet
negative_sentiment_words = set()
for synset in list(swn.all_senti_synsets()):
    if synset.neg_score() > 0.5:  # Threshold for significant negativity
        for lemma in synset.synset.lemmas():
            negative_sentiment_words.add(lemma.name())

# Function to handle negations using SentiWordNet's negative sentiment words
def handle_negations(text):
    words = text.lower().split()
    new_words = []
    i = 0
    while i < len(words):
        if words[i] in negative_sentiment_words and i + 1 < len(words):
            i += 1
        else:
            new_words.append(words[i])
        i += 1
    return ' '.join(new_words)

# Function to count negation words using SentiWordNet's negative sentiment words
def count_negations(text):
    words = text.lower().split()
    return sum(1 for word in words if word in negative_sentiment_words)

# Define the Siamese network model class
class SiameseNetworkWithAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(SiameseNetworkWithAttention, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim * 2, num_heads=4, batch_first=True)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.fc1 = nn.Linear(hidden_dim * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()

    def forward_one(self, x):
        lstm_out, _ = self.lstm(x.unsqueeze(1))
        lstm_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        lstm_out = lstm_out[:, -1, :]
        lstm_out = self.layer_norm(lstm_out)
        x = self.relu(self.fc1(lstm_out))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        return x

    def forward(self, x1, x2):
        output1 = self.forward_one(x1)
        output2 = self.forward_one(x2)
        distance = torch.abs(output1 - output2)
        score = self.fc3(distance)
        return torch.sigmoid(score)  # Constrain output between 0 and 1

# Load your pre-trained Siamese model
device = torch.device('cpu')
input_dim = 2604  # Assuming combined input dimension from the embeddings
hidden_dim = 128

# Instantiate the model and load the weights
model = SiameseNetworkWithAttention(input_dim, hidden_dim).to(device)
model.load_state_dict(torch.load("/content/pretrained_siamese_model.pth", map_location=device))
model.eval()  # Set the model to evaluation mode

# Load pre-trained models
tokenizer_negbert = BertTokenizer.from_pretrained('textattack/bert-base-uncased-snli')
model_negbert = BertModel.from_pretrained('textattack/bert-base-uncased-snli')

tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
model_roberta = RobertaModel.from_pretrained('roberta-base')

model_sbert = SentenceTransformer('all-mpnet-base-v2')
model_w2v = api.load('word2vec-google-news-300')

# Function to get NegBERT embeddings
def get_negbert_embeddings_batch(sentences):
    inputs = tokenizer_negbert(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_negbert(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Function to get RoBERTa embeddings
def get_roberta_embeddings_batch(sentences):
    inputs = tokenizer_roberta(sentences, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model_roberta(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

# Function to get SBERT embeddings
def get_sbert_embeddings_batch(sentences):
    embeddings = model_sbert.encode(sentences, batch_size=len(sentences))
    return torch.tensor(embeddings)

# Function to get Word2Vec embeddings
def get_word2vec_embeddings_batch(sentences):
    word_vectors_batch = []
    for sentence in sentences:
        words = sentence.lower().split()
        word_vectors = [model_w2v[word] for word in words if word in model_w2v]
        if len(word_vectors) > 0:
            word_vectors_batch.append(torch.tensor(np.mean(word_vectors, axis=0)))
        else:
            word_vectors_batch.append(torch.zeros(300))  # Word2Vec has 300 dimensions
    return torch.stack(word_vectors_batch)

# Function to prepare embeddings for similarity calculation
def prepare_embeddings_batch(sentences1, sentences2):
    negbert_emb1 = get_negbert_embeddings_batch(sentences1)
    roberta_emb1 = get_roberta_embeddings_batch(sentences1)
    sbert_emb1 = get_sbert_embeddings_batch(sentences1)
    w2v_emb1 = get_word2vec_embeddings_batch(sentences1)

    negbert_emb2 = get_negbert_embeddings_batch(sentences2)
    roberta_emb2 = get_roberta_embeddings_batch(sentences2)
    sbert_emb2 = get_sbert_embeddings_batch(sentences2)
    w2v_emb2 = get_word2vec_embeddings_batch(sentences2)

    emb1 = torch.cat([negbert_emb1, roberta_emb1, sbert_emb1, w2v_emb1], dim=-1)
    emb2 = torch.cat([negbert_emb2, roberta_emb2, sbert_emb2, w2v_emb2], dim=-1)

    return emb1, emb2

# Function to calculate similarity using the pre-trained Siamese network
def siamese_network_similarity(sentence1, sentence2):
    emb1, emb2 = prepare_embeddings_batch([sentence1], [sentence2])
    emb1, emb2 = emb1.to(device), emb2.to(device)

    with torch.no_grad():
        output = model(emb1, emb2).squeeze()
    return output.item()

# Refactored negation-aware similarity using the pre-trained Siamese Network similarity
def negation_aware_similarity(doc1, doc2):
    modified_doc1 = handle_negations(doc1)
    modified_doc2 = handle_negations(doc2)

    similarity_no_neg1 = siamese_network_similarity(modified_doc1, doc2)
    similarity_no_neg2 = siamese_network_similarity(doc1, modified_doc2)
    similarity_no_neg_both = siamese_network_similarity(modified_doc1, modified_doc2)

    negation_count1 = count_negations(doc1)
    negation_count2 = count_negations(doc2)

    if negation_count1 == negation_count2:
        similarity_adjusted = similarity_no_neg_both
    else:
        similarity_adjusted = similarity_no_neg_both * 0.7  # Lower score if negation counts differ

    return similarity_no_neg1, similarity_no_neg2, similarity_adjusted, negation_count1, negation_count2

# Final similarity score calculation
def final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2):
    negation_even_odd_match = (neg_count1 % 2 == neg_count2 % 2)
    if negation_even_odd_match:
        combined_score = max(sim_no_neg1, sim_no_neg2, sim_adjusted)
    else:
        combined_score = min(sim_no_neg1, sim_no_neg2, sim_adjusted)
    return combined_score
# Function to split sentences using conjunctions
def split_using_conjunctions(sentence):
    doc = nlp(sentence)
    conjunctions = [token.text for token in doc if token.dep_ in ('cc', 'mark')]
    pattern = r'\b(' + '|'.join(conjunctions) + r')\b'
    parts = re.split(pattern, sentence)
    parts = [' '.join(parts[i:i + 2]).strip() for i in range(0, len(parts), 2)]
    clean_parts = [part.strip() for part in parts if part.strip()]
    return clean_parts, len(conjunctions) > 0
# Function to calculate similarities for each specific pair of parts
def calculate_specific_similarities(pairs):
    results = []
    for i, (part1, part2) in enumerate(pairs, 1):
        sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2 = negation_aware_similarity(part1, part2)
        final_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2)
        results.append((f'Pair {i}: "{part1}" vs. "{part2}"', sim_no_neg1, sim_no_neg2, sim_adjusted, final_score))
    return results
# Main function to decide similarity calculation approach based on conjunction presence
def calculate_combined_similarity(sentence1, sentence2):
    parts1, has_conjunctions1 = split_using_conjunctions(sentence1)
    parts2, has_conjunctions2 = split_using_conjunctions(sentence2)
    if has_conjunctions1 or has_conjunctions2:
        desired_pairs = []
        for part1 in parts1:
            for part2 in parts2:
                desired_pairs.append((part1, part2))
        similarities = calculate_specific_similarities(desired_pairs)
        final_scores = [result[-1] for result in similarities]
        z_scores = zscore(final_scores)
        z_score_threshold = 1.28
        if max(z_scores) > z_score_threshold:
            combined_score = max(final_scores)
        else:
            combined_score = min(final_scores)
        print(f"Combined Similarity Score (Z-Score Based Decision): {combined_score}")
    else:
        sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2 = negation_aware_similarity(sentence1, sentence2)
        combined_score = final_similarity_score(sim_no_neg1, sim_no_neg2, sim_adjusted, neg_count1, neg_count2)
        print(f"Combined Similarity Score (Standard Approach): {combined_score}")
    return combined_score
# Example sentences to test integration
sentence1 = "Virtual memory expands available memory space."
sentence2 = "Virtual memory does not expand on available memory space"
# Calculate combined similarity score based on conjunction presence
combined_similarity = calculate_combined_similarity(sentence1, sentence2)


  model.load_state_dict(torch.load("/content/pretrained_siamese_model.pth", map_location=device))
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Combined Similarity Score (Standard Approach): 0.37986738085746763
