<a href="https://colab.research.google.com/github/Nukaraju2003/similarity-detection-techniques-for-question-and-answer-grading/blob/main/similarity_detection_techniques_for_question_and_answer_grading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Calculating the cosine similarity between two text documents using the sklearn library


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# The two short documents whose bag-of-words vectors will be compared
doc1 = "The dog was lazy"
doc2 = "The lazy dog"

# Learn the vocabulary from both documents and turn each one into a
# row of term counts (one row per document, in the order given)
count_matrix = CountVectorizer().fit_transform([doc1, doc2])
first_vector = count_matrix[0]
second_vector = count_matrix[1]

# cosine_similarity returns a 1x1 matrix for a single pair of rows;
# pull out the scalar score
pairwise_scores = cosine_similarity(first_vector, second_vector)
cosine_sim = pairwise_scores[0][0]

print(cosine_sim)


0.8660254037844388


#### Calculating the Jaccard similarity between two text documents

In [None]:
doc1 = "The dog was lazy"
doc2 = "The lazy dog"

# Lower-case each document and reduce it to its set of distinct words
set1, set2 = (set(text.lower().split()) for text in (doc1, doc2))

# Jaccard similarity = |shared words| / |all distinct words|
shared_words = set1 & set2
all_words = set1 | set2
jaccard_sim = len(shared_words) / len(all_words)

print(jaccard_sim)

0.75


### Python code for calculating the Levenshtein distance between two strings and their similarity:

In [None]:
def levenshtein_distance(str1, str2):
    """Return the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``str1`` into ``str2``.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The edit distance as a non-negative integer.
    """
    rows = len(str1) + 1
    cols = len(str2) + 1

    # table[r][c] holds the distance between str1[:r] and str2[:c]
    table = [[0] * cols for _ in range(rows)]

    # Turning a prefix into the empty string (or back) costs its length
    for r in range(rows):
        table[r][0] = r
    for c in range(cols):
        table[0][c] = c

    for r, left_char in enumerate(str1, start=1):
        for c, right_char in enumerate(str2, start=1):
            if left_char == right_char:
                # Matching characters add no cost
                table[r][c] = table[r - 1][c - 1]
            else:
                # Cheapest of deletion, insertion, substitution, plus one edit
                table[r][c] = 1 + min(
                    table[r - 1][c],
                    table[r][c - 1],
                    table[r - 1][c - 1],
                )

    return table[-1][-1]

def similarity(str1, str2):
    """Convert the Levenshtein distance of two strings into a similarity score.

    The score is ``1 / (1 + distance)``: identical strings score 1.0 and the
    score shrinks towards 0 as more edits are required.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in the interval (0, 1].
    """
    return 1 / (1 + levenshtein_distance(str1, str2))

# Example usage
str1 = "The dog was lazy"
str2 = "The lazy dog"
sim = similarity(str1, str2)
print("Similarity between '{}' and '{}' is: {:.2f}".format(str1, str2, sim))


Similarity between 'The dog was lazy' and 'The lazy dog' is: 0.08


In [None]:
#!pip install torch torchvision
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m86.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


### BERT embeddings for question-and-answer grading similarity detection using the Hugging Face Transformers library:

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example question and answer
question = "What is the capital of France?"
answer = "The capital of France is paris."

# Tokenize the question and the answer as two SEPARATE sequences in one batch.
# Encoding them together as a single (question, answer) pair yields only one
# [CLS] vector, which leaves nothing to compare it against.
# padding=True pads the shorter sequence; the attention mask returned by the
# tokenizer is passed to the model below via **inputs.
inputs = tokenizer([question, answer], return_tensors='pt', padding=True, truncation=True)

# Obtain the [CLS] embedding of each sequence: row 0 is the question,
# row 1 is the answer
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]

# Calculate the cosine similarity between the question and answer embeddings.
# Comparing embeddings[0] with itself would always give ~1.0.
question_embedding = embeddings[0].unsqueeze(0).numpy()
answer_embedding = embeddings[1].unsqueeze(0).numpy()
similarity = cosine_similarity(question_embedding, answer_embedding)

# Print the similarity score
print(f"Similarity score: {similarity[0][0]}")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Similarity score: 0.9999999403953552


### Siamese network for sentence similarity using TensorFlow

In [None]:
#import keras
#print(keras.__version__)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Concatenate, Dense, Lambda
from tensorflow.keras.models import Model

#Define toy dataset
texts = ['this is the first sentence', 'this is the second sentence']

# Fit tokenizer on toy dataset
tokenizer = Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(texts)

# The Keras Tokenizer numbers words starting at 1, so index 0 is never assigned
# to a word (pad_sequences uses 0 as its default padding value). The embedding
# table therefore needs len(word_index) + 1 rows. No OOV token is configured.
vocab_size = len(tokenizer.word_index) + 1

# Define input layers: two variable-length sequences of word indices
input1 = Input(shape=(None,))
input2 = Input(shape=(None,))

embedding_dim = 100

# Define shared embedding layer
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

lstm_units = 64

# Define shared LSTM layer.
# return_sequences=False makes the encoder emit ONE fixed-size vector per
# sentence. With return_sequences=True the final Dense layer is applied per
# time step and the model outputs shape (batch, timesteps, 1) instead of a
# single score per sentence pair.
lstm_layer = Bidirectional(LSTM(units=lstm_units, return_sequences=False))

# Pass inputs through shared layers (same weights for both branches)
encoded1 = lstm_layer(embedding_layer(input1))
encoded2 = lstm_layer(embedding_layer(input2))

# Concatenate the two sentence vectors along the feature axis
merged = Concatenate(axis=-1)([encoded1, encoded2])

# Calculate similarity score: one sigmoid unit -> shape (batch, 1)
similarity = Dense(units=1, activation='sigmoid')(merged)

print(similarity)

# Build model
# NOTE: the model is only compiled here, never fitted, so any prediction made
# with it comes from randomly initialised weights.
model = Model(inputs=[input1, input2], outputs=similarity)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


KerasTensor(type_spec=TensorSpec(shape=(None, None, 1), dtype=tf.float32, name=None), name='dense/Sigmoid:0', description="created by layer 'dense'")


In [None]:
# Toy vocabulary: each word is mapped to its position in the list
word_index = {word: position for position, word in enumerate(["apple", "banana", "orange"])}

# The vocabulary size is simply the number of distinct words
vocab_size = len(word_index)
print(vocab_size)

3


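The `<OOV>` token stands for "out-of-vocabulary". It represents words that are not present in the tokenizer's vocabulary: when a tokenizer that has been configured with an OOV token encounters such a word during text encoding, it replaces the word with the `<OOV>` token. This is often used to handle words that were not seen in the training data but may appear in the test data or in real-world data. Note that the Keras `Tokenizer` only does this when it is created with the `oov_token` argument (for example `Tokenizer(oov_token="<OOV>")`); without it, unknown words are silently dropped. The `+ 1` in `vocab_size` accounts for index 0, which `word_index` never assigns to a word, rather than for an OOV token.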

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

#Define toy dataset
texts = ['this is the first sentence', 'this is the second sentence']

# Fit tokenizer on toy dataset
tokenizer = Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(texts)

# word_index numbers words starting at 1 (see the printed mapping), so index 0
# is never assigned to a word; the + 1 accounts for that unused index.
# No OOV token is configured on this tokenizer.
vocab_size = len(tokenizer.word_index) + 1

# A bare `vocab_size` expression is only displayed when it is the last line of
# a cell, so print it explicitly alongside the mapping.
print(vocab_size)
print(tokenizer.word_index)

{'this': 1, 'is': 2, 'the': 3, 'sentence': 4, 'first': 5, 'second': 6}


In [None]:
# Define input data
test_text1 = ['this is a sentence']
test_text2 = ['this is another sentence']

# Convert text data to sequences
test_seq1 = tokenizer.texts_to_sequences(test_text1)
test_seq2 = tokenizer.texts_to_sequences(test_text2)

# Pad sequences to ensure equal length
test_seq1_padded = tf.keras.preprocessing.sequence.pad_sequences(test_seq1, padding='post')
test_seq2_padded = tf.keras.preprocessing.sequence.pad_sequences(test_seq2, padding='post')

# Get similarity score
similarity_score = model.predict([test_seq1_padded, test_seq2_padded])[0][0]

print(similarity_score)


[0.5000158]


### Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks

In [None]:
pip install -U sentence-transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the Sentence-BERT model once. Loading 'bert-base-nli-mean-tokens' and
# encoding with it first is wasted work: both the model and its embeddings
# would be overwritten below before anything reads them.
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

#Calculate cosine similarity between two sentences
sentence1 = 'This is the first sentence.'
sentence2 = 'This is the second sentence.'

# encode() returns one embedding per input sentence, in input order
sentence_embeddings = model.encode([sentence1, sentence2])

# pytorch_cos_sim returns a 1x1 tensor holding the cosine similarity
cosine_similarity = util.pytorch_cos_sim(sentence_embeddings[0], sentence_embeddings[1])

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading (…)7f4ef/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f279f7f4ef/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

Downloading (…)79f7f4ef/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)279f7f4ef/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)7f4ef/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading (…)279f7f4ef/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)9f7f4ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained Sentence-BERT (siamese) paraphrase model
model = SentenceTransformer('paraphrase-distilroberta-base-v1')

# The two sentences to compare
sentence1 = 'This is the first sentence.'
sentence2 = 'This is the second sentence.'

# Encode both sentences in one call; the result has one embedding per sentence
sentence_embeddings = model.encode([sentence1, sentence2])
first_embedding, second_embedding = sentence_embeddings

# Cosine similarity of the two embeddings (a 1x1 tensor)
cosine_similarity = util.pytorch_cos_sim(first_embedding, second_embedding)

# Report the score rounded to two decimals
report_template = 'Cosine similarity between sentence 1 and sentence 2: {:.2f}'
print(report_template.format(cosine_similarity.item()))

Cosine similarity between sentence 1 and sentence 2: 0.89
