In [1]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModel
import torch

# Download necessary NLTK resources
nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

# Load SBERT model
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function to lowercase, remove special characters, stopwords, tokenization, and lemmatization
def preprocess_text(text):
    # 1. Lowercase the text
    text = text.lower()

    # 2. Remove special characters and digits (keeping only words)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Tokenize the text
    tokens = word_tokenize(text)

    # 4. Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # 5. Lemmatize the tokens
    lemmatized_text = " ".join([lemmatizer.lemmatize(token) for token in tokens])

    return lemmatized_text

# Semantic similarity function using SBERT
def semantic_similarity(correct, student):
    def encode_text(text):
        tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            embeddings = model(**tokens).last_hidden_state.mean(dim=1)
        return embeddings

    emb1 = encode_text(correct)
    emb2 = encode_text(student)
    return torch.nn.functional.cosine_similarity(emb1, emb2).item()

# Grading function
def grade_answer(correct, student, total_marks):
    # Preprocess both correct and student answers
    correct_processed = preprocess_text(correct)
    student_processed = preprocess_text(student)

    # Calculate semantic similarity
    semantic_score = semantic_similarity(correct_processed, student_processed)

    # Grade based on similarity
    grade = semantic_score * total_marks
    return grade, semantic_score

# Example usage
correct_answer = "To fast-track the development process, we use DevOps for the sake of Sir Azhar."
student_answer = "For fast development process, we use DevOps."
total_marks = 100

grade, score = grade_answer(correct_answer, student_answer, total_marks)
print(f"Semantic Similarity Score: {score:.2f}")
print(f"Grade: {grade:.2f}/{total_marks}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Semantic Similarity Score: 0.74
Grade: 74.36/100
