## Comprehensive Answer Evaluation Tool - NLP Based

The code aims to evaluate a question dataset, given that answers are provided, and a reference text is available. The evaluation is based on three parts: a keyword generator/counter, accounting for 20% of the overall marks, 20% for grammar, and the rest on semantic

### Imports for NLP

In [176]:
import json
import nltk
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
from PyPDF2 import PdfReader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sentence_transformers import SentenceTransformer

### Basic NLP Tasks
1. Tokenization
2. Stopword removal
3. N-grams (unigrams, bigrams trigrams)

In [177]:
STOPWORDS = {
    "the","is","in","and","to","of","a","an","for","on","with","as",
    "by","this","that","it","from","are","was","were","be","has","had",
    "have","at","or","but","not","which","their","its","also"
}


# PDF Processing
pdf_path = 'text.pdf'
reader = PdfReader(pdf_path)

student_text = ""
for page in reader.pages:
    student_text += page.extract_text() + " "

student_text = student_text.lower()

# JSON Processing
json_path = 'history_eval_dataset_v2.json'

with open(json_path, "r", encoding="utf-8") as f:
    qa = json.load(f)

reference_text = " ".join(
    item["reference"] for item in qa if "reference" in item
).lower()

# Tokenization
tokenizer = RegexpTokenizer(r'\b[a-zA-Z]+\b')
stop_words = STOPWORDS

def preprocess(text):
    tokens = tokenizer.tokenize(text)
    return [w for w in tokens if w not in stop_words]

student_tokens = preprocess(student_text)
reference_tokens = preprocess(reference_text)

# N-grams
def get_ngrams(tokens, n):
    return Counter(ngrams(tokens, n))

student_uni = Counter(student_tokens)
student_bi = get_ngrams(student_tokens, 2)
student_tri = get_ngrams(student_tokens, 3)

ref_uni = Counter(reference_tokens)
ref_bi = get_ngrams(reference_tokens, 2)
ref_tri = get_ngrams(reference_tokens, 3)

# Keywords Generation
def common_terms(c1, c2):
    common = {}
    for k in c1:
        if k in c2:
            common[k] = min(c1[k], c2[k])
    return Counter(common)

common_uni = common_terms(student_uni, ref_uni)
common_bi = common_terms(student_bi, ref_bi)
common_tri = common_terms(student_tri, ref_tri)

uni = []
bi = []
tri = []
u_uni = set()
u_bi = set()
u_tri = set()

for w, c in common_uni.most_common():
    uni.append(w)
for w, c in common_bi.most_common():
    bi.append(w)
for w, c in common_tri.most_common():
    tri.append(w)

for tri_g in tri:
    tri_set = set(tri_g)

    # Unigrams in trigrams
    for u in uni:
        if u in tri_set:
            u_uni.add(u)

    # bigrams in trigram
    tri_bigrams = {(tri_g[0], tri_g[1]), (tri_g[1], tri_g[2])}

    for b in bi:
        if b in tri_bigrams:
            u_bi.add(b)

    # trigrams that have unigrams and bigrams
    if any(
        b in tri_bigrams for b in bi
    ):
        u_tri.add(tri_g)

u_uni = list(u_uni)
u_bi = list(u_bi)
u_tri = list(u_tri)


### Feature Vectorization
 Make vectors, which are count values of each unigram, bigram, trigram from the keyword list, as a segment of evaluation.

In [178]:
def vector(tokens, uni_keys, bi_keys, tri_keys):
    vec = []

    token_counts = Counter(tokens)
    bigram_counts = Counter(ngrams(tokens, 2))
    trigram_counts = Counter(ngrams(tokens, 3))

    for u in uni_keys:
        vec.append(token_counts.get(u, 0)*0.2)

    for b in bi_keys:
        vec.append(bigram_counts.get(b, 0)*0.3)

    for t in tri_keys:
        vec.append(trigram_counts.get(t, 0)*0.5)

    return vec

### Tokenization & feature vectorization

In [179]:
X_stu = []
X_ref = []
y_stu = []
for item in qa:
    tokens_stu = preprocess(item["student_answer"])
    X_stu.append(vector(tokens_stu, uni, bi, tri))
    tokens_ref = preprocess(item["reference"])
    X_ref.append(vector(tokens_ref, uni, bi, tri))
    y_stu.append(item["score"])


### Mark calculation based on keyword presence

In [180]:
x_key_stu = []
x_val_stu = []
x_key_ref = []
x_val_keyword = []

def feature_sum(feature_vec_stu, feature_vec_ref, x_key_stu, x_key_ref, x_val):


    for student_features in feature_vec_stu:
        total = sum(student_features)
        x_key_stu.append(total)

    for teach_features in feature_vec_ref:
        total = sum(teach_features)
        x_key_ref.append(total)

    if len(x_key_stu) == len(x_key_ref):
        for i in range(len(x_key_stu)):
            j = (x_key_stu[i]/x_key_ref[i])*2
            x_val.append(j)

feature_sum(X_stu, X_ref, x_key_stu, x_key_ref, x_val_keyword)

print(x_val_keyword)


[0.8, 0.0, 0.0, 1.5294117647058825, 0.0, 0.0, 0.0, 1.5555555555555556, 0.0, 1.6, 0.0, 0.8979591836734697, 0.08163265306122452, 0.5000000000000001, 0.25000000000000006, 0.8888888888888888, 0.4444444444444444, 1.2000000000000002, 0.0]


### Using Word2Vec and MLPRegressor for evaluating semantics

In [181]:
sentences = []
for i in student_tokens:
    sentences.append(i)


w2v = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1
)

def sentence_vector(tokens, model):
    vectors = []
    for word in tokens:
        if word in model.wv:
            vectors.append(model.wv[word])
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


def semantic_similarity(ref_text, stu_text, model):
    ref_tokens = preprocess(ref_text)
    stu_tokens = preprocess(stu_text)

    v_ref = sentence_vector(ref_tokens, model).reshape(1, -1)
    v_stu = sentence_vector(stu_tokens, model).reshape(1, -1)

    return cosine_similarity(v_ref, v_stu)[0][0]


X, y = [], []

for item in qa:
    sim = semantic_similarity(
        item["reference"],
        item["student_answer"],
        w2v
    )

    features = [
        sim,
        len(preprocess(item["student_answer"]))
    ]

    X.append(features)
    y.append(item["score"])
import numpy as np

X = np.array(X)
y = np.array(y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = MLPRegressor(
    hidden_layer_sizes=(16, 8),
    max_iter=5000,
    random_state=42


)

model.fit(X_train, y_train)

pred = model.predict(X_test)

print(model.score(X_test, y_test))

for p, a in zip(pred[:5], y_test[:5]):
    print(f"Predicted: {p:.2f} | Actual: {a}")


0.7684730158121744
Predicted: 4.78 | Actual: 4
Predicted: 4.09 | Actual: 4
Predicted: 4.09 | Actual: 4
Predicted: 2.27 | Actual: 2


### Predicting using a different dataset than the trained one.

In [182]:
with open("hist_100.json", "r", encoding="utf-8") as f:
    data = json.load(f)

sentences = []

for item in data:
    sentences.append(preprocess(item["reference"]))


w2v = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1
)

X, y = [], []

for item in data:
    sim = semantic_similarity(
        item["reference"],
        item["student_answer"],
        w2v
    )

    features = [
        sim,
        len(preprocess(item["student_answer"]))
    ]

    X.append(features)
    y.append(item["score"])

scaler = StandardScaler()
X_test = scaler.fit_transform(X)



pred = model.predict(X_test)
print(model.score(X_test, y))

for p, a in zip(pred[:5], y[:5]):
    print(f"Predicted: {p:.2f} | Actual: {a}")


0.6377301617344744
Predicted: 5.33 | Actual: 4
Predicted: 3.78 | Actual: 3
Predicted: 2.85 | Actual: 2
Predicted: 0.08 | Actual: 0
Predicted: 3.83 | Actual: 5


### Using Word2Vec and ridge regression

In [183]:
sentences = []

for item in qa:
    sentences.append(preprocess(item["reference"]))
    sentences.append(preprocess(item["student_answer"]))

w2v = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1
)

def sentence_vector(tokens, model):
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def semantic_similarity(ref_text, stu_text, model):
    ref_tokens = preprocess(ref_text)
    stu_tokens = preprocess(stu_text)

    v_ref = sentence_vector(ref_tokens, model).reshape(1, -1)
    v_stu = sentence_vector(stu_tokens, model).reshape(1, -1)

    return cosine_similarity(v_ref, v_stu)[0][0]

X, y = [], []

for item in qa:
    sim = semantic_similarity(
        item["reference"],
        item["student_answer"],
        w2v
    )

    len_ratio = len(preprocess(item["student_answer"])) / max(
        1, len(preprocess(item["reference"]))
    )

    X.append([sim, len_ratio])
    y.append(item["score"])

X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

pred = model.predict(X_test)
pred = np.clip(pred, 0, 5)

print("MAE:", mean_absolute_error(y_test, pred))

for p, a in zip(pred[:5], y_test[:5]):
    print(f"Predicted: {p:.2f} | Actual: {a}")


MAE: 0.30117919508534274
Predicted: 4.19 | Actual: 4
Predicted: 4.04 | Actual: 4
Predicted: 3.71 | Actual: 4
Predicted: 1.31 | Actual: 2


### Capturing Semantics using GloVE

In [184]:
def load_glove(path):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings


glove = load_glove("glove/glove.6B.100d.txt")
EMB_DIM = 100

def sentence_vector_glove(tokens, embeddings, dim=100):
    vectors = [embeddings[w] for w in tokens if w in embeddings]
    if len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

from sklearn.metrics.pairwise import cosine_similarity

def semantic_similarity_glove(ref_text, stu_text, embeddings):
    ref_tokens = preprocess(ref_text)
    stu_tokens = preprocess(stu_text)

    v_ref = sentence_vector_glove(ref_tokens, embeddings).reshape(1, -1)
    v_stu = sentence_vector_glove(stu_tokens, embeddings).reshape(1, -1)

    return cosine_similarity(v_ref, v_stu)[0][0]

X, y = [], []

for item in qa:
    sim = semantic_similarity_glove(
        item["reference"],
        item["student_answer"],
        glove
    )

    features = [
        sim,
        len(preprocess(item["student_answer"]))
    ]

    X.append(features)
    y.append(item["score"])

X = np.array(X)
y = np.array(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

pred = model.predict(X_test)

print("R² score:", model.score(X_test, y_test))

for p, a in zip(pred[:5], y_test[:5]):
    print(f"Predicted: {p:.2f} | Actual: {a}")


R² score: 0.27775230967931475
Predicted: 4.68 | Actual: 4
Predicted: 2.89 | Actual: 4
Predicted: 4.40 | Actual: 4
Predicted: 1.43 | Actual: 2


### Evaluation Based purely on cosine similarity

In [188]:
def mark_from_similarity(similarity, max_marks=5):
    sim = max(0.0, min(1.0, similarity))  # clamp
    return round(sim * max_marks, 2)

for item in qa:
    sim = semantic_similarity_glove(
        item["reference"],
        item["student_answer"],
        glove
    )

    marks = mark_from_similarity(sim, max_marks=5)

    print("Q:", item["question"])
    print("A:", item["reference"])
    print("A:", item["student_answer"])
    print("Similarity:", round(sim, 3))
    print("Marks:", marks)
    print("-")



Q: What were the main causes of the French Revolution?
A: The French Revolution was caused by social inequality, financial crisis, heavy taxation, and the absolute monarchy of France.
A: The revolution happened because of inequality and financial problems in France.
Similarity: 0.913
Marks: 4.57
-
Q: What were the main causes of the French Revolution?
A: The French Revolution was caused by social inequality, financial crisis, heavy taxation, and the absolute monarchy of France.
A: It happened because people were unhappy.
Similarity: 0.662
Marks: 3.31
-
Q: What were the main causes of the French Revolution?
A: The French Revolution was caused by social inequality, financial crisis, heavy taxation, and the absolute monarchy of France.
A: It started because of wars with other countries.
Similarity: 0.769
Marks: 3.84
-
Q: What is meant by social inequality in France before 1789?
A: Social inequality referred to unequal privileges enjoyed by the clergy and nobility, while the Third Estate b

In [189]:
sbert = SentenceTransformer("all-MiniLM-L6-v2")

for item in qa:
    tokens = preprocess(item["reference"])
    ref_uni |= set(tokens)
    ref_bi |= set(ngrams(tokens, 2))
    ref_tri |= set(ngrams(tokens, 3))

def semantic_similarity(ref, stu):
    emb = sbert.encode([ref, stu])
    return cosine_similarity([emb[0]], [emb[1]])[0][0]

def keyword_overlap(stu):
    tokens = preprocess(stu)
    u = set(tokens)
    b = set(ngrams(tokens, 2))
    t = set(ngrams(tokens, 3))

    uni = len(u & ref_uni) / max(len(ref_uni), 1)
    bi  = len(b & ref_bi)  / max(len(ref_bi), 1)
    tri = len(t & ref_tri) / max(len(ref_tri), 1)

    return 0.2 * uni + 0.3 * bi + 0.5 * tri

X, y = [], []

for item in qa:
    sim = semantic_similarity(
        item["reference"],
        item["student_answer"]
    )

    key = keyword_overlap(item["student_answer"])
    length = len(preprocess(item["student_answer"]))

    X.append([sim, key, length])
    y.append(item["score"])

X = np.array(X)
y = np.array(y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = Ridge(alpha=1.0)
model.fit(X_train, y_train)


pred = model.predict(X_test)

print("R² score:", round(model.score(X_test, y_test), 3))

for p, a in zip(pred[:5], y_test[:5]):
    print(f"Predicted: {p:.2f} | Actual: {a}")


R² score: 0.559
Predicted: 4.95 | Actual: 4
Predicted: 4.02 | Actual: 4
Predicted: 4.48 | Actual: 4
Predicted: 1.57 | Actual: 2
