In [1]:
import os

import pandas as pd

# Location of the SciQ CSV. The default is the path this notebook was written
# against; set the SCIQ_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
SCIQ_CSV_PATH = os.environ.get(
    "SCIQ_CSV_PATH", r"D:\Career\FYP\Dataset\SciQ Updated.csv"
)

# Load SciQ dataset
df = pd.read_csv(SCIQ_CSV_PATH)

# Fail fast with a readable message: the preprocessing cell reads
# df["correct_answer"], so a wrong or re-exported file should be caught here.
if "correct_answer" not in df.columns:
    raise KeyError(
        f"'correct_answer' column not found in {SCIQ_CSV_PATH!r}; "
        f"columns present: {list(df.columns)}"
    )

print("Dataset loaded:", df.shape)
df.head()

Dataset loaded: (13613, 2)


Unnamed: 0,question,correct_answer
0,What type of organism is commonly used in prep...,mesophilic organisms
1,What phenomenon makes global winds blow northe...,coriolis effect
2,Changes from a less-ordered state to a more-or...,exothermic
3,What is the least dangerous radioactive decay?,alpha decay
4,Kilauea in hawaii is the world’s most continuo...,smoke and ash


In [2]:
import re
import spacy
from textblob import TextBlob

# spaCy English pipeline ("en_core_web_sm"); preprocess() below calls it to get
# per-token lemmas and stop-word flags.
nlp = spacy.load(name="en_core_web_sm")

def preprocess(text):
    """
    Cleans and normalizes text for semantic grading.

    Steps, in order: lowercase, delete every character that is not a-z or
    whitespace, collapse whitespace, TextBlob spell correction (best effort),
    then spaCy lemmatization with stop words dropped.

    Args:
        text: Raw answer text. ``None`` and float NaN (what pandas yields for
            a missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The lemmas of the remaining non-stop-word tokens joined by single
        spaces, or ``""`` when nothing is left after cleaning.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself).
    # Without this guard a single empty cell makes .lower() raise AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Lowercase
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # Normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Nothing survived cleaning: the result is "" either way, so skip the
    # comparatively expensive spell-check and spaCy calls.
    if not text:
        return ""

    # Light spell correction. Still best effort (a failure keeps the
    # uncorrected text), but the failure is reported instead of hidden.
    try:
        text = str(TextBlob(text).correct())
    except Exception as exc:
        import warnings

        # The message deliberately omits the input text so that a persistent
        # failure is de-duplicated by the default warnings filter rather than
        # printed once per row.
        warnings.warn(
            f"TextBlob spell correction failed ({type(exc).__name__}: {exc}); "
            "using uncorrected text.",
            RuntimeWarning,
            stacklevel=2,
        )

    # Lemmatization + stopword removal
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc if not token.is_stop]

    return ' '.join(lemmas)

In [None]:
# Run every reference answer through preprocess() and keep the cleaned text
# in a new column next to the raw answer.
raw_answers = df["correct_answer"]
df["clean_key_answer"] = raw_answers.map(preprocess)

print("Preprocessing completed.")
# Side-by-side preview of raw vs. cleaned answers.
df.loc[:, ["correct_answer", "clean_key_answer"]].head()

In [None]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence encoder from the "all-MiniLM-L6-v2" checkpoint.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
print("SBERT model loaded.")

In [None]:
# Embed the cleaned key answers in one call; convert_to_tensor=True asks
# encode() for a tensor result.
key_embeddings = model.encode(df["clean_key_answer"].to_list(), convert_to_tensor=True)

print("Key embeddings generated.")
print("Shape:", key_embeddings.shape)


In [None]:
import os
import pickle

# Output directory for the saved artifacts; exist_ok=True makes re-running
# this cell safe when the directory is already there.
os.makedirs("models", exist_ok=True)

# Pickle the key-answer embeddings to disk.
with open("models/key_embeddings.pkl", "wb") as embeddings_file:
    pickle.Pickler(embeddings_file).dump(key_embeddings)

# Pickle the DataFrame as it currently stands.
df.to_pickle("models/sciq_processed.pkl")

print("Phase 1 artifacts saved successfully.")