In [None]:
# Install the notebook's third-party dependencies into the running kernel's environment.
# NOTE(review): versions are unpinned, and `nltk` / `langchain_text_splitters` (both imported
# in the next cell) are not listed explicitly - presumably preinstalled or pulled in
# transitively; verify on a fresh environment.
%pip install langchain openai sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transform

In [113]:
import os
import requests
import langchain
import openai
from langchain_text_splitters import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F


# Keep an OPENAI_API_KEY that is already exported in the environment; only fall
# back to an empty placeholder so the variable always exists. Do not paste a
# real key into the notebook - export it in the environment instead.
os.environ.setdefault("OPENAI_API_KEY", "")

import nltk

import warnings
import logging
# Ignore all warnings: installs a blanket "ignore" filter for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings from every library -
# consider narrowing it to the specific categories or modules that are noisy.
warnings.filterwarnings("ignore")

In [None]:
def answer_splitter(answers: str)-> list[dict]:
  """Split a newline-separated block of answers into one record per line.

  The splitting itself is delegated to a `CharacterTextSplitter` configured
  with "\n" as separator, a chunk size of 1 and no overlap.
  NOTE(review): the chunk size of 1 presumably stops the splitter from merging
  neighbouring lines into one chunk - confirm against the langchain docs.

  Args:
    answers: The raw answer text, one answer point per line.

  Returns:
    A list of dicts, one per chunk returned by the splitter, with keys
    "answer_line_len" (character count of the chunk), "answer_line_text"
    (the chunk itself) and "answer_embedding" (always None here; a later
    step is expected to fill it in).
  """
  newline_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1,
    chunk_overlap=0,
    length_function=len,
  )

  # Build the per-line records in a single pass over the splitter output
  return [
    {
      "answer_line_len": len(line),
      "answer_line_text": line,
      "answer_embedding": None,
    }
    for line in newline_splitter.split_text(answers)
  ]

In [None]:
def answer_splitter_py(answers: str)-> list[dict]:
  """Split a newline-separated block of answers using plain Python only.

  Each line is stripped of surrounding whitespace, and lines that are empty
  after stripping are skipped. Without this, the leading/trailing newline of a
  triple-quoted answer string would produce empty "answers" that get embedded
  and scored like real ones.

  Args:
    answers: The raw answer text, one answer point per line.

  Returns:
    A list of dicts, one per non-blank line, with keys "answer_line_len"
    (character count of the stripped line) and "answer_line_text" (the
    stripped line). An empty or all-blank input yields an empty list.
  """
  answer_stat = []
  for raw_line in answers.split("\n"):
    line = raw_line.strip()  # also removes a stray "\r" from Windows line endings
    if not line:
      continue
    answer_stat.append(
        {
            "answer_line_len": len(line),
            "answer_line_text": line,
        }
    )

  return answer_stat


In [119]:
# Sample marking scheme: one expected answer point per line.
# Alternatives within a point are separated by "/".
answers_scheme ='''
Moses had gone up the mountain to seek God’s guidance/instructions.
Aaron had been left in charge of the people.
Moses delayed in returning/the people became impatient.
Israelites asked Aaron to make them a god that would lead them.
Aaron yielded to the demands of the Israelites/made them a golden calf to worship.
Aaron built an altar at the foot of the mountain/put the calf.
The people worshiped the calf/made sacriﬁces to it.
God was angered set to destroy the Israelites.
Moses interceded on behalf of the people.
When Moses came down from the mountain he was angry with the people/broke the stone tablet on which the Ten Commandments were written.
Israelite’s were given a chance to choose between following Yahweh or golden calf.
Those who followed the golden calf were destroyed/killed
'''
# Sample student response to be marked against the scheme, one point per line.
answers_stud = '''
Moses had gone up Mt. Sinai to talk to God
His brother, Aaron had been left in charge of the people below the mountain
The people became impatient when Moses delayed
So the Israelites asked Aaron to make them a god that they could see and that would lead them
Aaron gave in to their demands and made them a golden calf to worship
God was angered and planned to destroy them
Moses interceded on behalf of the people
When Moses came down from the mountain, he was angry and broke the stone tablet that the commandements had been written on
They were told to choose between following Yahweh or golden calf
Those who chose to follow the golden calf were killed
'''
# Split both texts into per-line records (embeddings are not computed yet)
chunks_scheme = answer_splitter(answers_scheme)
chunks_stud = answer_splitter(answers_stud)

# Sanity check: report how many answer points each text was split into
print(f"Marking scheme: {len(chunks_scheme)}\n Student answers: {len(chunks_stud)}")



Marking scheme: 12
 Student answers: 10


In [67]:
# Loaded SentenceTransformer instances keyed by (model name, device), so that
# repeated embed() calls reuse a model instead of re-loading it every time.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name: str, device: str):
  """Return the cached model for (model_name, device), loading it on first use."""
  cache_key = (model_name, device)
  if cache_key not in _EMBEDDING_MODELS:
    _EMBEDDING_MODELS[cache_key] = SentenceTransformer(
      model_name_or_path = model_name,
      device = device)
  return _EMBEDDING_MODELS[cache_key]


def embed(chunks: list[dict], model_name: str = "all-mpnet-base-v2", device: str = "cpu")->list[dict]:
  """Create an embedding for each answer chunk and store it on the chunk.

  Args:
    chunks: Records carrying the text to embed under "answer_line_text".
    model_name: Name or path passed to SentenceTransformer.
    device: Device passed to SentenceTransformer.

  Returns:
    The same list object that was passed in. Each dict is updated in place
    with an "answer_embedding" entry holding that chunk's row of the
    model's `encode(..., convert_to_tensor=True)` output.
  """
  # Nothing to encode: skip loading the model altogether
  if not chunks:
    return chunks

  sentences = [chunk.get("answer_line_text") for chunk in chunks]

  # One batched encode call; rows come back in the same order as `sentences`
  embeddings = _get_embedding_model(model_name, device).encode(sentences, convert_to_tensor=True)

  for chunk, embedding in zip(chunks, embeddings):
    chunk.update({"answer_embedding":embedding})

  return chunks

In [74]:
def cos_sim(tens1, tens2):
  """Cosine similarity between two embedding tensors along the feature axis.

  Args:
    tens1: Tensor of embeddings; a single 1-D vector or a 2-D batch of rows.
    tens2: Tensor of embeddings; a single 1-D vector or a 2-D batch of rows.

  Returns:
    The result of `F.cosine_similarity(..., dim=1)`: one similarity value per
    row after the two inputs are broadcast against each other.
  """
  # Promote bare 1-D vectors to a single-row batch so that dim=1 always
  # exists; two 1-D inputs would otherwise raise with dim=1.
  if tens1.dim() == 1:
    tens1 = tens1.unsqueeze(0)
  if tens2.dim() == 1:
    tens2 = tens2.unsqueeze(0)
  return F.cosine_similarity(tens1, tens2, dim=1)

In [114]:
def sem_search_points(
    scheme_chunks: list[dict],
    student_chunks: list[dict],
    threshold: float = 0.8)-> int:
  """Count how many marking-scheme points the student's answers cover.

  Semantic search for questions with 'points' type answers. Every student
  line is compared with every scheme line via `cos_sim`; pairs whose
  similarity is above `threshold` are candidate matches. Candidates are then
  awarded one-to-one, highest similarity first, so each student line earns at
  most one mark and each scheme point is credited at most once. The result
  can therefore never exceed the number of scheme points or student lines.

  Args:
    scheme_chunks: Marking-scheme records with "answer_embedding" filled in.
    student_chunks: Student records with "answer_embedding" filled in.
    threshold: Similarity a pair must exceed to count as a match.

  Returns:
    The number of awarded matches. Each awarded pair is also printed as
    "student_answer<i>_vs_scheme_answer<j>:<similarity>".

  Raises:
    ValueError: If any chunk has no embedding yet.
  """
  #extract the embeddings from the chunks
  scheme_embeddings = [chunk.get("answer_embedding") for chunk in scheme_chunks]
  student_embeddings = [chunk.get("answer_embedding") for chunk in student_chunks]

  if any(emb is None for emb in scheme_embeddings + student_embeddings):
    raise ValueError("every chunk needs an 'answer_embedding'; embed the chunks first")

  # Collect every pair above the threshold as (score, student idx, scheme idx, raw similarity)
  candidates = []
  for i, student_embedding in enumerate(student_embeddings):
    student_row = student_embedding.reshape(1, -1)
    for j, scheme_embedding in enumerate(scheme_embeddings):
      sim = cos_sim(student_row, scheme_embedding)
      score = float(sim)
      if score > threshold:
        candidates.append((score, i, j, sim))

  # Best matches first; the sort is stable, so ties keep student/scheme order
  candidates.sort(key=lambda candidate: candidate[0], reverse=True)

  used_students = set()
  used_scheme = set()
  awarded = []
  for _, i, j, sim in candidates:
    if i in used_students or j in used_scheme:
      continue
    used_students.add(i)
    used_scheme.add(j)
    awarded.append((i, j, sim))

  # Report the awarded pairs in student-answer order
  for i, j, sim in sorted(awarded, key=lambda match: (match[0], match[1])):
    print(f"student_answer{i+1}_vs_scheme_answer{j+1}:{sim}")

  return len(awarded)


In [115]:
# Full pipeline on the sample data: split -> embed -> score. Both texts are
# re-split here rather than reusing chunks_scheme / chunks_stud, and the bare
# call makes the returned count the cell's displayed output.
sem_search_points(embed(answer_splitter(answers_scheme)), embed(answer_splitter(answers_stud)))



student_answer1_vs_scheme_answer1:tensor([0.8861])
student_answer2_vs_scheme_answer2:tensor([0.9037])
student_answer3_vs_scheme_answer9:tensor([1.])
student_answer5_vs_scheme_answer7:tensor([0.8486])
student_answer6_vs_scheme_answer4:tensor([0.9245])


5