In [11]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from transformers.utils import is_flash_attn_2_available
from peft import LoraConfig, get_peft_config, PeftModel
from sklearn.model_selection import train_test_split 
from peft import LoraConfig, get_peft_model
from Retrieval_system import Retrieval
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
import torch
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from pincone import Pincone_vectorStore
import warnings
from operator import itemgetter
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [8]:
# Pick the compute device for the embedding model and the retrieval helper.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select "cuda" even on machines without a GPU.
Device = "cuda" if torch.cuda.is_available() else "cpu"

# Project helper; the cell output suggests it connects to and returns a
# Pinecone index object -- TODO confirm against the `pincone` module.
index = Pincone_vectorStore()

# Sentence-embedding model used to embed queries for the vector store.
Embeddings = HuggingFaceEmbeddings(
    model_name="all-mpnet-base-v2",
    model_kwargs={"device": Device},
)
vector_store = PineconeVectorStore(embedding=Embeddings, index=index)

# Project-level retrieval wrapper; `get_retrieval()` is called on it later
# by evaluate_student_answer.
retrieval_obj = Retrieval(
    device=Device,
    index=index,
    Embeddings=Embeddings,
    vector_store=vector_store,
)

Successfully connect to <pinecone.db_data.index.Index object at 0x7f9fd2d549a0>


In [2]:
# Checkpoint to load (local directory) and the alternative hub id kept for reference.
MODEL_ID = "/home/shegun93/ITS/Fine-tunning/nairsV1"
#MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Use FlashAttention-2 only when the package is importable and GPU 0 reports a
# major compute capability of at least 8; otherwise fall back to SDPA.
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    ATTN_IMPLEMENTATION = "flash_attention_2"
else:
    ATTN_IMPLEMENTATION = "sdpa"

In [3]:
# Switch read by the model-loading cell: when False, the model is loaded
# without a quantization config.
use_quantization_config = True

# 4-bit bitsandbytes loading with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

In [4]:
# Tokenizer for the checkpoint; the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

print(f"[INFO] Using attention implementation: {ATTN_IMPLEMENTATION}")
print(f"[INFO] Using model_id: {MODEL_ID}")

# Causal LM in float16; quantization is applied only when the toggle is on,
# and device placement is delegated to `device_map="auto"`.
Model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    attn_implementation=ATTN_IMPLEMENTATION,
    quantization_config=(quantization_config if use_quantization_config else None),
    torch_dtype=torch.float16,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: /home/shegun93/ITS/Fine-tunning/nairsV1




In [9]:
def parse_mcq(doc_text):
    """Extract the question stem and lettered options from an MCQ document.

    Lines are stripped and blank lines ignored. A line starting with
    ``"Question:"`` supplies the question (a later such line overrides an
    earlier one). A line of the form ``"X.<text>"`` with ``X`` in A-D and
    at least one character after the dot supplies an option.

    Args:
        doc_text: Raw document text. ``None`` or an empty string is treated
            as a document with nothing to parse.

    Returns:
        A ``(question, options)`` tuple: ``question`` is a ``str`` or ``None``
        when no question line exists; ``options`` maps option letters to
        their text and is empty when no option lines exist.
    """
    question_prefix = "Question:"
    option_letters = ("A", "B", "C", "D")

    question = None
    options = {}

    for raw_line in (doc_text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(question_prefix):
            # Remove only the leading label; str.replace would also delete
            # any "Question:" that appears inside the stem itself.
            question = line[len(question_prefix):].strip()
        elif len(line) > 2 and line[1] == "." and line[0] in option_letters:
            options[line[0]] = line[2:].strip()

    return question, options

In [19]:
# Text-generation pipeline around the loaded model and tokenizer.
# Generation is capped at 150 new tokens and `return_full_text=False`
# asks the pipeline to return only the newly generated text.
pipe = pipeline(
    task="text-generation",
    model=Model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=150,
)

Device set to use cuda:0


In [20]:
# Wrap the transformers pipeline in the langchain_huggingface adapter;
# evaluate_student_answer generates text through `llm.invoke(...)`.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
def _extract_hint(full_response):
    """Return the text of the last ``<hint>`` section of a model response.

    When the response has no ``<hint>`` tag it is returned unchanged. A
    closing ``</hint>`` tag, and anything after it, is dropped.
    """
    if "<hint>" not in full_response:
        return full_response
    hint_section = full_response.split("<hint>")[-1]
    return hint_section.split("</hint>")[0].strip()


def evaluate_student_answer(query, user_answer=None):
    """Quiz the student on the top retrieved MCQ and return tutoring feedback.

    The top document retrieved for ``query`` is parsed into a question and
    options, which are printed. The student's choice is then sent to the
    LLM together with the document, and the hint portion of the response
    is returned.

    Args:
        query: Free-text query used to retrieve a question document.
        user_answer: Optional option letter. When ``None`` (the default)
            the answer is read interactively with ``input()``; passing it
            explicitly makes the call reproducible without stdin.

    Returns:
        str: The extracted hint (or the full model response when it has no
        ``<hint>`` tag), or a short message when nothing was retrieved, the
        document could not be parsed, or the answer is not a listed option.
    """
    retriever, _ = retrieval_obj.get_retrieval()

    # Prefer the Runnable-style `invoke`; `get_relevant_documents` is the
    # deprecated LangChain spelling and is kept only as a fallback.
    fetch_documents = getattr(retriever, "invoke", None) or retriever.get_relevant_documents
    docs = fetch_documents(query)
    if not docs:
        return "No relevant context found."

    doc = docs[0]
    question, options = parse_mcq(doc.page_content)
    if not question or not options:
        # Without a question and options every answer would be rejected as
        # invalid, so report the real problem instead of prompting.
        return "Retrieved context could not be parsed as a multiple-choice question."

    formatted_options = "\n".join(f"{k}. {v}" for k, v in options.items())
    print(f"\nQuestion: {question}")
    print(formatted_options)

    if user_answer is None:
        user_answer = input("\nEnter your answer (A/B/C/D): ")
    user_answer = str(user_answer).strip().upper()
    if user_answer not in options:
        return "Invalid selection. Please choose A, B, C, or D."

    # NOTE(review): the prompt hard-codes header/BOS special tokens; confirm
    # they match the chat format the fine-tuned checkpoint was trained on and
    # that the tokenizer does not prepend a second BOS token.
    # The prompt ends with an opening <diagnosis> tag so the model continues
    # from inside that section.
    prompt_text = (
        f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"You are a Pedagogical Expert. Your task is to diagnose student errors based ONLY on the provided CONTEXT.\n"
        f"RULES:\n"
        f"1. Compare the Student Answer to the CORRECT answer found in the CONTEXT.\n"
        f"2. Identify the specific misunderstanding (e.g., confusing two different proofs of relativity).\n"
        f"3. Provide a 'Socratic Hint' that points out a flaw in their logic without naming the correct option.\n"
        f"4. BE FACTUALLY ACCURATE. Do not invent details.<|eot_id|>"
        f"<|start_header_id|>user<|end_header_id|>\n\n"
        f"CONTEXT: {doc.page_content}\n"
        f"QUESTION: {question}\n"
        f"OPTIONS: {formatted_options}\n"
        f"STUDENT ANSWER: {user_answer}<|eot_id|>"
        f"<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"<diagnosis>"
    )
    full_response = llm.invoke(prompt_text)
    return _extract_hint(full_response)


In [31]:
# Sample run: retrieves a question for the query, asks for an answer on
# stdin, and prints whatever feedback string the function returns.
result = evaluate_student_answer("What is the Principle of Relativity")
print(result)


Question: What observable phenomenon confirmed Einstein's theory of general relativity in 1919?
A. Gravitational redshift
B. Time dilation in GPS satellites
C. Bending of starlight by the Sun
D. Black hole mergers
Gravitational redshift occurs when light escapes from a massive object's gravitational field and its frequency is lowered due to the increased distance from the source. This phenomenon was observed and confirmed in 1919 during a solar eclipse, when scientists measured the redshift of light passing near the Sun.


In [6]:
# USER = "Tell me the principle of relativity"
# Prompt = f"""[SYSTEM] You are an AI Tutor. Identify the student's misconception and provide a helpful hint.
# Q: {USER}"""

In [None]:
# Second sample run with a different topic.
# NOTE(review): the query text is misspelled ("Thermodynaics"); confirm whether
# that is intentional before relying on the retrieved result.
print(evaluate_student_answer("WHAT IS Thermodynaics"))

In [None]:

# Stand-alone prompt experiment: a plain "[SYSTEM]" instruction followed by
# the user's question on the next line.
USER = "Tell me the principle of relativity"
Prompt = (
    "[SYSTEM] You are an AI Tutor. Identify the student's misconception and provide a helpful hint.\n"
    f"Q: {USER}"
)

In [None]:
    # inputs = tokenizer(prompt_text, return_tensors="pt").to(Model.device)
    # with torch.no_grad():
    #     output_tokens = Model.generate(
    #         **inputs,
    #         max_new_tokens=150,    
    #         temperature=0.1,      
    #         repetition_penalty=1.2, 
    #         do_sample=True,
    #         eos_token_id=tokenizer.eos_token_id,
    #         pad_token_id=tokenizer.eos_token_id
    #     )
    # input_length = inputs.input_ids.shape[1]
    # generated_text = tokenizer.decode(output_tokens[0][input_length:], skip_special_tokens=True)
    # full_response = "<thinking>" + generated_text
    # clean_output = full_response.split("Q:")[0].split("Question:")[0].split("[INST]")[0].strip()
    # return clean_output