<a href="https://colab.research.google.com/github/Shirshadas24/Open-Source-Models-for-Student-Competence-Analysis-Task_3_python/blob/main/Task_3_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install transformers torch --quiet
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline

In [17]:

def load_codebert():
    """Load the CodeBERT tokenizer and base encoder.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with
        ``from_pretrained("microsoft/codebert-base")``.
    """
    checkpoint = "microsoft/codebert-base"
    print(" Loading CodeBERT (microsoft/codebert-base)...")
    codebert_tok = AutoTokenizer.from_pretrained(checkpoint)
    codebert_net = AutoModel.from_pretrained(checkpoint)
    return codebert_tok, codebert_net

def load_codegen():
    """Load the CodeGen tokenizer and causal language model.

    Returns:
        A ``(tokenizer, model)`` tuple, both created with
        ``from_pretrained("Salesforce/codegen-350M-mono")``.
    """
    checkpoint = "Salesforce/codegen-350M-mono"
    print(" Loading CodeGen (Salesforce/codegen-350M-mono)...")
    codegen_tok = AutoTokenizer.from_pretrained(checkpoint)
    codegen_net = AutoModelForCausalLM.from_pretrained(checkpoint)
    return codegen_tok, codegen_net

# Load both models once; the cells below read these module-level
# tokenizer/model objects directly.
codebert_tokenizer, codebert_model = load_codebert()
codegen_tokenizer, codegen_model = load_codegen()


 Loading CodeBERT (microsoft/codebert-base)...
 Loading CodeGen (Salesforce/codegen-350M-mono)...


Some weights of the model checkpoint at Salesforce/codegen-350M-mono were not used when initializing CodeGenForCausalLM: ['transformer.h.0.attn.causal_mask', 'transformer.h.1.attn.causal_mask', 'transformer.h.10.attn.causal_mask', 'transformer.h.11.attn.causal_mask', 'transformer.h.12.attn.causal_mask', 'transformer.h.13.attn.causal_mask', 'transformer.h.14.attn.causal_mask', 'transformer.h.15.attn.causal_mask', 'transformer.h.16.attn.causal_mask', 'transformer.h.17.attn.causal_mask', 'transformer.h.18.attn.causal_mask', 'transformer.h.19.attn.causal_mask', 'transformer.h.2.attn.causal_mask', 'transformer.h.3.attn.causal_mask', 'transformer.h.4.attn.causal_mask', 'transformer.h.5.attn.causal_mask', 'transformer.h.6.attn.causal_mask', 'transformer.h.7.attn.causal_mask', 'transformer.h.8.attn.causal_mask', 'transformer.h.9.attn.causal_mask']
- This IS expected if you are initializing CodeGenForCausalLM from the checkpoint of a model trained on another task or with another architecture (e

In [18]:
def get_embeddings(code: str):
    """Return a single CodeBERT embedding vector for ``code``.

    The snippet is tokenized, passed through ``codebert_model`` without
    gradient tracking, and the per-token hidden states are averaged over the
    sequence dimension (``dim=1``).

    Args:
        code: Source code to embed.

    Returns:
        The mean of ``last_hidden_state`` over the token axis.
    """
    # truncation=True caps the input at the tokenizer's configured maximum
    # length, matching the other CodeBERT calls in this notebook; without it
    # an over-long snippet is passed to the encoder at full length.
    inputs = codebert_tokenizer(code, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = codebert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)


In [19]:
def generate_continuation(code: str, max_length: int = 128):
    """Extend ``code`` with CodeGen and return prompt plus continuation.

    Args:
        code: The prompt (student code) to continue.
        max_length: Upper bound on the TOTAL sequence length in tokens,
            prompt included, as passed to ``generate``.

    Returns:
        The decoded sequence (prompt followed by any generated tokens) with
        all non-ASCII characters removed.
    """
    inputs = codegen_tokenizer(code, return_tensors="pt")
    prompt_ids = inputs["input_ids"]

    if prompt_ids.shape[1] >= max_length:
        # The prompt already uses the whole token budget, so there is no room
        # for new tokens. NOTE(review): recent transformers releases reject
        # this case in generate() instead of only warning - confirm against
        # the installed version. Either way, return the prompt unchanged.
        sequence = prompt_ids[0]
    else:
        outputs = codegen_model.generate(
            **inputs,
            max_length=max_length,
            # CodeGen defines no pad token; reuse EOS to avoid the warning.
            pad_token_id=codegen_tokenizer.eos_token_id,
        )
        sequence = outputs[0]

    continuation = codegen_tokenizer.decode(sequence, skip_special_tokens=True)

    # Drop any non-ASCII characters from the decoded text.
    continuation = continuation.encode("ascii", errors="ignore").decode()
    return continuation


In [20]:
def generate_feedback(code: str):
    """Print the student code, model outputs, and heuristic feedback.

    The function prints, in order: the code itself, the shape of its CodeBERT
    embedding, the CodeGen continuation, and a list of feedback suggestions
    derived from simple text heuristics. Nothing is returned.

    Args:
        code: The student's source code.
    """
    # Imported locally so this cell does not depend on a later cell's import.
    import re

    print("\n===  Student Code ===\n")
    print(code)

    emb = get_embeddings(code)
    print("\n===  Embedding Shape (CodeBERT) ===")
    print(emb.shape)

    continuation = generate_continuation(code)
    print("\n===  CodeGen Continuation ===\n")
    print(continuation)

    feedback = []
    if "while True" in code:
        feedback.append(" Your loop may run forever. Consider adding a stopping condition.")
    # Heuristic: a "/" (or "/=", "//") followed by an identifier or a
    # parenthesised expression, i.e. a divisor whose value is not a literal.
    # This replaces the old check for the letter "b" anywhere in the code,
    # which only worked when the divisor happened to be named "b".
    if re.search(r"/=?\s*[A-Za-z_(]", code):
        feedback.append(" Be careful with division by zero.")
    if "input(" in code:
        feedback.append(" Remember to handle invalid user input gracefully.")

    # Always end with an open question to prompt reflection.
    feedback.append(" What is the purpose of this function, and does it handle all possible cases?")

    print("\n===  Feedback Suggestions ===")
    for fb in feedback:
        print("- " + fb)

In [21]:

# Demo input: a division wrapped in a `while True` loop, which contains the
# patterns generate_feedback looks for when building its loop and division hints.
student_code = """
def divide(a, b):
    while True:
        return a / b
"""

generate_feedback(student_code)


===  Student Code ===


def divide(a, b):
    while True:
        return a / b


===  Embedding Shape (CodeBERT) ===
torch.Size([1, 768])

===  CodeGen Continuation ===


def divide(a, b):
    while True:
        return a / b

def main():
    print("Divide")
    print("What is the dividend?")
    dividend = input()
    print("What is the divisor?")
    divisor = input()
    print("What is the result?")
    result = divide(int(dividend), int(divisor))
    print(result)

if __name__ == "__main__":
    main()

===  Feedback Suggestions ===
-  Your loop may run forever. Consider adding a stopping condition.
-  Be careful with division by zero.
-  What is the purpose of this function, and does it handle all possible cases?


In [22]:
#  Test Case 2: an "average" function that overwrites `total` instead of
#  accumulating it, followed by hand-written guiding questions.
student_code_2 = """
def calculate_average(numbers):
    total = 0
    for i in range(len(numbers)):
        total = numbers[i]
    return total / len(numbers)
"""

print("\n=== Student Code 2 ===\n")
print(student_code_2)


# Per-token CodeBERT hidden states (not pooled). Inference only, so skip
# gradient tracking.
inputs_2 = codebert_tokenizer(student_code_2, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
    embeddings_2 = codebert_model(**inputs_2).last_hidden_state
print("\n=== Embedding Shape (CodeBERT) ===")
print(embeddings_2.shape)


inputs_cg_2 = codegen_tokenizer(student_code_2, return_tensors="pt")
outputs_cg_2 = codegen_model.generate(
    **inputs_cg_2,
    max_length=150,
    num_return_sequences=1,
    # Set explicitly so generate() does not warn about a missing pad token.
    pad_token_id=codegen_tokenizer.eos_token_id,
)
print("\n=== CodeGen Continuation ===\n")
print(codegen_tokenizer.decode(outputs_cg_2[0], skip_special_tokens=True))


print("\n=== Feedback Suggestions ===")
print("-  Model continuation (as a hint, not a solution):")
print("-  Are you sure you're updating the total correctly inside the loop?")
print("-  What happens if the input list is empty?")
print("-  Does your return value actually represent an average, or just the last element divided by the count?")



=== Student Code 2 ===


def calculate_average(numbers):
    total = 0
    for i in range(len(numbers)):
        total = numbers[i]   
    return total / len(numbers)  



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Embedding Shape (CodeBERT) ===
torch.Size([1, 65, 768])

=== CodeGen Continuation ===


def calculate_average(numbers):
    total = 0
    for i in range(len(numbers)):
        total = numbers[i]   
    return total / len(numbers)  

def main():
    numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    print(calculate_average(numbers))

if __name__ == "__main__":
    main()

=== Feedback Suggestions ===
-  Model continuation (as a hint, not a solution):
-  Are you sure you're updating the total correctly inside the loop?
-  What happens if the input list is empty?
-  Does your return value actually represent an average, or just the last element divided by the count?


In [26]:
import re

def detect_gaps(student_code: str):
    """Return heuristic, question-style hints about likely conceptual gaps.

    The checks are purely textual (regular expressions over the source), so
    they can be fooled by comments or strings; they are hints, not analysis.

    Args:
        student_code: The student's source code.

    Returns:
        A non-empty list of suggestion strings. When no check fires, the list
        holds a single "no obvious gaps" message.
    """
    suggestions = []

    # Match keywords as whole words so identifiers such as "meanwhile" or
    # "breakpoint" are not mistaken for loop keywords.
    has_while = re.search(r"\bwhile\b", student_code) is not None
    has_exit = re.search(r"\b(?:break|return)\b", student_code) is not None

    if has_while and not has_exit:
        suggestions.append(" Your loop may run forever. Do you have a stopping condition?")

    if re.search(r"/\s*len\(", student_code):
        suggestions.append(" What happens if the list is empty? Division by zero might occur.")

    # An update is either "x = y + ..." / "x = y - ..." or the augmented
    # forms "x += ..." / "x -= ...".
    updates_variable = re.search(r"\w+\s*(?:[\+\-]=|=\s*\w+\s*[\+\-])", student_code)
    if has_while and updates_variable is None:
        suggestions.append(" Are you updating your loop variable correctly inside the loop?")

    # Look for "total" anywhere inside a returned expression (for example
    # "return total / len(xs)"), not only a bare "return total".
    returned_exprs = re.findall(r"\breturn\s+(.*)", student_code)
    returns_total = any(re.search(r"\btotal\b", expr) for expr in returned_exprs)
    accumulates_total = re.search(
        r"\btotal\s*(?:[\+\-]=|=\s*total\s*[\+\-])", student_code
    )
    if returns_total and accumulates_total is None:
        suggestions.append(" Does your return value represent an accumulated result, or just the initial/last value?")

    if "def " not in student_code:
        suggestions.append(" Try wrapping your logic in a function for clarity and reusability.")

    if not suggestions:
        suggestions.append(" No obvious conceptual gaps detected. Looks good!")

    return suggestions


In [30]:
# Interactive session: read a pasted snippet terminated by a line "END",
# show model outputs and heuristic feedback, and repeat until an empty
# snippet (END entered immediately) is submitted.
while True:
    print("\nPaste your Python code (type END to quit):")
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # Input stream closed: treat it like an explicit END so the
            # session finishes cleanly instead of raising.
            break
        if line.strip().upper() == "END":
            break
        lines.append(line)
    if not lines:
        break

    student_code = "\n".join(lines)
    print("\n=== Your Code ===")
    print(student_code)

    inputs = codebert_tokenizer(student_code, return_tensors="pt", truncation=True, padding=True)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        embeddings = codebert_model(**inputs).last_hidden_state
    print("\n=== Embedding Shape (CodeBERT) ===")
    print(embeddings.shape)

    inputs_cg = codegen_tokenizer(student_code, return_tensors="pt")
    outputs_cg = codegen_model.generate(
        **inputs_cg,
        max_length=150,
        num_return_sequences=1,
        # Set explicitly so generate() does not warn about a missing pad token.
        pad_token_id=codegen_tokenizer.eos_token_id,
    )
    print("\n=== CodeGen Continuation ===\n")
    print(codegen_tokenizer.decode(outputs_cg[0], skip_special_tokens=True))

    print("\n=== Feedback Suggestions ===")
    for fb in detect_gaps(student_code):
        print("-", fb)



Paste your Python code (type END to quit):
def division(nums):
 a/b
END


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Your Code ===
def division(nums):
 a/b

=== Embedding Shape (CodeBERT) ===
torch.Size([1, 12, 768])

=== CodeGen Continuation ===

def division(nums):
 a/b = a/b
    b/a = b/a
    b//a = b/a
    b/=a
    b%a = b%a
    b**a = b**a
    b//=a
    b%=a
    b/=a
    b%=a
    b//=a
    b%=a
    b/=a
    b%=a
    b//=a
    b%=a
    b/=a
    b%=a
    b//=a
    b%=a
    b/=a
    b%

=== Feedback Suggestions ===
-  No obvious conceptual gaps detected. Looks good!

Paste your Python code (type END to quit):
END
