<a href="https://colab.research.google.com/github/Nikil263/llm-codechallenge/blob/main/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Creating eval dataset**

In [1]:
import random
import json

# Function to generate random coefficients (including negative ones) and equations
def generate_equation():
    coefficients = [random.randint(-10, 10) for _ in range(3)]
    signs = ['<=', '>=', '=']
    sign = random.choice(signs)
    return f"{coefficients[0]}X {'-' if coefficients[1] < 0 else '+'} {abs(coefficients[1])}Y {sign} {coefficients[2]}"

# Function to generate the corresponding Pyomo constraint
def generate_constraint(equation):
    equation_parts = equation.split()

    # Check if equation_parts has enough elements
    if len(equation_parts) < 5:
        return None

    a, sign_b, b = equation_parts[0], equation_parts[1], equation_parts[2]
    sign = equation_parts[3]
    c = equation_parts[4]

    # Format coefficients
    a_coeff = f"{a.strip().replace('X', '')} * model.x"
    b_coeff = f"{'-' if '-' in sign_b else '+'} {b.strip().replace('Y', '')} * model.y"

    # Combine coefficients, handling the sign of the second coefficient correctly
    pyomo_expr = f"{a_coeff} {b_coeff} {sign} {c.strip()}"

    return f"model.constraint3 = Constraint(expr={pyomo_expr})"

# Generate the dataset
dataset = []
for _ in range(100):
    equation = generate_equation()
    user_input = f"add a constraint of {equation} in our model."
    output = generate_constraint(equation)

    # Skip if output is None (equation_parts not enough elements)
    if output is None:
        continue

    dataset.append({"input": user_input, "output": output})

# Validate and save the dataset to a JSON file
try:
    json_data = json.dumps(dataset, indent=4)
    with open('eval_dataset.json', 'w') as f:
        f.write(json_data)
    print("Dataset created and saved to 'eval_dataset.json'")
except json.JSONDecodeError as e:
    print(f"JSON error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")


Dataset created and saved to 'train.json'


**Evaluation**

In [13]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer
import json

# Load the fine-tuned model and tokenizer
model_name = "Nikil263/Fine_Tuned_GPT2model"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Initialize the pipeline with the fine-tuned model
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Load the evaluation dataset
with open("eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Function to generate constraints
def generate_constraint(user_input):
    response = generator(user_input, max_length=50, num_return_sequences=1)
    new_constraint_code = response[0]['generated_text'].strip()
    lines = new_constraint_code.split('\n')
    constraint_line = None

    for line in lines:
      if line.startswith("output: model.constraint3"):
        constraint_line = line
        break
    if constraint_line == None:
      return ""
    constraint_line = constraint_line.replace("output: ", "")
    return constraint_line

# Initialize lists to store results
y_true = []
y_pred = []
bleu_scores = []

# Smoothing function for BLEU score
smooth = SmoothingFunction().method1

# Evaluate the model
for item in eval_data:
    input_text = item['input']
    reference_constraint = item['output']

    generated_constraint = generate_constraint(input_text)

    y_true.append(reference_constraint)
    y_pred.append(generated_constraint)

    # Calculate BLEU score
    reference_tokens = reference_constraint.split()
    generated_tokens = generated_constraint.split()
    bleu_score = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smooth)
    bleu_scores.append(bleu_score)

# Calculate Accuracy
accuracy = accuracy_score(y_true, y_pred)


# Calculate Average BLEU Score
average_bleu_score = sum(bleu_scores) / len(bleu_scores)




Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [15]:
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Average BLEU Score: {average_bleu_score:.4f}")

Accuracy: 72.00%
Average BLEU Score: 0.8894
