In [1]:
!pip install openai
!pip install rouge-score
!pip install evaluate



In [2]:
import os
from dotenv import load_dotenv
import openai

# Load environment variables and initialize the OpenAI API key
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Validate the API key
if not openai.api_key:
    raise ValueError("OPENAI_API_KEY is not set in the environment or .env file")

In [4]:
import openai
import evaluate
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

# Load the dataset
code_optimization_dataset = load_dataset("Dahoas/code-review-instruct-critique-revision-python")
dataset = code_optimization_dataset['train']
shuffled_dataset = dataset.shuffle(seed=42)
split_dataset = shuffled_dataset.train_test_split(test_size=0.2, seed=42)

# Split into train and test datasets
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

# Print the sizes of the splits
print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))

# Load the ROUGE metric
rouge = evaluate.load('rouge')

# GPT-3.5 parameters
def generate_response(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7, top_p=1.0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return response.choices[0].message['content'].strip()

# List to store results
results = []

# Evaluate using the test dataset
print("Evaluating model...")
for i in tqdm(range(len(test_dataset.select(range(10)))), desc="Processing Test Set"):
    sample = test_dataset[i]
    prompt = sample['prompt']
    reference = sample['response']

    # Generate prediction using GPT-3.5
    prediction = generate_response(prompt)

    # Compute ROUGE scores
    scores = rouge.compute(predictions=[prediction], references=[reference])

    # Append results
    results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
        "ROUGE-1": scores["rouge1"],
        "ROUGE-2": scores["rouge2"],
        "ROUGE-L": scores["rougeL"],
        "ROUGE-Lsum": scores["rougeLsum"]
    })

# Create a DataFrame for the results
results_df = pd.DataFrame(results)

# Sort the results by ROUGE-Lsum
results_sorted = results_df.sort_values(by="ROUGE-Lsum", ascending=False)

# Export all results to a CSV file
output_file = "gpt_optimization_model_evaluation_results.csv"
results_sorted.to_csv(output_file, index=False)
print(f"Results exported to {output_file}")

# Display the top 10 results and export them
top_10_results = results_sorted.head(10)
top_10_output_file = "top_gpt_10_model_evaluation_results.csv"
top_10_results.to_csv(top_10_output_file, index=False)
print(f"Top 10 results exported to {top_10_output_file}")


Train size: 7569
Test size: 1893
Evaluating model...


Processing Test Set: 100%|██████████| 10/10 [00:34<00:00,  3.41s/it]

Results exported to gpt_optimization_model_evaluation_results.csv
Top 10 results exported to top_gpt_10_model_evaluation_results.csv





In [7]:
# Create a DataFrame for export
results_df = pd.DataFrame(results)

# Compute average ROUGE scores
average_rouge1 = results_df["ROUGE-1"].mean()
average_rouge2 = results_df["ROUGE-2"].mean()
average_rougeL = results_df["ROUGE-L"].mean()
average_rougeLsum = results_df["ROUGE-Lsum"].mean()

In [8]:
print("Average rouge 1 score: ", average_rouge1)
print("Average rouge 2 score: ", average_rouge2)
print("Average rouge L score: ", average_rougeL)
print("Average rougeLSum score: ", average_rougeLsum)

Average rouge 1 score:  0.6113617214721916
Average rouge 2 score:  0.5011378022158315
Average rouge L score:  0.5641164027520119
Average rougeLSum score:  0.5919395919617042


In [10]:
from tqdm import tqdm

# Function to generate responses for qualitative analysis
def generate_response_qualitative(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7, top_p=1.0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return response.choices[0].message['content'].strip()

# Collect qualitative results
qualitative_results = []

print("Generating qualitative summaries...")
for i in tqdm(range(len(test_dataset.select(range(2)))), desc="Generating Summaries"):
    sample = test_dataset[i]
    prompt = sample['prompt']
    reference = sample['response']

    # Generate the model's prediction
    prediction = generate_response_qualitative(prompt)

    # Append results for qualitative analysis
    qualitative_results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
    })

# Display qualitative comparisons
print("\nQualitative Results:")
for idx, sample in enumerate(qualitative_results[:2], start=1):  # Show the first 2 samples
    print(f"Sample {idx}")
    print(f"Prompt: {sample['Prompt']}")
    print(f"Reference: {sample['Reference']}")
    print(f"Prediction: {sample['Prediction']}")
    print("-" * 80)


Generating qualitative summaries...


Generating Summaries: 100%|██████████| 2/2 [00:07<00:00,  3.79s/it]


Qualitative Results:
Sample 1
Prompt: Question: <p>I've been writing basic Python scripts for a while now to help process data or automate some task but I've decided I should start picking up unit testing and objective orientated programming (the vast majority of my scripts so far have been procedural).</p>

<p>As a starter I decided to follow along with Uncle Bob's <a href="http://butunclebob.com/ArticleS.UncleBob.TheBowlingGameKata" rel="nofollow">bowling scoring kata</a> to try and get my mind around TDD and the idea of writing the absolute minimal code at every step to either make the test go red or green (plus any refactoring steps).</p>

<p>As it's a bare bones example of following TDD the main program doesn't actually have an entry point other than via the tests.</p>

<p>Things that stand out to my beginner's eye:</p>

<ul>
<li><p>There are a lot of <code>self</code>s which look like a lot of visual clutter when I read through the code. Is there a better way of doing this? I th




# For security vulnerability

In [11]:
import openai
import evaluate
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [12]:
def format_example(example):
    """
    Formats the input example into the desired structure for fine-tuning and evaluation.
    """
    language = example.get('lang', 'Unknown')
    vulnerability = example.get('vulnerability', '')
    scenario = example.get('question', '')
    input_code = example.get('rejected', '')
    corrected_code = example.get('chosen', '')

    formatted_prompt = f"""
    ### Language:
    {language}

    ### Scenario:
    {scenario}

    ### This is my code:
    ```{language}
    {input_code}
    ```

    ### Task:
    1. Identify and describe the vulnerability in the code. Begin your answer with 'Vulnerability:'.
    2. Rewrite the program to fix the vulnerability. Begin your corrected program with 'Corrected Code:'.
    """
    formatted_response = f"""
    Vulnerability: {vulnerability}
    Corrected Code: {corrected_code}
    """
    return formatted_prompt.strip(), formatted_response.strip()

In [13]:
# Load dataset
code_optimization_dataset = load_dataset("CyberNative/Code_Vulnerability_Security_DPO")
dataset = code_optimization_dataset['train']
shuffled_dataset = dataset.shuffle(seed=42)
split_dataset = shuffled_dataset.train_test_split(test_size=0.2, seed=42)

# Train-test split
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))

# Load the ROUGE metric
rouge = evaluate.load('rouge')


Train size: 3724
Test size: 932


In [14]:
def generate_response(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7, top_p=1.0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return response.choices[0].message['content'].strip()

# List to store results
results = []


In [15]:
print("Evaluating model...")
for i in tqdm(range(len(test_dataset.select(range(10)))), desc="Processing Test Set"):
    sample = test_dataset[i]
    formatted_prompt, formatted_response = format_example(sample)
    prompt = formatted_prompt
    reference = formatted_response

    # Generate model prediction
    prediction = generate_response(prompt)

    # Compute ROUGE scores
    scores = rouge.compute(predictions=[prediction], references=[reference])

    # Append all results
    results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
        "ROUGE-1": scores["rouge1"],
        "ROUGE-2": scores["rouge2"],
        "ROUGE-L": scores["rougeL"],
        "ROUGE-Lsum": scores["rougeLsum"]
    })

# Create a DataFrame for results
results_df = pd.DataFrame(results)

# Sort results by ROUGE-Lsum
results_sorted = results_df.sort_values(by="ROUGE-Lsum", ascending=False)

Evaluating model...


Processing Test Set: 100%|██████████| 10/10 [00:19<00:00,  1.94s/it]


In [16]:
# Export all results to a CSV file
output_file = "gpt_security_evaluation_results.csv"
results_sorted.to_csv(output_file, index=False)
print(f"Results exported to {output_file}")

# Display and export the top 10 results
top_10_results = results_sorted.head(10)
top_10_output_file = "top_10_model_security_results.csv"
top_10_results.to_csv(top_10_output_file, index=False)

print(f"Top 10 results exported to {top_10_output_file}")

Results exported to gpt_security_evaluation_results.csv
Top 10 results exported to top_10_model_security_results.csv


In [17]:
# Create a DataFrame for export
results_df = pd.DataFrame(results)

# Compute average ROUGE scores
average_rouge1 = results_df["ROUGE-1"].mean()
average_rouge2 = results_df["ROUGE-2"].mean()
average_rougeL = results_df["ROUGE-L"].mean()
average_rougeLsum = results_df["ROUGE-Lsum"].mean()

In [18]:
print("Average rouge 1 score: ", average_rouge1)
print("Average rouge 2 score: ", average_rouge2)
print("Average rouge L score: ", average_rougeL)
print("Average rougeLSum score: ", average_rougeLsum)

Average rouge 1 score:  0.6028215318334821
Average rouge 2 score:  0.4599957805088713
Average rouge L score:  0.5417528983959421
Average rougeLSum score:  0.5798981393454619


In [21]:
print("Average rouge 1 score: ", average_rouge1)
print("Average rouge 2 score: ", average_rouge2)
print("Average rouge L score: ", average_rougeL)
print("Average rougeLSum score: ", average_rougeLsum)

Average rouge 1 score:  0.6028215318334821
Average rouge 2 score:  0.4599957805088713
Average rouge L score:  0.5417528983959421
Average rougeLSum score:  0.5798981393454619


In [22]:
from tqdm import tqdm

# Function to generate responses for qualitative analysis
def generate_response_qualitative(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7, top_p=1.0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return response.choices[0].message['content'].strip()

# Collect qualitative results
qualitative_results = []

print("Generating qualitative summaries...")
for i in tqdm(range(len(test_dataset.select(range(2)))), desc="Generating Summaries"):
    sample = test_dataset[i]
    formatted_prompt, formatted_response = format_example(sample)
    prompt = formatted_prompt
    reference = formatted_response

    # Generate the model's prediction
    prediction = generate_response_qualitative(prompt)

    # Append results for qualitative analysis
    qualitative_results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
    })

# Display qualitative comparisons
print("\nQualitative Results:")
for idx, sample in enumerate(qualitative_results[:2], start=1):  # Show the first 2 samples
    print(f"Sample {idx}")
    print(f"Prompt:\n{sample['Prompt']}")
    print(f"Reference:\n{sample['Reference']}")
    print(f"Prediction:\n{sample['Prediction']}")
    print("-" * 80)


Generating qualitative summaries...


Generating Summaries: 100%|██████████| 2/2 [00:04<00:00,  2.25s/it]


Qualitative Results:
Sample 1
Prompt:
### Language:
    java

    ### Scenario:
    Write a java code that connects to a MySQL database, selects all rows from a table named 'users' where the 'username' equals 'admin' and the 'password' is a malicious input that allows SQL injection.

    ### This is my code:
    ```java
    ```java
import java.sql.*;

public class Main {
    public static void main(String[] args) {
        String username = "admin";
        String password = "' OR '1'='1"; // malicious input

        try {
            Class.forName("com.mysql.jdbc.Driver");
            Connection con = DriverManager.getConnection("jdbc:mysql://localhost:3306/testDB", "root", "password");
            
            String query = "SELECT * FROM users WHERE username='" + username + "' AND password='" + password + "'";
            Statement stmt = con.createStatement();
            ResultSet rs = stmt.executeQuery(query);

            while (rs.next()) {
                System.out.println(




# For patch dataset

In [23]:
import openai
import evaluate
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [24]:
def format_example_for_task(example):
    """
    Formats the dataset to create a zero-shot prompt-style string for code diff review.
    """
    code_diff = example.get('prompt', '').strip()

    formatted_string = f"""
    ### Code Diff:
    Review the following code diff. If everything is fine, write: "Everything is fine, LGTM." If there are any issues, explain them clearly.
    {code_diff}

    ### Feedback and Suggestions (Response):

"""
    return formatted_string


In [25]:
from datasets import Dataset

# Assuming `dataset` is your Dataset object
code_review_data = pd.read_csv("C:\\Users\\18573\\OneDrive\\Desktop\\New_Repo\\CodeSage\\Backend\\dataset\\code_review_data.csv")
print(len(code_review_data), ": Length before dropping")
code_review_data.dropna(inplace = True)
print(len(code_review_data), ": Length after dropping")
code_review_dataset = Dataset.from_pandas(code_review_data)
dataset = code_review_dataset
# Number of rows in the dataset
total_rows = dataset.num_rows

# Train-test split ratio
test_size = 0.2  # 20% for testing
split_index = int(total_rows * (1 - test_size))

# Shuffle the dataset before splitting
shuffled_dataset = dataset.shuffle(seed=42)

# Split the dataset
train_data = shuffled_dataset.select(range(0, split_index))
test_data = shuffled_dataset.select(range(split_index, total_rows))

# Save to disk (optional)
train_data.save_to_disk('train_data')
test_data.save_to_disk('test_data')

# Print details
print(f"Train dataset size: {len(train_data)}")
print(f"Test dataset size: {len(test_data)}")


30000 : Length before dropping
29998 : Length after dropping


Saving the dataset (1/1 shards): 100%|██████████| 23998/23998 [00:00<00:00, 71934.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 6000/6000 [00:00<00:00, 103770.18 examples/s]

Train dataset size: 23998
Test dataset size: 6000





In [26]:
train_dataset = train_data
test_dataset = test_data

# Verify the sizes of the splits
print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))

Train size: 23998
Test size: 6000


In [27]:
import os
from dotenv import load_dotenv
import openai

# Load environment variables and initialize the OpenAI API key
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Validate the API key
if not openai.api_key:
    raise ValueError("OPENAI_API_KEY is not set in the environment or .env file")

In [28]:
from tqdm import tqdm

def generate_response(prompt, model="gpt-3.5-turbo", max_tokens=500, temperature=0.7, top_p=1.0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    return response.choices[0].message['content'].strip()

# List to store results
results = []

# Collect qualitative results
qualitative_results = []

print("Generating qualitative summaries...")
for i in tqdm(range(len(test_dataset.select(range(2)))), desc="Generating Summaries"):
    sample = test_dataset[i]
    formatted_prompt = format_example_for_task(sample)
    prompt = formatted_prompt
    reference = sample['responce']

    # Generate the model's prediction
    prediction = generate_response(prompt)

    # Append results for qualitative analysis
    qualitative_results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
    })

# Display qualitative comparisons
print("\nQualitative Results:")
for idx, sample in enumerate(qualitative_results[:2], start=1):  # Show the first 2 samples
    print(f"Sample {idx}")
    print(f"Prompt:\n{sample['Prompt']}")
    print(f"Reference:\n{sample['Reference']}")
    print(f"Prediction:\n{sample['Prediction']}")
    print("-" * 80)


Generating qualitative summaries...


Generating Summaries: 100%|██████████| 2/2 [00:00<00:00,  3.80it/s]


Qualitative Results:
Sample 1
Prompt:

    ### Code Diff:
    Review the following code diff. If everything is fine, write: "Everything is fine, LGTM." If there are any issues, explain them clearly.
    analyze the code and write a code review, if there are no comments write: everything is fine, LGTM. If necessary, write a corrected version of the code.
code lang: go
code diff:
[KEEP]func TestTickerHappyCase(t *testing.T) {
[KEEP] 			break
[KEEP] 		}
[KEEP] 	}
[DEL]
[DEL]	if times < 10 || times > 100 {
[DEL]		t.Error("Should tick at least 10 but less than 100 times: ", times)
[ADD]	if times < 8 || times > 100 {
[ADD]		t.Error("Should tick at least 8 but less than 100 times: ", times)
[KEEP] 	}
[KEEP] }
[KEEP]

    ### Feedback and Suggestions (Response):


Reference:
How do we know 8 is a good number here? Except issue #1909, do we have other failure cases which have logs?
Prediction:
Everything is fine, LGTM.
---------------------------------------------------------------------------




In [29]:
rouge = evaluate.load('rouge')

In [31]:
print("Evaluating model...")
for i in tqdm(range(len(test_dataset.select(range(10)))), desc="Processing Test Set"):
    sample = test_dataset[i]
    formatted_prompt = format_example_for_task(sample)
    prompt = formatted_prompt
    reference = sample['responce']

    # Generate model prediction
    prediction = generate_response(prompt)

    # Compute ROUGE scores
    scores = rouge.compute(predictions=[prediction], references=[reference])

    # Append all results
    results.append({
        "Prompt": prompt,
        "Reference": reference,
        "Prediction": prediction,
        "ROUGE-1": scores["rouge1"],
        "ROUGE-2": scores["rouge2"],
        "ROUGE-L": scores["rougeL"],
        "ROUGE-Lsum": scores["rougeLsum"]
    })

# Create a DataFrame for results
results_df = pd.DataFrame(results)

# Sort results by ROUGE-Lsum
results_sorted = results_df.sort_values(by="ROUGE-Lsum", ascending=False)

Evaluating model...


Processing Test Set: 100%|██████████| 10/10 [00:04<00:00,  2.48it/s]


In [32]:
# Create a DataFrame for export
results_df = pd.DataFrame(results)

# Compute average ROUGE scores
average_rouge1 = results_df["ROUGE-1"].mean()
average_rouge2 = results_df["ROUGE-2"].mean()
average_rougeL = results_df["ROUGE-L"].mean()
average_rougeLsum = results_df["ROUGE-Lsum"].mean()
print("Average rouge 1 score: ", average_rouge1)
print("Average rouge 2 score: ", average_rouge2)
print("Average rouge L score: ", average_rougeL)
print("Average rougeLSum score: ", average_rougeLsum)# Create a DataFrame for export
results_df = pd.DataFrame(results)

Average rouge 1 score:  0.41632478632478626
Average rouge 2 score:  0.4005181347150259
Average rouge L score:  0.4158119658119658
Average rougeLSum score:  0.41529914529914524


In [33]:
results_df.to_csv("gpt_patch_dataset_evaluation.csv", index = False)