In [1]:
import os
import getpass
import re
import time

from tqdm import tqdm  
import pandas as pd
from langchain import hub
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain.callbacks import get_openai_callback
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

In [2]:
# Azure OpenAI connection settings (this project uses the Azure-hosted OpenAI service).
# The endpoint and API version are placeholders to be filled in; the API key is
# requested interactively so that it is never stored in the notebook source.
os.environ.update(
    AZURE_OPENAI_ENDPOINT="...",
    AZURE_OPENAI_API_VERSION="...",
)
os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass()

 ········


In [3]:
# Azure OpenAI deployment names -- replace the placeholders with your own deployments.
aoai_gpt4o = "..."
# Update this value whenever a newer model/API version release is adopted.
aoai_gpt4o_mv = "2023-07-01-preview"
aoai_embeddings = "..."

In [4]:
# Chat-model client built with LangChain's AzureChatOpenAI wrapper.
# The API version is read from the environment; the deployment name and model
# version come from the settings defined above.
_aoai_llm_settings = dict(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=aoai_gpt4o,
    model_version=aoai_gpt4o_mv,
)
aoai_llm = AzureChatOpenAI(**_aoai_llm_settings)

In [5]:
def extract_python_code(markdown_text):
    """Extract Python source from a Markdown-formatted reply.

    Every fenced code block (``` ... ```) is collected. An optional language
    tag right after the opening fence -- ``python``, ``python3`` or ``py``,
    in any letter case -- is dropped, as is any whitespace that follows it.

    Args:
        markdown_text: The raw text to scan (e.g. an LLM response).

    Returns:
        The contents of all fenced blocks joined by a blank line, or
        ``markdown_text`` unchanged when it contains no fenced block (the
        whole text is then assumed to be Python code).
    """
    # The tag must end at a word boundary so that code which merely *starts*
    # with the letters "py"/"python" (e.g. "pyomo_model = ...") is not truncated.
    fence_pattern = r'```(?:(?:python3?|py)\b)?\s*(.*?)```'

    # DOTALL lets a block span several lines; IGNORECASE accepts tags such as
    # "Python", which would otherwise end up inside the extracted code.
    code_blocks = re.findall(fence_pattern, markdown_text, re.DOTALL | re.IGNORECASE)

    if code_blocks:
        # Several blocks may be present; keep them all, in order.
        return '\n\n'.join(code_blocks)

    # No fences detected: treat the entire text as code.
    return markdown_text



In [6]:
def double_curly_braces(text):
    """Escape curly braces so the text is literal inside a format-style template.

    Each ``{`` becomes ``{{`` and each ``}`` becomes ``}}``. Every brace is
    escaped on its own, so nested braces (e.g. a dict inside a dict) and
    unpaired braces are handled as well; a pair-matching regex would leave
    those unbalanced and break template parsing.

    Args:
        text: The string to escape.

    Returns:
        A copy of ``text`` with all curly braces doubled.
    """
    return text.replace('{', '{{').replace('}', '}}')

In [9]:
# Read the train / validation / test splits of the schedule dataset.
data_train, data_val, data_test = map(
    pd.read_csv,
    (
        "./data/Schedule_Data_Train.csv",
        "./data/Schedule_Data_Val.csv",
        "./data/Schedule_Data_Test.csv",
    ),
)


## [A1] GPT4 with Zero shot

In [10]:
# Zero-shot baseline: the prompt contains only the task instructions and the
# problem description ({input}) -- no worked examples are supplied.
zs_prompt_template = """ You are an AI assistant with expertise in creating python problem formulation for a given job shop scheduling problem description.
I will provide you a job scheduling problem description and you goal is the answer the problem formulation in python. 
The answer you provide will be passed to a constraint programming solver(cpmpy library) to validate the problem formulation you provide.
So, revisit the problem formulation step by step to validate any syntax or logical errors and only output the valid python code for the problem formulation with no additional text and description.

If you don't know answer, just say that you don't know, don't try to make up an answer.

Input: {input}

Output:"""

# Compile the text into a LangChain prompt whose only variable is `input`.
zs_prompt = PromptTemplate.from_template(zs_prompt_template)

# Pipeline: value given to invoke() -> placed under "input" -> prompt -> chat model -> plain string.
# Note: despite the "rag" in its name, this chain contains no retrieval step.
zs_rag_chain = (
    {"input": RunnablePassthrough()}
    | zs_prompt
    | aoai_llm
    | StrOutputParser()
)

In [11]:
# Column that receives the zero-shot formulation generated for each test problem.
data_test['pf_gpt_zs'] = None

# Number of test problems to process (drives the progress-bar total).
total_iterations = len(data_test)

# Running sum of the cost reported by the OpenAI callback for each request.
total_cost = 0

# The context manager guarantees the progress bar is closed even if a request
# raises part-way through the loop.
with tqdm(total=total_iterations, desc='Processing', unit='iteration') as progress_bar:
    for index, row in data_test.iterrows():
        prob_desc = row["Description"]

        # Only calls made inside the callback context are counted towards cb.total_cost.
        with get_openai_callback() as cb:
            response = zs_rag_chain.invoke(prob_desc)

        # Keep just the code from the (possibly Markdown-fenced) reply.
        data_test.at[index, 'pf_gpt_zs'] = extract_python_code(response)
        total_cost += cb.total_cost

        # Short pause between requests -- presumably to stay under the service rate limit.
        time.sleep(1)
        progress_bar.update(1)

# `total_cost` now holds the accumulated cost for this section; display it in a
# separate cell if needed.

Processing: 100%|██████████| 20/20 [05:48<00:00, 17.40s/iteration]


In [12]:
# Persist the test set together with the formulations generated so far.
data_test.to_csv(path_or_buf="./data/results_analysis6.csv", index=False)

## [A2] GPT4 with One shot

In [163]:
#print(data_train.iloc[15]["Description"]) 

In [13]:
# Training row used as the single worked example in the one-shot prompt.
os_example_row = data_train.iloc[15]

# Both example fields are spliced into the template *text*, so any literal
# curly brace in them must be doubled; otherwise PromptTemplate would treat it
# as a template variable (or fail to parse the template).
os_example_input = double_curly_braces(os_example_row["Description"])
os_example_output = double_curly_braces(os_example_row["Prob_Formulation"])

# Instructions plus the worked example. This part is an f-string, so the
# example is baked in here; the {input} placeholder is added separately below
# as a plain string so that it survives as a template variable.
os_prompt_template_s = f""" You are an AI assistant with expertise in creating python problem formulation for a given job shop scheduling problem description.
I will provide you a job scheduling problem description and you goal is the answer the problem formulation in python. 
The answer you provide will be passed to a constraint programming solver(cpmpy library) to validate the problem formulation you provide.
So, revisit the problem formulation step by step to validate any syntax or logical errors and only output the valid python code for the problem formulation with no additional text and description.
If you don't know answer, just say that you don't know, don't try to make up an answer.

Example:
Input: {os_example_input}  
Output: 
{os_example_output} 
"""
os_template_suffix = """
Input: {input}

Output:"""

os_prompt_template = os_prompt_template_s + os_template_suffix

os_prompt = PromptTemplate.from_template(os_prompt_template)

# Pipeline: value given to invoke() -> placed under "input" -> prompt -> chat model -> plain string.
os_rag_chain = (
    {"input": RunnablePassthrough()}
    | os_prompt
    | aoai_llm
    | StrOutputParser()
)

In [174]:
#print(os_prompt_template)

In [14]:
# Column that receives the one-shot formulation generated for each test problem.
data_test['pf_gpt_os'] = None

# Running sum of the cost reported by the OpenAI callback for each request.
total_cost = 0

# The total is computed here from data_test so this cell does not depend on a
# variable defined in another section. The context manager guarantees the
# progress bar is closed even if a request raises part-way through the loop.
with tqdm(total=len(data_test), desc='Processing', unit='iteration') as progress_bar2:
    for index, row in data_test.iterrows():
        prob_desc = row["Description"]

        # Only calls made inside the callback context are counted towards cb.total_cost.
        with get_openai_callback() as cb:
            response = os_rag_chain.invoke(prob_desc)

        # Keep just the code from the (possibly Markdown-fenced) reply.
        data_test.at[index, 'pf_gpt_os'] = extract_python_code(response)
        total_cost += cb.total_cost

        # Short pause between requests -- presumably to stay under the service rate limit.
        time.sleep(1)
        progress_bar2.update(1)

# `total_cost` now holds the accumulated cost for this section; display it in a
# separate cell if needed.

Processing: 100%|██████████| 20/20 [10:49<00:00, 32.48s/iteration]

Total cost: $0.000000





In [15]:
# Persist the test set together with the formulations generated so far.
data_test.to_csv(path_or_buf="./data/results_analysis6.csv", index=False)

## [A3] GPT4 with FewShot RAG

In [16]:
# Load the training CSV through LangChain's CSVLoader. The resulting documents
# are embedded and indexed below so they can be retrieved as few-shot examples.
loader = CSVLoader(file_path='./data/Schedule_Data_Train.csv')
docs = loader.load()

In [17]:
# Embedding client for the vector index. The deployment name is taken from the
# `aoai_embeddings` setting defined alongside the other deployment names,
# rather than being hard-coded here.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment=aoai_embeddings,
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"]
)

# Embed the training documents and index them in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)

# Similarity search returning the 3 closest training examples for a query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [18]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the documents' texts joined by two newlines
        (an empty string when ``docs`` is empty).
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

In [19]:
# Sanity check of the retriever on one sample problem description.
sample_query = "Create job shop scheduling model with 7 jobs and 7 machines. All jobs have random routes and their operations have random durations. The due dates are calculated based on a total processing time of each job multiplied by a due date allowance of 1.3. Release time of a job is a random value from 0 to 50. Jobs cannot start before their release times. Each job has a weight following a random distribution in which 20% will have weight of 1, 60% will have weight of 2, and 20% will have weight of 4. The objective function is total weighted flowtime. Maximum duration is 20. After solving the problem, solutions will be printed and visualized. Note: The first task related to each job should be completed before the completion of any job."
examples = retriever.invoke(sample_query)

In [20]:
# Few-shot prompt: {examples} receives the retrieved training examples and
# {input} the description of the problem to formulate.
fs_prompt_template = """ You are an AI assistant with expertise in creating python problem formulation for a given job shop scheduling problem description.
I will provide you a job scheduling problem description and you goal is the answer the problem formulation in python. 
The answer you provide will be passed to a constraint programming solver(cpmpy library) to validate the problem formulation you provide.
So, revisit the problem formulation step by step to validate any syntax or logical errors and only output the valid python code for the problem formulation with no additional text and description.

If you don't know answer, just say that you don't know, don't try to make up an answer.

Examples:
{examples}

Description: {input}

Prob_formulation:"""

fs_prompt = PromptTemplate.from_template(fs_prompt_template)

# The chain is invoked with a dict {"examples": ..., "input": ...}, which the
# prompt consumes directly. No mapping step is placed in front of it: mapping
# both keys to RunnablePassthrough() would hand the *entire* input dict to each
# key, so the prompt would render the stringified dict twice instead of the
# examples and the description.
fs_rag_chain = (
    fs_prompt
    | aoai_llm
    | StrOutputParser()
)

In [216]:
# print(fs_rag_chain)
# fs_rag_chain.invoke({"examples": examples, "input": data_test.iloc[0]["Description"]})

In [21]:
# Column that receives the few-shot (retrieval-augmented) formulation for each test problem.
data_test['pf_gpt_fs_rag'] = None

# Running sum of the cost reported by the OpenAI callback for each request.
total_cost = 0

# The total is computed here from data_test so this cell does not depend on a
# variable defined in another section. The context manager guarantees the
# progress bar is closed even if a request raises part-way through the loop.
with tqdm(total=len(data_test), desc='Processing', unit='iteration') as progress_bar3:
    for index, row in data_test.iterrows():
        prob_desc = row["Description"]

        # Retrieve the training examples closest to this description and
        # flatten them into one text block for the prompt.
        matched_examples = retriever.invoke(prob_desc)
        fs_prompt_examples = format_docs(matched_examples)

        # Only calls made inside the callback context are counted towards cb.total_cost.
        with get_openai_callback() as cb:
            response = fs_rag_chain.invoke({"examples": fs_prompt_examples, "input": prob_desc})

        # Keep just the code from the (possibly Markdown-fenced) reply.
        data_test.at[index, 'pf_gpt_fs_rag'] = extract_python_code(response)
        total_cost += cb.total_cost

        # Short pause between requests -- presumably to stay under the service rate limit.
        time.sleep(1)
        progress_bar3.update(1)

# `total_cost` now holds the accumulated cost for this section; display it in a
# separate cell if needed.

Processing: 100%|██████████| 20/20 [10:15<00:00, 30.75s/iteration]

Total cost: $0.000000





In [22]:
# Persist the test set together with the formulations generated so far.
data_test.to_csv(path_or_buf="./data/results_analysis6.csv", index=False)