# What is a persona prompt?
Creating a prompt with a persona is a method of generating questions that are targeted towards a specific user group, 
in this case prospective students and applicants of the University of Texas at Dallas.

In [None]:
# imports libraries and setup
import os
import openai
from dotenv import load_dotenv
load_dotenv()  # Pull variables from a local .env file into the environment.

# OpenAI credentials are read from the environment so they never have to be
# hard-coded in the notebook.
OPENKEY_API = os.getenv("OPENAI_KEY")
ORGANIZATION_ID = os.getenv("ORGANIZATION_ID")
if not OPENKEY_API:
    # os.environ only accepts strings, so a missing key would otherwise show
    # up as an opaque TypeError on the assignment below.
    raise RuntimeError(
        "OPENAI_KEY is not set; add it to your .env file or environment."
    )
os.environ["OPENAI_API_KEY"] = OPENKEY_API
# The organization ID is shown on OpenAI under organization > settings
# (top-right dropdown).
openai.organization = ORGANIZATION_ID
openai.api_key = OPENKEY_API

# Fields

#  Majors
1. Computer Science
2. Computer Engineering
3. Electrical Engineering
4. Mechanical Engineering
5. Mathematics
6. ...

In [None]:
# Degree programs used to parameterize the persona prompts: the generation
# cells issue one request per entry, and the relevance filter reuses the same
# names as keywords.
# NOTE(review): the entries appear to be grouped by school (arts, management,
# engineering, behavioral sciences, policy, natural sciences) -- confirm
# against the current UTD catalog before relying on the ordering.
MAJORS = [
    "Arts and Technology",
    "Emerging Media and Communication",
    "Accounting",
    "Business Administration",
    "Business Analytics",
    "Energy Management",
    "Finance",
    "Healthcare Management",
    "Information Technology and Systems",
    "Innovation and Entrepreneurship",
    "International Management Studies",
    "Management",
    "Marketing",
    "Supply Chain Management",
    "Aerospace Engineering",
    "Bioengineering",
    "Computer Engineering",
    "Computer Science",
    "Cybersecurity",
    "Electrical Engineering",
    "Materials Science and Engineering",
    "Mechanical Engineering",
    "Software Engineering",
    "Applied Behavior Analysis",
    "Audiology",
    "Child Learning and Development",
    "Cognition and Neuroscience",
    "Communication Disorders",
    "Criminology",
    "Developmental Psychology",
    "Neuroscience",
    "Economics",
    "Geospatial Information Sciences",
    "International Political Economy",
    "International Relations",
    "Political Science",
    "Public Affairs",
    "Public Policy and Political Economy",
    "Sociology",
    "Actuarial Science",
    "Applied Mathematics",
    "Biology",
    "Chemistry",
    "Data Science",
    "Geosciences",
    "Mathematics",
    "Molecular and Cell Biology",
    "Physics",
    "Statistics"
]

# Base Persona Prompt using ChatGPT

In [None]:
from rich import print

# Baseline request: the persona is only "a student at UTD", with no major,
# grade or interest attached yet.
prompt = "Provide a list of questions that a student at University of Texas at Dallas  would create in number format"

response = openai.ChatCompletion.create(
    messages=[
        # The system message mirrors the default ChatGPT preamble.
        dict(
            role="system",
            content=(
                "You are ChatGPT, a large language model trained by OpenAI. "
                "Follow the user's instructions carefully. "
                "Respond using markdown.\n"
                "Knowledge cutoff: 2021-09-01\n"
                "Current date: 2023-03-02"
            ),
        ),
        dict(role="user", content=prompt),
    ],
    model="gpt-3.5-turbo",
)
print(response)

# Preprocess

In [None]:
def preprocess(chatgpt_response) -> list:
    """Extract the numbered questions from a ChatCompletion response.

    The content of the first choice is split into lines, and only lines of
    the form ``<number>. <text>`` are kept. The numbering and any surrounding
    whitespace are removed from each kept line.

    Args:
        chatgpt_response: Mapping shaped like an OpenAI ChatCompletion
            response; ``['choices'][0]['message']['content']`` is read.

    Returns:
        A list of question strings in the order they appear in the response.
        Lines that are not numbered items, and numbered items with no text,
        are dropped.
    """
    import re

    # One or more digits followed by a dot: this keeps items numbered 10 and
    # above, and a one-character line can no longer raise IndexError.
    numbered_item = re.compile(r"\s*\d+\.(.*)")

    content = chatgpt_response['choices'][0]['message']["content"]
    questions = []
    for line in content.split('\n'):
        match = numbered_item.match(line)
        if match is None:
            continue
        question = match.group(1).strip()
        if question:
            questions.append(question)
    return questions

# Store the instructions in a pandas DataFrame that is saved to a CSV file

In [None]:
def store_the_persona_question(chatgpt_response, **kwargs):
    """Append the questions from a ChatCompletion response to evaluation.csv.

    Each question becomes one row with the columns ``Question``,
    ``date_created`` and ``model``, plus one extra column per keyword
    argument (for example ``major=...`` or ``prompt=...``).

    Args:
        chatgpt_response: Response object handed to ``preprocess`` to obtain
            the list of question strings.
        **kwargs: Extra metadata. Each key becomes a column whose value is
            assigned to every new row; a key equal to an existing column name
            overwrites that column.

    Returns:
        None. The result is written to ``evaluation.csv`` in the current
        working directory.
    """
    import pandas as pd

    csv_path = 'evaluation.csv'

    list_of_instructions = preprocess(chatgpt_response)
    answer = pd.DataFrame(list_of_instructions, columns=['Question'])
    # Record when the rows were generated and which model produced them.
    answer['date_created'] = pd.to_datetime('today')
    answer['model'] = 'gpt-3.5-turbo'
    for key, value in kwargs.items():
        answer[key] = value

    try:
        existing = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run, or an empty placeholder file: start the dataset with
        # the new rows instead of crashing.
        combined = answer
    else:
        combined = pd.concat([existing, answer], ignore_index=True)
    combined.to_csv(csv_path, index=False)

# Prompt templates with different formats
1. What type of majors
Initial Prompt
```
prompt = "Provide a list of questions that a student {{who is {majoring}}} at University of Texas at Dallas  would create in number format"
```

In [None]:
from rich import print

# Major-only persona: one request per entry in MAJORS. The parsed questions
# are stored together with the major and the exact prompt that produced them.
for major in MAJORS:
    prompt = f"Provide a list of questions that a student who is {major} at University of Texas at Dallas  would create in number format"
    completion = openai.ChatCompletion.create(
        messages=[
            dict(
                role="system",
                content=(
                    "You are ChatGPT, a large language model trained by OpenAI. "
                    "Follow the user's instructions carefully. "
                    "Respond using markdown.\n"
                    "Knowledge cutoff: 2021-09-01\n"
                    "Current date: 2023-03-02"
                ),
            ),
            dict(role="user", content=prompt),
        ],
        model="gpt-3.5-turbo",
        top_p=1,
        temperature=1,
    )
    store_the_persona_question(completion, major=major, prompt=prompt)

In [None]:
# Parse the response from the base prompt cell into a list of question strings.
list_of_instructions = preprocess(response)

# Add the student's grade level as context: is the person a freshman, sophomore, junior, or senior?

In [None]:
from rich import print

# Grade + major persona: one request per (grade, major) pair.
# The sampling settings are named once so the values sent to the API and the
# values recorded in the CSV cannot drift apart.
temperature = 1
top_p = 1
for grade in [
    "undergraduate freshman",
    # No leading space here: the grade is stored verbatim in the CSV, so a
    # stray space would create a separate, mismatching grade value.
    "undergraduate sophomore",
    "undergraduate junior",
    "undergraduate senior",
    "master",
    "PHD",
]:
    for major in MAJORS:
        # The wording follows the other prompts in this notebook ("would
        # create in number format").
        prompt = f"Provide a list of questions that a {grade} {major} student at UTD would create in number format."
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are ChatGPT, a large language model trained by OpenAI. Follow the user's instructions carefully. Respond using markdown.\nKnowledge cutoff: 2021-09-01\nCurrent date: 2023-03-02"},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
            top_p=top_p,
        )
        store_the_persona_question(
            completion,
            grade=grade,
            major=major,
            prompt=prompt,
            temperature=temperature,
            top_p=top_p,
        )
        

# Add Expected Graduation Dates

In [None]:
# Disabled experiment: the whole cell is wrapped in a triple-quoted string, so
# none of the code below executes.
# If re-enabled, it would loop over expected graduation years 2023-2026 and
# query only undergraduate grades whose list index is greater than
# (year - 2023); "master" and "PHD" would never be queried.
# NOTE(review): the graduation year only gates which grades are queried -- it
# is never inserted into the prompt or passed to the storage call.
'''
for time_of_gradiation in range(2023 , 2027):
    for grade_number , grade in enumerate(["undergraduate freshman", " undergraduate sophomore", "undergraduate junior", "undergraduate senior" , "master", "PHD"]):
        if "undergraduate" in grade and grade_number > time_of_gradiation - 2023:
            for major in MAJORS:
                prompt = f"Provide a list of questions that a {grade} student who is {major} at University of Texas at Dallas  would create in number format"
                completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo", 
                messages = [
                        {"role": "system", "content" : "You are ChatGPT, a large language model trained by OpenAI. Follow the user's instructions carefully. Respond using markdown.\nKnowledge cutoff: 2021-09-01\nCurrent date: 2023-03-02"},
                        {"role": "user", "content" : prompt},
                ],
                temperature = 1,
                top_p = 1   
                )
                store_the_persona_question(completion, grade=grade, major=major , prompt=prompt , temperature = 1 , top_p = 1)
'''

# Add Student Interests

In [None]:
# Interest phrases for the persona prompts. Every phrase is given the
# "interested in " prefix in one step.
INTEREST = [
    f"interested in {topic}"
    for topic in (
        "doing research",
        "graduate school",
        "doing well in class",
        "graduate soon as possible",
    )
]
print(INTEREST)
# A final empty entry stands for a persona with no stated interest.
INTEREST.append("")
    

In [None]:
# Interest + grade + major persona: one request per combination.
# The sampling settings are named once so the values sent to the API and the
# values recorded in the CSV cannot drift apart.
temperature = 1
top_p = 1
for interest in INTEREST:
    for grade in [
        "undergraduate freshman",
        # No leading space here: the grade is stored verbatim in the CSV, so
        # a stray space would create a separate, mismatching grade value.
        "undergraduate sophomore",
        "undergraduate junior",
        "undergraduate senior",
        "master",
        "PHD",
    ]:
        for major in MAJORS:
            # Empty parts (the "no interest" persona) are skipped so the
            # prompt never contains a doubled space.
            persona = " ".join(
                part
                for part in (grade, major, "student at UTD", interest)
                if part
            )
            prompt = f"Provide a list of questions that a {persona} would create in number format."
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are ChatGPT, a large language model trained by OpenAI. Follow the user's instructions carefully. Respond using markdown.\nKnowledge cutoff: 2021-09-01\nCurrent date: 2023-03-02"},
                    {"role": "user", "content": prompt},
                ],
                temperature=temperature,
                top_p=top_p,
            )
            # The interest is stored as its own column so rows can be
            # grouped by it without parsing the prompt text.
            store_the_persona_question(
                completion,
                grade=grade,
                major=major,
                interest=interest,
                prompt=prompt,
                temperature=temperature,
                top_p=top_p,
            )

# Filter the Evaluation Dataset

In [18]:
def check_if_it_relevant_question():
    """Add a keyword-hit ``count`` column to evaluation.csv.

    For every row, ``count`` is the number of topic keywords that occur in the
    ``Question`` text as a case-sensitive substring. The keywords are a fixed
    list of campus terms plus the entries of ``MAJORS`` and ``INTEREST``,
    the grade names and the phrase "interested in". A count of 0 marks a
    question as not relevant.

    Returns:
        None. ``evaluation.csv`` is read from, and written back to, the
        current working directory.
    """
    import pandas as pd

    df = pd.read_csv('evaluation.csv')

    topics = ["UTD", "housing", "university", "Texas", "Dallas", "transfer", "meal plan", "GPA", "study", "abroad"]
    topics += MAJORS
    topics += INTEREST
    topics += ["undergraduate", "graduate", "freshman", "sophomore", "junior", "senior", "master", "PHD"]
    topics += ["interested in"]
    # INTEREST can contain an empty string, and "" is a substring of every
    # string: left in, it would add one to every count so that no question
    # could ever score 0.
    topics = [topic for topic in topics if topic]

    def count_topics(question):
        # Empty CSV cells are read back as NaN (a float), which cannot be
        # searched for substrings.
        if not isinstance(question, str):
            return 0
        return sum(topic in question for topic in topics)

    df['count'] = df['Question'].apply(count_topics)
    df.to_csv('evaluation.csv', index=False)
# Run the relevance count; this reads evaluation.csv and writes it back with the added column.
check_if_it_relevant_question()

KeyError: 'question'