In [1]:
import os
import time
from openai import OpenAI
import datetime
import pandas as pd
import re
import logging

In [2]:
# The OpenAI API key is read from the OPENAI_API_KEY environment variable so that
# no credential is ever written into the notebook. Fail early with a clear message
# when it is missing instead of failing later inside the first API call.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in your environment "
        "before running this notebook."
    )

# Chat model used for every completion request
model = "gpt-3.5-turbo"
# Sampling temperature: lower values give more conservative output, higher values
# give more varied output (the API accepts values from 0 to 2).
temperature = 0.9
output_file = 'results.txt'

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Configure logging: progress messages go to status.log with a timestamp and level
logging.basicConfig(filename='status.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
# Functions to handle string manipulation

# Clean up the purpose field of files df
def remove_numbers_and_txt(input_string):
    """Return ``input_string`` with all digit runs and every '.txt' removed.

    Digits are stripped first, then '.txt' is removed from what remains, so a
    '.txt' that only appears once the digits are gone is removed as well.
    """
    return re.sub(r'\.txt', '', re.sub(r'\d+', '', input_string))

# Splits responses into individual questions
def split_text_into_qa(text):
    """Pair every '### Assistant:' line with the most recent '### Human:' line.

    Args:
        text: Raw model output, one statement per line.

    Returns:
        A list of strings of the form "<question line>, <answer line>", in the
        order the answers appear. Lines with any other prefix are ignored.
    """
    qa_pairs = []
    # No question has been seen yet; without this initial value an answer that
    # comes first would raise UnboundLocalError.
    question = None
    for line in text.split("\n"):
        if line.startswith("### Human:"):
            question = line
        elif line.startswith("### Assistant:"):
            if question is None:
                # An answer with no preceding question cannot form a pair.
                continue
            qa_pairs.append(f"{question}, {line}")
    return qa_pairs

# Split a string into smaller strings of at most max_length characters each.
def split_string(string, max_length=10000):
    """Cut ``string`` into consecutive pieces of at most ``max_length`` characters.

    The pieces are returned in order; an empty string yields an empty list.
    """
    pieces = []
    for start in range(0, len(string), max_length):
        pieces.append(string[start:start + max_length])
    return pieces

In [4]:
# This function assesses the data files at the given folder path
# It's a little goofy because I made it based on the arbitrary file names I made
def enumerate_files(folder_path):
    """Walk ``folder_path`` and describe every file in a DataFrame.

    File names are expected to look like ``Mod<N>-<Purpose>[-<CodeType>-Code<k>].txt``
    (for example ``Mod3-Lab-Python-Code1.txt``). From each name this derives:

    * ``Module``: the first dash-separated part with its first three characters dropped.
    * ``Purpose``: the second part with digits and '.txt' removed.
    * ``Is Code File``: True when the name contains "Code", ends with ".txt" and
      the purpose contains "Lab".
    * ``Code Type``: the third part, for code files only.

    Files that do not follow the convention (no "-" in the name) are still
    listed, with ``Module``, ``Purpose`` and ``Code Type`` set to None, instead
    of aborting the whole walk with an IndexError.

    Args:
        folder_path: Directory to walk recursively.

    Returns:
        A pandas DataFrame with the columns "Full Path", "File Name",
        "Is Code File", "Code Type", "Module" and "Purpose"; one row per file.
    """
    files_data = []
    for root, _dirs, files in os.walk(folder_path):
        for file_name in files:
            full_path = os.path.join(root, file_name)
            split_name = file_name.split("-")

            if len(split_name) < 2:
                # Name does not follow the "<Mod>-<Purpose>" convention.
                module_number = None
                purpose = None
            else:
                module_number = split_name[0][3:]
                purpose = remove_numbers_and_txt(split_name[1])

            is_code_file = (
                purpose is not None
                and "Code" in file_name
                and file_name.endswith(".txt")
                and "Lab" in purpose
            )
            # Only read the code-type part when the name actually has one.
            code_type = split_name[2] if is_code_file and len(split_name) > 2 else None

            files_data.append({
                "Full Path": full_path,
                "File Name": file_name,
                "Is Code File": is_code_file,
                "Code Type": code_type,
                "Module": module_number,
                "Purpose": purpose,
            })
    return pd.DataFrame(files_data)

In [5]:
# File Functions

# This reads in specific system required files
def grab_fileinfo(dict, key, data_folder):
    """Read a file whose name is stored in a lookup table and return its text.

    The file is located at ``<current working directory>/<data_folder>/<dict[key]>``
    and is opened in text mode with the platform's default encoding.
    """
    target = os.path.join(os.getcwd(), data_folder, dict[key])
    with open(target, 'r') as handle:
        return handle.read()

# This reads one data file; it returns the text as a single string, or as a list of chunks when the text is longer than 8000 characters
def read_file_into_string(file_path):
    """Load a UTF-8 text file, chunking it when it is long.

    Returns the whole text as a ``str`` when it has at most 8000 characters;
    otherwise returns the list produced by ``split_string`` with its default
    chunk size.
    """
    with open(file_path, "r", encoding="utf8") as handle:
        text = handle.read()
    if len(text) <= 8000:
        return text
    # NOTE(review): the 8000 threshold differs from split_string's default
    # chunk size (10000), so texts of 8001-10000 characters come back as a
    # one-element list. Confirm whether the two limits are meant to match.
    return split_string(text)

In [6]:
# OpenAI API messaging

# This creates the message format required for a gpt-3.5 chat completion request
def create_message(role, input):
    """Build a fresh two-message conversation.

    ``role`` becomes the content of the system message and ``input`` the
    content of the first user message. A new list is returned on every call.
    """
    return [
        {"role": "system", "content": role},
        {"role": "user", "content": input},
    ]

def with_hist(message_list, as_resp, repeat_prompt):
    """Extend a conversation with the model's reply and a follow-up prompt.

    Args:
        message_list: Conversation to extend; modified in place.
        as_resp: Text the model returned for the previous request.
        repeat_prompt: Follow-up user prompt for the next request.

    Returns:
        None. ``message_list`` gains two entries.
    """
    # The model's own reply must carry the "assistant" role; labelling it
    # "user" (as before) misrepresents the history as two user turns in a row.
    message_list.append({"role": "assistant", "content": as_resp})
    message_list.append({"role": "user", "content": repeat_prompt})

# This gets the response from the API
def grab_respons(message_list):
    """Send ``message_list`` to the chat completions endpoint and return the reply text.

    Uses the module-level ``client``, ``model`` and ``temperature`` and returns
    the message content of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=message_list,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [7]:
def one_msg(msg_list, iterations, df_res, prompt):
    """Query the model ``iterations`` times on one conversation, collecting Q&A pairs.

    On every round the reply is requested with ``grab_respons``, the reply and
    ``prompt`` are added to ``msg_list`` through ``with_hist``, and the pairs
    parsed from the reply by ``split_text_into_qa`` are added to ``df_res``.
    Both ``msg_list`` and ``df_res`` are modified in place; nothing is returned.
    """
    for round_no in range(iterations):
        reply = grab_respons(msg_list)
        with_hist(msg_list, reply, prompt)
        df_res.extend(split_text_into_qa(reply))
        logging.info(f"File Iteration: {round_no}")

# This does a sweep of the run
def full_sweep(messagingDict, df):
    """Generate Q&A pairs for every file listed in ``df`` and append them to disk.

    Args:
        messagingDict: Settings read by key: 'CreatePrompt' (system prompt),
            'RepeatPrompt' (follow-up prompt), 'CodeIter' and 'OtherIter'
            (number of rounds for code files and for all other files).
        df: DataFrame with at least the columns 'Is Code File' and 'Full Path',
            one row per file.

    Returns:
        None. After each file, its pairs are appended, one per line, to
        current_questions.txt inside the 2-Derived_Data folder.
    """
    # Build the path with os.path.join: the previous literal
    # '2-Derived_Data\current_questions.txt' relied on the invalid escape "\c"
    # and only named a file inside the folder on Windows.
    output_dir = '2-Derived_Data'
    output_path = os.path.join(output_dir, 'current_questions.txt')

    for _, row in df.iterrows():
        iterations = messagingDict['CodeIter'] if row['Is Code File'] else messagingDict['OtherIter']

        texts = read_file_into_string(row['Full Path'])
        multipart = not isinstance(texts, str)
        # Treat a short file as a single chunk so both cases share one loop.
        chunks = texts if multipart else [texts]

        df_res = []
        for text in chunks:
            if multipart:
                logging.info(f"Multipart file: {row['Full Path']}")
            # Every chunk starts its own conversation.
            message_list = create_message(messagingDict['CreatePrompt'], text)
            one_msg(message_list, iterations, df_res, messagingDict['RepeatPrompt'])

        # Log status updates
        logging.info(f"Processed file: {row['Full Path']}")

        # Persist after every file so finished work survives a later failure.
        # The folder is created on demand; open(..., 'a') does not create it.
        os.makedirs(output_dir, exist_ok=True)
        with open(output_path, 'a') as f:
            for qa_pair in df_res:
                f.write(f"{qa_pair}\n")

In [8]:
# This is all system metadata
# Folder that holds the prompt templates
prompts_folder = '1-TA_Prompts'

# Folder that holds the course data files
data_folder = "1-TA_Data"

# Prompt name -> file inside prompts_folder
filenames = dict(
    CreatePrompt='create_prompt.txt',
    RepeatPrompt='repeat_prompt.txt',
)

# All run settings live in this one dictionary, starting with the prompt texts
messagingDict = {}
for k in filenames:
    messagingDict[k] = grab_fileinfo(filenames, k, prompts_folder)

# Question generation criteria: number of rounds per code file / per other file
# ('Batch' is hard coded in the prompt right now; 'LabIter' is not implemented)
messagingDict.update(CodeIter=2, OtherIter=2)


In [9]:
# Inventory the data folder, then generate Q&A pairs for every file found.
# full_sweep sends requests to the OpenAI API for each file and appends the
# resulting pairs to disk as it goes; progress is written through logging.
df = enumerate_files(data_folder)
full_sweep(messagingDict, df)

In [10]:
# Preview the first 20 rows of the file inventory returned by enumerate_files
df.head(20)

Unnamed: 0,Full Path,File Name,Is Code File,Code Type,Module,Purpose
0,TA_Data\Mod0-Overview.txt,Mod0-Overview.txt,False,,0,Overview
1,TA_Data\Mod1-Lab1.txt,Mod1-Lab1.txt,False,,1,Lab
2,TA_Data\Mod1-Presentations.txt,Mod1-Presentations.txt,False,,1,Presentations
3,TA_Data\Mod2-Lab1.txt,Mod2-Lab1.txt,False,,2,Lab
4,TA_Data\Mod2-Presentations1.txt,Mod2-Presentations1.txt,False,,2,Presentations
5,TA_Data\Mod2-Presentations2.txt,Mod2-Presentations2.txt,False,,2,Presentations
6,TA_Data\Mod3-Lab-Python-Code1.txt,Mod3-Lab-Python-Code1.txt,True,Python,3,Lab
7,TA_Data\Mod3-Lab-Python-Code2.txt,Mod3-Lab-Python-Code2.txt,True,Python,3,Lab
8,TA_Data\Mod3-Lab1.txt,Mod3-Lab1.txt,False,,3,Lab
9,TA_Data\Mod3-Lab2.txt,Mod3-Lab2.txt,False,,3,Lab


In [12]:
def read_dataset(file_path):
    """Return the lines of a cp1252-encoded text file, line endings included."""
    with open(file_path, 'r', encoding="cp1252") as handle:
        return handle.readlines()

def show_set(showset):
    """Print each element of ``showset`` on its own line, in order."""
    for entry in showset:
        print(entry)



# NOTE(review): full_sweep appends to current_questions.txt inside 2-Derived_Data,
# while this reads current_questions.txt from the working directory. Confirm the
# file is copied there (or adjust the path) before relying on this cell.
dataset = read_dataset('current_questions.txt')

# Each stored line has the form "### Human: ..., ### Assistant: ...". Put the
# question and the answer on separate lines and trim surrounding whitespace,
# including the trailing newline kept by readlines().
# The earlier version also applied d[:-1] after strip(); the newline was already
# gone by then, so that slice deleted the last real character of every answer
# (typically its final period).
dataset = [d.replace(", ###", "\n###").strip() for d in dataset]

show_set(dataset[0:5])

### Human: What are the key software languages taught in ADS 500B course?
### Assistant: The key software languages taught in ADS 500B course are Unix, SQL, R, and Python
### Human: What is the focus of this course?
### Assistant: This course is focused on practical aspects of exploratory data analysis, analytics, and basic machine learning techniques
### Human: What types of questions are included in the quizzes of this course?
### Assistant: The quizzes in this course consist of multiple choice and true and false questions
### Human: What should students leverage to help them through the assignments and quizzes?
### Assistant: Students should leverage the supplemental Unix material in Blackboard to help them through the assignments and quizzes
### Human: What is recommended for students to install in preparation for Module 2?
### Assistant: It is recommended for students to install the full graphical user interface version of Anaconda in preparation for Module 2
