In [1]:
import pandas as pd
import numpy as np
from openai import OpenAI
import json
import ast

In [2]:
# Load the raw questionnaires, discard rows that have no order id, and keep
# only the first row seen for each order; the index is renumbered from 0.
followup_df = (
    pd.read_csv('followup_ques_uncategorized.csv')
    .dropna(subset=['order_id'])
    .drop_duplicates(subset=['order_id'], keep='first')
    .reset_index(drop=True)
)
followup_df

Unnamed: 0,order_id,additional_questionnaire
0,5ff575580438d7001357b80e,"{""1: Follow Up Questions "": {""1: Your current ..."
1,5ffc729d0d564400125bcc31,"{""1: Follow Up Questions "": {""1: Your current ..."
2,5ff893a5dd2f9600139ab166,"{""1: Follow Up Questions "": {""1: GENERAL: In o..."
3,5ff8945fdd2f9600139ab16a,"{""1: Follow Up Questions "": {""1: Your current ..."
4,5ff7cd94dd2f9600139ab137,"{""1: Follow Up Questions "": {""1: Just a note: ..."
...,...,...
987,63ab370268746a00137ab45e,"{""1: Follow Up Questions"": {""1: Your current L..."
988,63c57f42d28912001b92f8fb,"{""2: General Questions"": {""1: What are you typ..."
989,63c71397a837f6001a206708,"{""1: Follow Up Questions"": {""1: Is the resume ..."
990,63c57a469cac930013a54719,{}


In [6]:
# Prompt prefix for the categorization request. The questionnaire JSON text is
# concatenated directly after the final line of this string when the user
# message is built, so the prompt must end right where that JSON should start.
# The five keys in the schema (personal_info, work_experience, education,
# skills, other_experience) are the same keys the later column-splitting cell
# looks up in each model reply.
categorize_followup_questions = """Given a JSON object containing various follow-up questions and answers related to personal information, work experience, education, skills, and other experiences (questions about awards, certifications, volunteering experiences, personal projects, and professional affiliations that are not work experiences), your task is to categorize these questions into separate lists for each of the 5 categories. 
For work experience, education, and other experiences, rewrite each question to include the job or degree or experience name in the question itself, eliminating the nested JSON format. Please note that "other experiences" refer to questions about awards, certifications, volunteering experiences, personal projects, and professional affiliations only. You will be rewarded for not putting any other question in the "other experience" section apart from the ones mentioned above. Questions related to career goals should strictly be placed under personal information. You will be rewarded for correctly categorizing each question and for not omitting any questions/answer pairs.

Finally, return a JSON object with keys for personal information, work experience, education, skills, and other experiences, each containing a list of the categorized questions along with answers (as shown in the schema below). 
{
  "personal_info": [string of correctly categorized question+answer],
  "work_experience": [string of correctly categorized question+answer],
  "education": [string of correctly categorized question+answer],
  "skills": [string of correctly categorized question+answer],
  "other_experience":[string of correctly categorized question+answer]
}

Here is the JSON object containing the follow-up questions and answers:
"""

In [7]:
def _get_model_output(followup_value):
    """Ask GPT-4o to sort one order's follow-up Q&A JSON into the five categories.

    Parameters
    ----------
    followup_value : str
        Raw JSON text of the order's follow-up questionnaire. It is appended
        verbatim to the ``categorize_followup_questions`` prompt. A non-string
        value (for example the float NaN pandas produces for an empty CSV
        cell) is sent as an empty object ``"{}"`` instead of raising
        ``TypeError`` on string concatenation.

    Returns
    -------
    str
        The message content of the first completion choice. JSON mode is
        requested, but callers should still be prepared for text that does
        not parse as JSON.

    Notes
    -----
    The API key is not written in the notebook: when no ``api_key`` argument
    is given, the OpenAI client reads it from the ``OPENAI_API_KEY``
    environment variable and raises if it is missing.
    """
    # No api_key argument on purpose -- the client falls back to the
    # OPENAI_API_KEY environment variable, keeping the secret out of the file.
    client = OpenAI()

    # Empty questionnaires appear as "{}" elsewhere in the data, so treat a
    # missing (non-string) cell the same way rather than crashing mid-run.
    if not isinstance(followup_value, str):
        followup_value = "{}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You specialize in categorizing question-answer pairs accurately to 5 categories, namely, personal information, work experience, education, skills, and other experiences."},
            {"role": "user", "content": categorize_followup_questions + followup_value},
        ],
        # JSON mode: ask the API for a syntactically valid JSON object.
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content

In [8]:
# Categorize every order's questionnaire with GPT-4o: one API call per row,
# replies collected in row order so they can be attached as a column later.
# The loop variable is deliberately not called `id`, which would shadow the
# builtin id() for every later cell of the notebook.
res = []
for row_idx, followup_ques in followup_df['additional_questionnaire'].items():
    response = _get_model_output(followup_ques)
    print (row_idx, response)
    res.append(response)


0 {
  "personal_info": [
    "1: Your current LinkedIn URL is not customized. Customized URLs are important because they look more professional and make your profile more accessible to recruiters and hiring managers. The LinkedIn URL Linkedin.com/in/quinngawronski/ is available. I am going to add that URL to your resume. We recommend you update your URL on your LinkedIn account to this. Does this make sense? : Yes"
  ],
  "work_experience": [
    "Social Media and Marketing Intern – Outpost Club: What key networks did your social media strategy span across? : Do you mean what platforms? My strategy was for Facebook, Instagram and email marketing",
    "Social Media and Marketing Intern – Outpost Club: How frequently were you developing marketing email campaigns and newsletters? Do you have any data to show an increase in open rates or something similar?  : As needed — there are some weeks where we send out three and weeks where there are none. Last year our open rate increased by 6.2%,

In [9]:
# Attach the raw model replies as a new column. This relies on `res` holding
# exactly one entry per row of followup_df, in the same row order.
followup_df['gpt4o_response'] = res

In [10]:
# Display the frame to spot-check the newly added gpt4o_response column.
followup_df

Unnamed: 0,order_id,additional_questionnaire,gpt4o_response
0,5ff575580438d7001357b80e,"{""1: Follow Up Questions "": {""1: Your current ...","{\n ""personal_info"": [\n ""1: Your current ..."
1,5ffc729d0d564400125bcc31,"{""1: Follow Up Questions "": {""1: Your current ...","{\n ""personal_info"": [\n ""Your current Lin..."
2,5ff893a5dd2f9600139ab166,"{""1: Follow Up Questions "": {""1: GENERAL: In o...","{\n ""personal_info"": [\n ""GENERAL: In opti..."
3,5ff8945fdd2f9600139ab16a,"{""1: Follow Up Questions "": {""1: Your current ...","{\n ""personal_info"": [\n ""1: Your current ..."
4,5ff7cd94dd2f9600139ab137,"{""1: Follow Up Questions "": {""1: Just a note: ...","{\n ""personal_info"": [\n ""Is there anythin..."
...,...,...,...
987,63ab370268746a00137ab45e,"{""1: Follow Up Questions"": {""1: Your current L...","{\n ""personal_info"": [\n ""Your current Lin..."
988,63c57f42d28912001b92f8fb,"{""2: General Questions"": {""1: What are you typ...","{\n ""personal_info"": [\n ""What are you typ..."
989,63c71397a837f6001a206708,"{""1: Follow Up Questions"": {""1: Is the resume ...","{\n ""personal_info"": [\n ""Is the resume yo..."
990,63c57a469cac930013a54719,{},"{\n ""personal_info"": [],\n ""work_experience""..."


In [37]:
### split the 5 categories into 5 individual columns
def _followup_explode(row, section_name):
    try: 
        json_row = json.loads(row)

        if section_name in json_row.keys():
            return json_row[section_name]
        else:  # some resume's don't have certain fields.
            return {}
    except: ## when gpt did not return json 
        print ("CORRUPT FOLLOWUP CATEGORIZATION")
        return "CORRUPT FOLLOWUP CATEGORIZATION"

    
# Work on a copy so followup_df keeps only the raw model reply, then add one
# column per category. Keys are the new column names, values the JSON keys
# looked up in each reply; insertion order fixes the resulting column order.
df_explode = followup_df.copy()
section_columns = {
    'personalinfo': "personal_info",
    'workexp': "work_experience",
    'education': "education",
    'otherexp': "other_experience",
    'skills': "skills",
}
for column_name, section_key in section_columns.items():
    # Each pass re-parses every reply and extracts a single category from it.
    df_explode[column_name] = df_explode['gpt4o_response'].apply(
        _followup_explode, args=(section_key,)
    )

df_explode.columns

CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION
CORRUPT FOLLOWUP CATEGORIZATION


Index(['order_id', 'additional_questionnaire', 'gpt4o_response',
       'personalinfo', 'workexp', 'education', 'otherexp', 'skills'],
      dtype='object')

In [42]:
# Drop the rows whose model reply could not be parsed (6 resumes in this run).
# Testing the personalinfo column is enough: a reply that fails to parse gets
# the same marker in every category column.
is_parsed = df_explode['personalinfo'] != 'CORRUPT FOLLOWUP CATEGORIZATION'
df_explode = df_explode.loc[is_parsed].reset_index(drop=True)

# Persist the categorized questionnaires for downstream use.
df_explode.to_csv('followup_q&a_categorized.csv')