In [None]:
import json
import os

from openai import OpenAI

import gpt as gpt
def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as pretty-printed UTF-8 JSON at ``filepath``.

    Missing parent directories are created first, so nested output paths
    work even when the directory tree has not been prepared by hand.

    Args:
        data: Any object ``json.dump`` can serialize.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json.dump`` cannot serialize.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # a bare filename has no directory component to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as fp:
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json.dump(data, fp, indent=4, ensure_ascii=False)

# openai
# The key is read from a local file named "openai_api_key". A context manager
# closes the file handle, and strip() removes surrounding whitespace so that a
# trailing newline in the file is not passed along as part of the key.
with open("openai_api_key") as key_file:
    openai_api_key = key_file.read().strip()
openai_client = OpenAI(api_key=openai_api_key)

In [None]:
def toString(segment):
    """Render a list of dialogue turns as a speaker-labelled transcript.

    Each turn is a dict with ``'speaker'`` and ``'content'`` keys. A turn whose
    speaker is exactly ``'interviewer'`` is labelled ``Interviewer``; any other
    speaker value is labelled ``Interviewee``. Every turn is terminated by a
    newline, and an empty segment produces an empty string.
    """
    rendered_turns = []
    for turn in segment:
        label = "Interviewer: " if turn['speaker'] == 'interviewer' else "Interviewee: "
        rendered_turns.append(label + turn['content'] + "\n")
    return ''.join(rendered_turns)

def extract_keywords(paragraph):
    """Build the chat messages that ask a model to extract keywords.

    Despite the name, this function does not call any model: it only returns
    the prompt. The system message asks for a JSON object with a ``keywords``
    list, and ``paragraph`` is passed through unchanged as the user message.

    Args:
        paragraph: The monologue text to extract keywords from.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts
        (system message first, then the user message).
    """
    # This is a plain string, not an f-string / str.format template, so braces
    # must be written singly; doubled braces would reach the model verbatim and
    # make the example invalid JSON.
    messages = [
        {
            "role": "system",
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Reply with a list of keywords in the following JSON format:
            {
                "keywords": ["keyword1", "keyword2", "keyword3", ...]
            }
            """
       },
       {
           "role": "user",
           "content": paragraph
       }
    ]
    return messages
def extract_title(conversation):
    """Build the chat messages that ask for a short title of a conversation.

    ``conversation`` is passed through unchanged as the user message; the
    system message asks for a concise title of no more than 10 words.
    Only the message list is returned; no request is sent from here.
    """
    # Written line by line so the whitespace embedded in the prompt is visible.
    system_prompt = (
        "You are a title analysis system. \n"
        "            You are given a conversation between two people: Interviewer and Interviewee. \n"
        "            Give a concise title for the conversation with no more than 10 words.\n"
        "        "
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": conversation}
    return [system_message, user_message]


In [None]:
import glob
# reformat
# Stage 1: wrap every raw segment in a record with a "<participant>_<index>" id
# and write one file per participant to tmp/segments_formatted/.
# Output directories are created up front so the cell works when they do not
# exist yet.
os.makedirs("tmp/segments_formatted", exist_ok=True)
os.makedirs("result/documents", exist_ok=True)

# sorted() gives a deterministic processing order; glob returns matches in
# arbitrary order.
for segmented_file in sorted(glob.glob("segmented_transcripts_more/*.json")):
    # basename() rather than split("/") so that OS-specific path separators
    # are handled; the id is still everything before the first dot.
    participant_id = os.path.basename(segmented_file).split(".")[0]
    with open(segmented_file, encoding="utf-8") as fp:
        segments = json.load(fp)
    reformat_segments = [
        {
            "id": "{}_{}".format(participant_id, segment_index),
            "conversation": segment,
        }
        for segment_index, segment in enumerate(segments)
    ]
    save_json(reformat_segments, 'tmp/segments_formatted/{}.json'.format(participant_id))

# Stage 2: for every formatted interview, request keywords and a title per
# segment, attach them to the segment, and save per-participant documents plus
# one combined chunk list.
all_chunks = []
for interview_file in sorted(glob.glob("tmp/segments_formatted/*.json")):
    pid = os.path.basename(interview_file).split(".")[0]
    with open(interview_file, encoding="utf-8") as fp:
        interview_data = json.load(fp)
    print(interview_file)
    keyword_prompts = []
    title_prompts = []
    for segment in interview_data:
        # Keywords are extracted from the interviewee's turns only; the title
        # is derived from the full two-speaker transcript.
        interviewee_messages = "\n".join([message['content'] for message in segment['conversation'] if message['speaker'] == "interviewee"])
        segment_messages = toString(segment['conversation'])
        keyword_prompts.append(extract_keywords(interviewee_messages))
        title_prompts.append(extract_title(segment_messages))

    # NOTE(review): gpt.multithread_prompts is presumably returning one response
    # per prompt, in prompt order -- the zip() below relies on that; confirm
    # against the gpt module.
    segment_keywords = gpt.multithread_prompts(openai_client, keyword_prompts, temperature=0, response_format="json")
    segment_keywords = [json.loads(keyword)['keywords'] for keyword in segment_keywords]
    segment_titles = gpt.multithread_prompts(openai_client, title_prompts, temperature=1.0)
    result = []
    for segment, keywords, title in zip(interview_data, segment_keywords, segment_titles):
        segment['raw_keywords'] = keywords
        segment['title'] = title
        result.append(segment)
        all_chunks.append(segment)
    save_json(result, "result/documents/{}.json".format(pid))
save_json(all_chunks, "result/chunks.json")

In [None]:
# Rebuild the combined chunk list from the per-participant files in
# result/documents/ and rewrite result/chunks.json from it.
all_chunks = []
# sorted() keeps the chunk order stable; glob returns matches in arbitrary order.
for document in sorted(glob.glob("result/documents/*.json")):
    # Read as UTF-8 (the encoding save_json writes) and close the handle promptly.
    with open(document, encoding="utf-8") as fp:
        chunks = json.load(fp)
    all_chunks += chunks
save_json(all_chunks, "result/chunks.json")
len(all_chunks)


In [None]:
# Background text describing the project setting; explain_prompt embeds it in
# its system message as the context for keyword definitions.
project_background = (
    "Zaun is a heavily industrialized city characterized by unregulated technological development and alchemical innovation. \n"
    "Its rapid growth has resulted in significant environmental degradation, including severe air and water pollution. \n"
    'Emissions from factories and laboratories produce a persistent smog, known as the "Gray Wind," leading to widespread respiratory issues among residents. \n'
    "The city's waterways are heavily contaminated with industrial waste, impacting aquatic ecosystems and public health. \n"
    "These environmental challenges highlight the urgent need for sustainable practices and effective regulatory frameworks to balance Zaun's industrial progress with environmental and human well-being.\n"
)
# generating explanations for keywords; the explanations are used to generate embeddings for the keywords
def explain_prompt(keyword, background=None):
    """Build the chat messages that ask for a definition of ``keyword``.

    Only the prompt is built here; no request is sent.

    Args:
        keyword: The word to be explained; inserted into the user message.
        background: Context text embedded in the system message. When omitted
            (``None``), the module-level ``project_background`` is used.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts
        (system message first, then the user message).
    """
    if background is None:
        # Resolved at call time, so re-running the cell that defines
        # project_background takes effect without redefining this function.
        background = project_background
    messages = [
        {
            "role": "system",
            "content": f"""You are a dictionary system.
            The user wants to know the meaning of a specific word in the following context: {background}
            Reply with the definition of the word.
        """
        },
        {
            "role": "user",
            "content": "Please explain {}".format(keyword)
        }
    ]
    return messages


In [None]:
from collections import defaultdict
# Count in how many segments each raw keyword appears across all documents.
keyword_freq = defaultdict(int)
# sorted() fixes the file order, and with it the keyword order in the outputs
# below; glob returns matches in arbitrary order.
for interview_file in sorted(glob.glob("result/documents/*.json")):
    print(interview_file)
    # Read as UTF-8 (the encoding save_json writes) and close the handle promptly.
    with open(interview_file, encoding="utf-8") as fp:
        interview_data = json.load(fp)
    for segment in interview_data:
        keywords = segment['raw_keywords']
        for k in keywords:
            keyword_freq[k] += 1

# filter out keywords that only appear once
all_keywords = [k for k, freq in keyword_freq.items() if freq > 1]

print("Number of keywords: ", len(all_keywords))
# collect statistics
keyword_statistics = [{"keyword": k, "frequency": keyword_freq[k]} for k in all_keywords]

# get explanations
explain_prompts = [explain_prompt(keyword) for keyword in all_keywords]

# get embeddings from explanations
# NOTE(review): both gpt helpers are presumably returning one result per input,
# in input order -- the zip() below relies on that; confirm against the gpt module.
explanations = gpt.multithread_prompts(openai_client, explain_prompts)
embeddings = gpt.multithread_embeddings(openai_client, explanations)

# save
result = [
    {
        "keyword": keyword,
        "explanation": explanation,
        "embeddings": embedding
    }
    for keyword, explanation, embedding in zip(all_keywords, explanations, embeddings)
]
# Create the output directory first so the cell works when it does not exist yet.
os.makedirs("keyword", exist_ok=True)
save_json(result, "keyword/keywords.json")
save_json(keyword_statistics, "keyword/keyword_statistics.json")
