In [None]:
import json
import os

from openai import OpenAI

import gpt as gpt
def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as JSON into ``filepath``, replacing any existing file.

    The output is indented with 4 spaces. The file is opened as UTF-8, and
    ``json.dump`` is left at its defaults, so non-ASCII characters are
    written as ``\\uXXXX`` escapes.

    Args:
        data: Any JSON-serializable object.
        filepath: Destination path; the parent directory must already exist.
    """
    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4)

# openai
# The API key lives in a local file named "openai_api_key" (kept out of the
# notebook itself). The context manager closes the handle right away, and
# strip() drops surrounding whitespace such as the trailing newline most
# editors append, which would otherwise be passed along as part of the key.
with open("openai_api_key", encoding="utf-8") as key_file:
    openai_api_key = key_file.read().strip()
openai_client = OpenAI(api_key=openai_api_key)
def messages_to_string(messages, pid):
    """Render a list of messages as numbered, speaker-labelled lines.

    Each message is prefixed with its position in ``messages`` followed by
    ". ". Speaker "1" is labelled "採訪者" (interviewer) and speaker "0" is
    labelled with ``pid``.

    Args:
        messages: Iterable of dicts with 'speaker' and 'content' keys.
        pid: Label used for the interviewee's lines.

    Returns:
        The concatenated text; an empty string for an empty ``messages``.
    """
    parts = []
    for index, message in enumerate(messages):
        parts.append(str(index) + ". ")
        # Normalise the speaker to str: another cell in this notebook compares
        # str(message['speaker']), so the value may be stored as int or str.
        speaker = str(message['speaker'])
        if speaker == "1":
            parts.append("採訪者: " + message['content'] + "\n")
        elif speaker == "0":
            parts.append(pid + ": " + message['content'] + "\n")
        # Any other speaker value contributes only the index prefix,
        # exactly as before.
    # Join once instead of growing a string inside the loop.
    return "".join(parts)


In [None]:
import glob
# reformat
# Wrap every raw chunk in a record carrying a "<participant>_<index>" id and
# write one reformatted file per participant.
# save_json only opens the target path, so make sure the directory exists.
os.makedirs("chunks_formatted", exist_ok=True)
for chunk_file in glob.glob("chunks/*.json"):
    # Take the id from the file name itself rather than stripping a
    # hard-coded "chunks/" prefix, which depends on the path separator.
    participant_id = os.path.splitext(os.path.basename(chunk_file))[0]
    # Context manager so the input handle is closed deterministically.
    with open(chunk_file, encoding="utf-8") as chunk_fp:
        chunks = json.load(chunk_fp)
    reformat_chunks = []
    for chunk_index, chunk in enumerate(chunks):
        reformat_chunks.append({
            "id": "{}_{}".format(participant_id, chunk_index),
            "conversation": chunk,
        })
    save_json(reformat_chunks, 'chunks_formatted/{}.json'.format(participant_id))

In [None]:
def toString(chunk):
    """Render a conversation chunk as "Interviewer: ..." / "Interviewee: ..." lines.

    Args:
        chunk: Iterable of dicts with 'speaker' and 'content' keys.

    Returns:
        One line per dialogue turn, each terminated by a newline. Speaker
        "1" is the interviewer; every other value is the interviewee.
    """
    lines = []
    for dialogue in chunk:
        # Compare on str(): another cell in this notebook normalises the
        # speaker with str(), so it may be stored as int or str. Without
        # this an int 1 would be labelled as the interviewee.
        if str(dialogue['speaker']) == '1':
            label = "Interviewer: "
        else:
            label = "Interviewee: "
        lines.append(label + dialogue['content'] + "\n")
    # Join once instead of growing a string inside the loop.
    return ''.join(lines)

def extract_keywords(paragraph):
    """Build the chat messages for a keyword-extraction request.

    Despite its name, this function does not call the model; it only returns
    the prompt, which the caller sends.

    Args:
        paragraph: The interviewee's monologue text, used as the user message.

    Returns:
        A two-element list: the system instruction and the user message.
    """
    # The JSON example uses single braces: the prompt is never passed through
    # str.format, so doubled "{{ }}" escapes would reach the model verbatim.
    messages = [
        {
            "role": "system",
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Reply with a list of keywords in Traditional Chinese in the following JSON format:
            {
                "keywords": ["keyword1", "keyword2", "keyword3", ...]
            }
            """
       },
       {
           "role": "user",
           "content": paragraph
       }
    ]
    return messages
def extract_title(conversation):
    """Build the chat messages asking for a short title of a conversation.

    Only the prompt is constructed here; sending it is left to the caller.

    Args:
        conversation: The Interviewer/Interviewee transcript text, used as
            the user message.

    Returns:
        A two-element list: the system instruction and the user message.
    """
    system_prompt = """You are a title analysis system. 
            You are given a conversation between two people: Interviewer and Interviewee. 
            Give a concise title for the conversation with no more than 10 words.
            Reply with Traditional Chinese.
        """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": conversation}
    return [system_message, user_message]


In [None]:
import glob
# For every reformatted interview, request keywords and a title for each
# chunk, then save the annotated chunks to chunks_formatted_w_kt/.
# save_json only opens the target path, so make sure the directory exists.
os.makedirs("chunks_formatted_w_kt", exist_ok=True)
for interview_file in glob.glob("chunks_formatted/*.json"):
    # Take the id from the file name itself rather than stripping a
    # hard-coded directory prefix, which depends on the path separator.
    pid = os.path.splitext(os.path.basename(interview_file))[0]
    # Context manager so the input handle is closed deterministically.
    with open(interview_file, encoding="utf-8") as interview_fp:
        interview_data = json.load(interview_fp)
    print(interview_file)
    keyword_prompts = []
    title_prompts = []
    for chunk in interview_data:
        # Keywords come from the interviewee's turns only (speaker "0").
        # str() keeps this check consistent with the JSON re-extraction cell.
        interviewee_messages = "\n".join([message['content'] for message in chunk['conversation'] if str(message['speaker']) == "0"])
        chunk_messages = toString(chunk['conversation'])
        keyword_prompts.append(extract_keywords(interviewee_messages))
        title_prompts.append(extract_title(chunk_messages))

    # Results are zipped back onto the chunks by position, so this presumably
    # relies on multithread_prompts preserving prompt order -- TODO confirm.
    chunk_keywords = gpt.multithread_prompts(openai_client, keyword_prompts)
    chunk_titles = gpt.multithread_prompts(openai_client, title_prompts)
    result = []
    for chunk, keywords, title in zip(interview_data, chunk_keywords, chunk_titles):
        chunk['raw_keywords'] = keywords
        chunk['title'] = title
        result.append(chunk)
    save_json(result, "chunks_formatted_w_kt/{}.json".format(pid))

In [None]:
def explain_prompt(keyword):
    """Build the chat messages asking for a definition of ``keyword``.

    Only the prompt is constructed here; sending it is left to the caller.

    Args:
        keyword: The word to be explained; it is inserted into the user
            message.

    Returns:
        A two-element list: the system instruction and the user question.
    """
    # NOTE(review): "an related topics" looks like a typo for "and related
    # topics"; the prompt text is kept verbatim here.
    instructions = """You are a dictionary system.
            The user wants to know the meaning of a specific word in the context of environment, ecology, an related topics.
            Reply in Traditional Chinese with the definition of the word.
        """
    question = "請解釋一下「{}」".format(keyword)
    return [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question},
    ]


In [None]:
# Re-run keyword extraction in JSON mode and overwrite each chunk's
# raw_keywords with the parsed keyword list, saving back to the same file.
done_file = []  # files processed so far, for inspection after the run
for interview_file in glob.glob("chunks_formatted_w_kt/*.json"):
    print(interview_file)
    # Close the input handle before the same path is rewritten below.
    with open(interview_file, encoding="utf-8") as interview_fp:
        interview_data = json.load(interview_fp)
    keyword_prompts = []
    for chunk in interview_data:
        # Keywords come from the interviewee's turns only (speaker "0").
        interviewee_messages = "\n".join([message['content'] for message in chunk['conversation'] if str(message['speaker']) == "0"])
        keyword_prompts.append(extract_keywords(interviewee_messages))
    # Results are zipped back onto the chunks by position, so this presumably
    # relies on multithread_prompts preserving prompt order -- TODO confirm.
    keywords_by_chunk = gpt.multithread_prompts(openai_client, keyword_prompts, format="json")
    for chunk, keywords in zip(interview_data, keywords_by_chunk):
        # Each response is expected to be a JSON object with a "keywords" list.
        keywords = json.loads(keywords)['keywords']
        print(keywords)
        # `chunk` is the same dict object held in interview_data, so no
        # separate index counter is needed.
        chunk['raw_keywords'] = keywords
    save_json(interview_data, interview_file)
    done_file.append(interview_file)

In [None]:
from collections import defaultdict
# Count in how many chunks each keyword appears across all interviews.
keyword_freq = defaultdict(int)
for interview_file in glob.glob("chunks_formatted_w_kt/*.json"):
    print(interview_file)
    # Context manager so the input handle is closed deterministically.
    with open(interview_file, encoding="utf-8") as interview_fp:
        interview_data = json.load(interview_fp)
    for chunk in interview_data:
        keywords = chunk['raw_keywords']
        for k in keywords:
            keyword_freq[k] += 1

# filter out keywords that only appear once
all_keywords = [k for k in keyword_freq.keys() if keyword_freq[k] > 1]

# collect statistics
keyword_statistics = [{"keyword": k, "frequency": keyword_freq[k]} for k in all_keywords]

# get explanations
explain_prompts = [explain_prompt(keyword) for keyword in all_keywords]

# get embeddings from explanations
# Results are zipped back onto the keywords by position, so this presumably
# relies on both gpt helpers preserving input order -- TODO confirm.
explanations = gpt.multithread_prompts(openai_client, explain_prompts)
embeddings = gpt.multithread_embeddings(openai_client, explanations)

# save
result = []
for keyword, explanation, embedding in zip(all_keywords, explanations, embeddings):
    result.append({
        "keyword": keyword,
        "explanation": explanation,
        # Store this keyword's own vector. The key name "embeddings" is kept
        # for compatibility with existing consumers of keywords.json.
        "embeddings": embedding
    })
# save_json only opens the target path, so make sure the directory exists.
os.makedirs("result", exist_ok=True)
save_json(result, "result/keywords.json")
save_json(keyword_statistics, "result/keyword_statistics.json")
