In [None]:
from openai import OpenAI
import glob
import tiktoken
import json
# Read the OpenAI API key from the local "api_key" file.
# The context manager closes the file handle promptly, and strip() drops the
# trailing newline most editors append, which would otherwise become part of
# the credential sent with every request.
with open("api_key", "r") as api_key_file:
    api_key = api_key_file.read().strip()
client = OpenAI(api_key=api_key)

def request_gpt4(messages, token_limit=16385, max_retries=5):
    """Send a chat conversation to the OpenAI chat-completions API and return the reply text.

    NOTE: despite the function name, the request is currently sent to
    "gpt-3.5-turbo-1106" (the gpt-4 model line was commented out in the
    original code). The default `token_limit` of 16385 presumably matches that
    model's context window -- confirm before switching models.

    Before sending, the JSON-serialized conversation is tokenized. While it is
    longer than `token_limit` tokens, the first user message whose content is
    longer than 1000 characters loses its last 1000 characters.

    Args:
        messages: list of ``{"role": ..., "content": ...}`` dicts. The list is
            not modified; truncation happens on a shallow copy.
        token_limit: maximum number of tokens allowed for the serialized
            conversation before truncation stops.
        max_retries: how many times a failed API call is retried (with
            exponential backoff) before the error is re-raised.

    Returns:
        The content string of the first choice in the API response.

    Raises:
        Exception: the last API error, once `max_retries` retries have failed.
    """
    import time  # local import keeps this cell independent of the notebook's import cell

    enc = tiktoken.encoding_for_model("gpt-4-1106-preview")
    # enc = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")

    # Work on a shallow copy so the caller's list is left untouched; truncation
    # replaces list entries rather than mutating the message dicts themselves.
    messages = list(messages)

    def count_tokens():
        return len(enc.encode(json.dumps(messages)))

    token_count = count_tokens()
    print(token_count)
    while token_count > token_limit:
        print("truncating...")
        # Shorten the first user message that is still long enough to cut.
        for index, message in enumerate(messages):
            if message['role'] == 'user' and len(message['content']) > 1000:
                messages[index] = {
                    "role": "user",
                    "content": message['content'][:-1000]
                }
                break
        else:
            # Nothing can be shortened any further; looping again would never
            # terminate, so send the conversation as it is.
            print("no user message left to truncate; sending as is")
            break
        token_count = count_tokens()
        print(token_count)

    attempt = 0
    while True:
        try:
            response = client.chat.completions.create(
                # model="gpt-4-1106-preview",
                model="gpt-3.5-turbo-1106",
                messages=messages,
            )
        except Exception as e:
            print(e)
            if attempt >= max_retries:
                # Permanent failures (bad key, oversized request, ...) must
                # surface instead of being retried forever.
                raise
            attempt += 1
            print("retrying...")
            time.sleep(min(2 ** attempt, 30))
        else:
            return response.choices[0].message.content

def save_json(data, filepath=r'new_data.json'):
    """Write `data` to `filepath` as JSON indented by four spaces.

    The target file is opened in text mode with UTF-8 encoding and is
    overwritten if it already exists.
    """
    with open(filepath, mode='w', encoding='utf-8') as json_file:
        json.dump(obj=data, fp=json_file, indent=4)



In [None]:
def extract_keywords(paragraph):
    """Ask the chat model to extract keywords from a monologue.

    Builds a two-message conversation (a system prompt describing the keyword
    extraction task, followed by `paragraph` as the user message) and sends it
    through `request_gpt4`.

    Args:
        paragraph: the monologue text to extract keywords from.

    Returns:
        Whatever `request_gpt4` returns for the conversation, i.e. the model's
        raw reply. The prompt asks for a list of keywords in Traditional
        Chinese but does not pin down a separator format.
    """
    messages = [
        {
            "role": "system",
            # "Reply" was misspelled as "Replay" in the original prompt.
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Reply with a list of keywords in Traditional Chinese.
            """
        },
        {
            "role": "user",
            "content": paragraph
        }
    ]
    keywords = request_gpt4(messages)
    return keywords

In [None]:
# Extract keywords for every chunk that does not have them yet and write the
# result back into the same JSON file.
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    # Context manager closes the handle promptly; UTF-8 matches what save_json writes.
    with open(interview_file, encoding="utf-8") as interview_fp:
        interview_data = json.load(interview_fp)
    print(interview_file)
    try:
        for chunk in interview_data:
            # Already processed in an earlier run: skip to avoid another API call.
            if 'keywords' in chunk:
                continue
            # Only speaker 0 is kept (treated as the interviewee, per the variable name).
            interviewee_messages = "\n".join(
                message['content']
                for message in chunk['conversation']
                if message['speaker'] == 0
            )
            keywords = extract_keywords(interviewee_messages)
            print(interviewee_messages)
            print(keywords)
            chunk['keywords'] = keywords
    finally:
        # Save even when a chunk fails or the run is interrupted, so keywords
        # already obtained for this file are not lost.
        save_json(interview_data, interview_file)

In [None]:
from collections import defaultdict
from pprint import pprint
import re
# topic -> keyword -> number of times the keyword occurs in chunks of that topic
topic_keywords = defaultdict(lambda: defaultdict(int))
# The model's reply format is not fixed by the prompt, so accept the common
# list separators: ", " (comma + whitespace), full-width "，", "、" and newlines.
keyword_separator = re.compile(r",\s+|[，、\n]")
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    # Context manager closes the handle promptly; UTF-8 matches what save_json writes.
    with open(interview_file, encoding="utf-8") as interview_fp:
        interview_data = json.load(interview_fp)
    for chunk in interview_data:
        keywords_str = chunk['keywords']
        keywords = [keyword.strip() for keyword in keyword_separator.split(keywords_str)]
        topic = chunk['topic']
        for keyword in keywords:
            # Skip empty pieces (empty reply, trailing separator) so "" is never counted.
            if not keyword:
                continue
            topic_keywords[topic][keyword] += 1
save_json(topic_keywords, "../data/result/topic_keywords.json")

In [27]:
# Topic labels; the reverse-index cell skips keywords that are identical to one of these.
known_topics = [
    '整體經濟',
    '住屋',
    '公有土地',
    '貿易',
    '政府運作',
    '災害',
    '醫療',
    '環境生態',
    '其他',
    '交通',
    '能源',
]

In [None]:
# reverse index: keyword -> set of topics it appears under, plus its total count
keyword_topic_appearances = defaultdict(set)
keyword_frequency = defaultdict(int)
# Context manager closes the handle promptly; UTF-8 matches what save_json writes.
with open("../data/result/topic_keywords.json", encoding="utf-8") as topic_keywords_fp:
    topic_keywords = json.load(topic_keywords_fp)
for topic, keywords_freq in topic_keywords.items():
    for keyword, frequency in keywords_freq.items():
        keyword_topic_appearances[keyword].add(topic)
        keyword_frequency[keyword] += frequency
# sort by number of topics (most widespread keywords first);
# note this rebinds the name from a dict to a list of (keyword, topics) pairs
keyword_topic_appearances = sorted(keyword_topic_appearances.items(), key=lambda x: len(x[1]), reverse=True)
for keyword, topics in keyword_topic_appearances:
    # Skip keywords that are topic labels themselves, that occur under a single
    # topic only, or that were seen fewer than twice overall.
    if keyword in known_topics: continue
    if len(topics) == 1: continue
    if keyword_frequency[keyword] < 2: continue
    print(keyword, keyword_frequency[keyword], topics)

環境 56 {'整體經濟', '住屋', '公有土地', '貿易', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
綠島 206 {'整體經濟', '住屋', '公有土地', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
民宿 41 {'整體經濟', '公有土地', '住屋', '貿易', '政府運作', '環境生態', '其他', '交通', '能源'}
政府 55 {'公有土地', '貿易', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
離島 18 {'公有土地', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
遊客 41 {'住屋', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
土地 23 {'整體經濟', '公有土地', '住屋', '政府運作', '環境生態', '其他', '交通', '能源'}
觀光 50 {'整體經濟', '貿易', '政府運作', '醫療', '環境生態', '其他', '交通', '能源'}
臺灣 17 {'住屋', '公有土地', '政府運作', '醫療', '環境生態', '其他', '交通', '能源'}
生態 48 {'住屋', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
問題 19 {'整體經濟', '公有土地', '住屋', '政府運作', '環境生態', '其他', '交通', '能源'}
溫泉 24 {'整體經濟', '住屋', '政府運作', '災害', '環境生態', '其他', '交通', '能源'}
規劃 17 {'住屋', '政府運作', '災害', '環境生態', '其他', '交通', '能源'}
資源 19 {'政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
夏天 9 {'整體經濟', '政府運作', '災害', '環境生態', '其他', '交通', '能源'}
颱風 24 {'政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源'}
潛水 44 