In [None]:
from openai import OpenAI
import glob
import tiktoken
import json
import requests
# Read the OpenAI API key from the local "api_key" file. The context manager
# closes the file handle, and strip() drops surrounding whitespace so that a
# trailing newline in the file does not end up inside the "Authorization"
# header that get_embedding builds from this value.
with open("api_key", "r") as key_file:
    api_key = key_file.read().strip()
client = OpenAI(api_key=api_key)

def request_gpt4(messages, model="gpt-3.5-turbo-1106", max_tokens=16385, max_retries=None):
    """Send a chat-completion request, trimming oversized user input first.

    The serialized conversation is measured with tiktoken; while it exceeds
    ``max_tokens``, the last 1000 characters are cut from the first user
    message that is longer than 1000 characters. The request is then sent
    and retried on any exception.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts. The list is
            copied before truncation, so the caller's list is not modified.
        model: Chat model name passed to the API.
        max_tokens: Token budget for the serialized conversation.
        max_retries: Maximum number of retries after a failed request.
            ``None`` (the default) keeps retrying indefinitely.

    Returns:
        The content of the first choice of the API response.

    Raises:
        Exception: The last API error, once ``max_retries`` is exhausted.
    """
    import time  # local import: only needed for the retry back-off

    enc = tiktoken.encoding_for_model("gpt-4-1106-preview")
    # enc = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")

    def count_tokens(conversation):
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) as real
        # characters; the default \uXXXX escapes would inflate the estimate
        # and cause needless truncation.
        return len(enc.encode(json.dumps(conversation, ensure_ascii=False)))

    # Shallow copy: truncation replaces list entries, never the caller's own.
    messages = list(messages)
    token_count = count_tokens(messages)
    print(token_count)
    while token_count > max_tokens:
        print("truncating...")
        # find the first user input that is still long enough to trim
        for index, message in enumerate(messages):
            if message['role'] == 'user' and len(message['content']) > 1000:
                messages[index] = {
                    "role": "user",
                    "content": message['content'][:-1000]
                }
                break
        else:
            # Nothing left to shorten: stop instead of looping forever and let
            # the API decide whether the request is acceptable.
            print("cannot truncate further")
            break
        token_count = count_tokens(messages)
        print(token_count)

    attempt = 0
    while True:
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
            )
        except Exception as e:
            attempt += 1
            print(e)
            if max_retries is not None and attempt > max_retries:
                raise
            print("retrying...")
            # Iterative retry with a capped exponential back-off, so repeated
            # failures neither hammer the API nor exhaust the recursion limit.
            time.sleep(min(2 ** attempt, 30))
            continue
        return response.choices[0].message.content

def save_json(data, filepath=r'new_data.json'):
    """Serialize ``data`` as JSON with a 4-space indent into ``filepath``.

    The target is opened in write mode with UTF-8 encoding, so any existing
    content is replaced.
    """
    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

def get_embedding(text, model="text-embedding-ada-002", max_tokens=8191, timeout=60):
    """Fetch the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    The text is shortened by 100 characters at a time until its tiktoken
    encoding for ``model`` has at most ``max_tokens`` tokens.

    Args:
        text: The string to embed.
        model: Embedding model name; also selects the tiktoken encoding.
        max_tokens: Maximum number of tokens sent to the API.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    enc = tiktoken.encoding_for_model(model)
    while len(enc.encode(text)) > max_tokens:
        text = text[:-100]
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': "Bearer {}".format(api_key)
    }
    data = {
        "input": text,
        "model": model
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Fail loudly on HTTP errors instead of hitting a KeyError on 'data' below.
    res.raise_for_status()
    res = res.json()
    return res['data'][0]['embedding']

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    from numpy import dot
    from numpy.linalg import norm

    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator


In [None]:
def extract_keywords(paragraph):
    """Ask the chat model for the keywords of a monologue.

    Args:
        paragraph: The monologue text, sent as the user message.

    Returns:
        The raw reply returned by request_gpt4. The system prompt asks for a
        list of keywords in Traditional Chinese; the reply is not parsed or
        validated here.
    """
    messages = [
        {
            "role": "system",
            # Prompt typo fixed: the last instruction used to read "Replay with".
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Reply with a list of keywords in Traditional Chinese.
            """
        },
        {
            "role": "user",
            "content": paragraph
        }
    ]
    keywords = request_gpt4(messages)
    return keywords

In [None]:
# Note: run time very long, consider running on kwon
# For every chunk-summary file, extract keywords for each chunk that does not
# have a 'keywords' entry yet and write the file back. Chunks that already
# carry keywords are skipped, so the cell can be re-run to resume.
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    # Context manager so the file handle is closed before the file is rewritten.
    with open(interview_file, encoding='utf-8') as fp:
        interview_data = json.load(fp)
    print(interview_file)
    try:
        for chunk in interview_data:
            if 'keywords' in chunk:
                continue
            # Only messages with speaker == 0 are used
            # (presumably the interviewee, going by the variable name).
            interviewee_messages = "\n".join(
                message['content'] for message in chunk['conversation'] if message['speaker'] == 0
            )
            keywords = extract_keywords(interviewee_messages)
            print(interviewee_messages)
            print(keywords)
            chunk['keywords'] = keywords
    finally:
        # Save even when the run is interrupted or a request fails, so the
        # keywords extracted so far for this file are not lost.
        save_json(interview_data, interview_file)

In [None]:
from collections import defaultdict
from pprint import pprint
import re
# Count, per topic, how often each extracted keyword occurs across all chunks
# of all chunk-summary files, then save the nested counts as JSON.
topic_keywords = defaultdict(lambda: defaultdict(int))
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    with open(interview_file, encoding='utf-8') as fp:
        interview_data = json.load(fp)
    for chunk in interview_data:
        keywords_str = chunk['keywords']
        # The model reply is free text: accept ASCII and full-width commas,
        # the ideographic comma and newlines as separators.
        keywords = re.split(r"[,,、\n]", keywords_str)
        topic = chunk['topic']
        for keyword in keywords:
            # Trim stray whitespace and skip empty fragments so they are not
            # counted as keywords of their own.
            keyword = keyword.strip()
            if not keyword:
                continue
            topic_keywords[topic][keyword] += 1
save_json(topic_keywords, "../data/result/topic_keywords.json")

In [None]:
from collections import defaultdict
import chinese_converter
# Topic labels; keywords equal to one of these are filtered out later on.
known_topics = ['整體經濟', '住屋', '公有土地', '貿易', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源']
# Load the stop-word list (one entry per line) and pass every entry through
# chinese_converter.to_traditional. The context manager closes the file handle.
with open('stopwords-master/all.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = stopwords_file.read().split('\n')
stopwords = [chinese_converter.to_traditional(word) for word in stopwords]

In [None]:
# reverse index: keyword -> topic
# Also accumulates each keyword's total frequency over all topics, then keeps
# the keywords that are neither stop words nor topic labels and occur at least
# twice, and saves that list.
keyword_topic_appearances = defaultdict(set)
keyword_frequency = defaultdict(int)
with open("../data/result/topic_keywords.json", encoding='utf-8') as fp:
    topic_keywords = json.load(fp)
for topic, keywords_freq in topic_keywords.items():
    for keyword, frequency in keywords_freq.items():
        keyword_topic_appearances[keyword].add(topic)
        keyword_frequency[keyword] += frequency
# sort by number of topics
# NOTE: from here on the name holds a list of (keyword, topics) pairs, which
# the keyword-statistics cell iterates over.
keyword_topic_appearances = sorted(keyword_topic_appearances.items(), key=lambda x: len(x[1]), reverse=True)
# Set membership instead of scanning the stop-word list for every keyword.
excluded_keywords = set(stopwords) | set(known_topics)
count = 0
keywords = []
for keyword, topics in keyword_topic_appearances:
    if keyword in excluded_keywords: continue
    # if len(topics) == 1: continue
    if keyword_frequency[keyword] < 2: continue
    print(keyword, keyword_frequency[keyword], topics)
    count += 1
    keywords.append(keyword)
print(keywords)
save_json(keywords, "../data/result/tmp/keywords.json")

In [None]:
from pprint import pprint 
# Pretty-print the topic -> {keyword: count} mapping for manual inspection.
pprint(topic_keywords)

In [68]:
# One-off cleanup: remove the "綠島" entry from the saved keyword coordinates
# and write the file back.
# NOTE(review): despite its name, keyword_embeddings holds the contents of
# keyword_coordinates.json here; a later cell rebinds the same name.
with open("../data/result/keyword_coordinates.json", encoding='utf-8') as fp:
    keyword_embeddings = json.load(fp)
if "綠島" in keyword_embeddings:
    print("!!!")
    del keyword_embeddings["綠島"]
save_json(keyword_embeddings, "../data/result/keyword_coordinates.json")

!!!


In [65]:
# Build per-keyword statistics (total frequency and frequency by topic) for
# the saved keyword list. Relies on keyword_topic_appearances,
# keyword_frequency and topic_keywords from the reverse-index cell.
with open("../data/result/tmp/keywords.json", encoding='utf-8') as fp:
    keywords = json.load(fp)
# Set for membership tests; `keywords` itself stays a list for later cells.
selected_keywords = set(keywords)
keyword_statistics = {}
for keyword, topics in keyword_topic_appearances:
    if keyword not in selected_keywords: continue
    topic_freq = {}
    for topic in topics:
        topic_freq[topic] = topic_keywords[topic][keyword]
    print(keyword, keyword_frequency[keyword], topic_freq)
    keyword_statistics[keyword] = {
        "frequency": keyword_frequency[keyword],
        "frequency_by_topic": topic_freq
    }
save_json(keyword_statistics, "../data/result/keyword_statistics.json")

環境 56 {'整體經濟': 1, '政府運作': 5, '交通': 3, '能源': 1, '其他': 4, '環境生態': 34, '災害': 3, '住屋': 1, '貿易': 1, '公有土地': 2, '醫療': 1}
民宿 41 {'整體經濟': 1, '政府運作': 5, '交通': 8, '能源': 1, '其他': 5, '環境生態': 11, '住屋': 6, '貿易': 2, '公有土地': 2}
政府 55 {'政府運作': 26, '交通': 11, '能源': 1, '其他': 2, '環境生態': 10, '災害': 1, '貿易': 1, '公有土地': 2, '醫療': 1}
離島 18 {'政府運作': 4, '交通': 1, '能源': 2, '其他': 1, '環境生態': 2, '災害': 1, '公有土地': 1, '醫療': 6}
遊客 41 {'政府運作': 2, '交通': 7, '能源': 3, '其他': 5, '環境生態': 18, '災害': 2, '住屋': 1, '醫療': 3}
土地 23 {'整體經濟': 1, '政府運作': 3, '交通': 2, '能源': 1, '其他': 3, '環境生態': 4, '住屋': 6, '公有土地': 3}
觀光 50 {'整體經濟': 1, '政府運作': 9, '交通': 13, '能源': 1, '其他': 3, '環境生態': 21, '貿易': 1, '醫療': 1}
臺灣 17 {'政府運作': 2, '交通': 1, '能源': 1, '其他': 4, '環境生態': 3, '住屋': 2, '公有土地': 1, '醫療': 3}
生態 48 {'政府運作': 2, '交通': 4, '能源': 3, '其他': 3, '環境生態': 30, '災害': 2, '住屋': 2, '醫療': 2}
溫泉 24 {'整體經濟': 2, '政府運作': 1, '交通': 3, '能源': 1, '其他': 5, '環境生態': 9, '災害': 2, '住屋': 1}
規劃 17 {'政府運作': 6, '交通': 4, '能源': 1, '其他': 1, '環境生態': 2, '災害': 1, '住屋': 2}
資源 19 {'政府運作': 3, '交

In [None]:
# Request one embedding per keyword (one API call each) and save the
# keyword -> embedding mapping as JSON.
keyword_embeddings = dict()
for keyword in keywords:
    keyword_embeddings[keyword] = keyword_embedding = get_embedding(keyword)
save_json(keyword_embeddings, "../data/result/keyword_embeddings.json")


In [None]:
import numpy as np
import scipy.spatial as spatial
def distance_matrix(embeddings):
    """Return the pairwise cosine-distance matrix of the given vectors.

    ``embeddings`` is converted to a NumPy array and compared against itself
    with scipy's ``cdist``.
    """
    vectors = np.array(embeddings)
    pairwise = spatial.distance.cdist(vectors, vectors, metric='cosine')
    return pairwise
# Pairwise cosine distances between all keyword embeddings, scaled so that the
# largest distance becomes 1.
D = distance_matrix(list(keyword_embeddings.values()))
import math
# Named max_distance so the built-in max() is not shadowed for later cells.
max_distance = D.max()
# Skip the scaling when every distance is 0 to avoid a 0/0 division.
if max_distance > 0:
    D /= max_distance
# Rank all keywords by their distance to the first keyword (row 0).
keyword_0_distances = D[0]
sorted_indices = sorted(range(len(keyword_0_distances)), key=lambda k: keyword_0_distances[k])
print(sorted_indices)

In [None]:
# Print the keywords ordered by their distance to the first keyword.
# The key list is built once rather than on every loop iteration.
keyword_names = list(keyword_embeddings.keys())
for i in sorted_indices:
    print(keyword_names[i])

In [None]:
from minisom import MiniSom    
# Train a 30x30 self-organising map on the keyword embeddings.
data = np.array(list(keyword_embeddings.values()))
# Take the input length from the data instead of hard-coding 1536, and fix
# the random seed so that re-running the cell gives the same map.
som = MiniSom(30, 30, data.shape[1], sigma=0.1, learning_rate=0.1, random_seed=42)
som.train(data, 2000)

In [None]:
# Print the winning SOM cell for every keyword embedding.
# (The former leading `som.winner(data[2])` call discarded its result and
# could only fail on short data, so it was removed.)
for keyword, embedding in keyword_embeddings.items():
    print(keyword, som.winner(embedding))

In [None]:
def remove_stop_words(text, stopwords_path='stopwords-master/all.txt'):
    """Delete every stop word occurring in ``text``.

    Stop words are read from ``stopwords_path`` (one per line), passed through
    ``chinese_converter.to_traditional`` and removed from ``text`` by plain
    substring replacement, in file order.

    Args:
        text: The string to clean.
        stopwords_path: Path of the UTF-8 stop-word file.

    Returns:
        ``text`` with all stop-word occurrences removed.
    """
    # Context manager so the file handle is closed once the list is read.
    with open(stopwords_path, 'r', encoding='utf-8') as stopwords_file:
        stopwords = stopwords_file.read().split('\n')
    stopwords = [chinese_converter.to_traditional(word) for word in stopwords]
    for word in stopwords:
        # Blank lines yield empty entries; replacing "" would change nothing.
        if word:
            text = text.replace(word, "")
    return text
def explain_keyword(keyword):
    """Ask the chat model to explain a Chinese phrase.

    ``keyword`` is sent as the user message; the system prompt requests an
    explanation in Traditional Chinese. Returns the reply from request_gpt4.
    """
    system_prompt = """You are a Chinese phrase explainer. You explain the meaning of a Chinese phrase.
                The user will give you a Chinese phrase, please explain the meaning of the phrase in Traditional Chinese.
                Reply with Traditional Chinese.
              """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=keyword),
    ]
    explanation = request_gpt4(conversation)
    return explanation
# Experiment: for three sample keywords, compare the pairwise cosine similarity
# of the embeddings of their generated explanations with that of the
# embeddings of the bare keywords.
sample_keywords = ("民宿", "溫泉", "貨櫃")
explanation_1, explanation_2, explanation_3 = (
    explain_keyword(sample) for sample in sample_keywords
)
for label, explanation in zip(sample_keywords, (explanation_1, explanation_2, explanation_3)):
    print(label, explanation)

# Each pair is (embedding of the explanation, embedding of the keyword itself).
embedding_1, embedding_1_1 = get_embedding(explanation_1), get_embedding("民宿")
embedding_2, embedding_2_2 = get_embedding(explanation_2), get_embedding("溫泉")
embedding_3, embedding_3_3 = get_embedding(explanation_3), get_embedding("貨櫃")

comparisons = (
    ((embedding_1, embedding_1_1), (embedding_2, embedding_2_2)),
    ((embedding_1, embedding_1_1), (embedding_3, embedding_3_3)),
    ((embedding_2, embedding_2_2), (embedding_3, embedding_3_3)),
)
for (left_explained, left_plain), (right_explained, right_plain) in comparisons:
    print(cosine_similarity(left_explained, right_explained), cosine_similarity(left_plain, right_plain))