In [2]:
from openai import OpenAI
import glob
import tiktoken
import json
import requests
api_key = open("api_key", "r").read()
client = OpenAI(api_key=api_key)

def request_gpt4(messages):
    enc = tiktoken.encoding_for_model("gpt-4-1106-preview")
    # enc = tiktoken.encoding_for_model("gpt-3.5-turbo-1106")
    text = json.dumps(messages)
    print(len(enc.encode(text)))
    kept_index = 0
    while len(enc.encode(text)) > 16385:
    # while len(enc.encode(text)) > 128000:
        print("truncating...")
        # find the first user input
        for index, message in enumerate(messages):
            if message['role'] == 'user' and len(message['content']) > 1000:
                messages[index] = {
                    "role": "user",
                    "content": message['content'][:-1000]
                }
                break
        text = json.dumps(messages)
        print(len(enc.encode(text)))
    try:
        response = client.chat.completions.create(
            # model="gpt-4-1106-preview",
            model="gpt-3.5-turbo-1106",
            messages=messages,
            temperature=0
        )
    except Exception as e:
        print(e)
        print("retrying...")
        return request_gpt4(messages)
    return response.choices[0].message.content

def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, indent=4)

def get_embedding(text, model="text-embedding-ada-002"):
    enc = tiktoken.encoding_for_model(model)
    while len(enc.encode(text)) > 8191:
        text = text[:-100]
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': "Bearer {}".format(api_key)
    }
    data = {
        "input": text,
        "model": model
    }
    res = requests.post(url, headers=headers, json=data)
    res = res.json()
    return res['data'][0]['embedding']

def cosine_similarity(a, b):
    from numpy import dot
    from numpy.linalg import norm
    return dot(a, b)/(norm(a)*norm(b))


In [None]:
def extract_keywords(paragraph):
    messages = [
        {
            "role": "system",
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Replay with a list of keywords in Traditional Chinese.
            """
       },
       {
           "role": "user",
           "content": paragraph
       }
    ]
    keywords = request_gpt4(messages)
    return keywords

In [None]:
# Note: run time very long, consider running on kwon
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    interview_data = json.load(open(interview_file))
    print(interview_file)
    for chunk in interview_data:
        if 'keywords' in chunk:
            continue
        interviewee_messages = "\n".join([message['content'] for message in chunk['conversation'] if message['speaker'] == 0])
        keywords = extract_keywords(interviewee_messages)
        print(interviewee_messages)
        print(keywords)
        chunk['keywords'] = keywords
    save_json(interview_data, interview_file)

In [17]:
from collections import defaultdict
from pprint import pprint
import re
topic_keywords = defaultdict(lambda: defaultdict(int))
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    interview_data = json.load(open(interview_file))
    for chunk in interview_data:
        keywords_str = chunk['raw_keywords']
        keywords = re.split(", |、", keywords_str)
        keywords = [keyword.replace("\"", "").replace('[', "") for keyword in keywords]
        topic = chunk['topic']
        for keyword in keywords:
            topic_keywords[topic][keyword] += 1
save_json(topic_keywords, "../data/result/topic_keywords.json")

In [13]:
from collections import defaultdict
import chinese_converter
known_topics = ['整體經濟', '住屋', '公有土地', '貿易', '政府運作', '災害', '醫療', '環境生態', '其他', '交通', '能源']
stopwords = open('stopwords-master/all.txt', 'r', encoding='utf-8').read().split('\n')
stopwords.append("綠島")
# stopwords.append('["綠島"')
stopwords = [chinese_converter.to_traditional(word) for word in stopwords]

In [14]:
import string
# reverse index: keyword -> topic
keyword_topic_appearances = defaultdict(set)
keyword_frequency = defaultdict(int)
topic_keywords = json.load(open("../data/result/topic_keywords.json"))
for topic, keywords_freq in topic_keywords.items():
    keywords = list(keywords_freq.keys())
    for keyword in keywords:
        keyword_topic_appearances[keyword].add(topic)
        keyword_frequency[keyword] += keywords_freq[keyword]
# sort by number of topics
keyword_topic_appearances = sorted(keyword_topic_appearances.items(), key=lambda x: len(x[1]), reverse=True)
count = 0
keywords = []
for keyword, topics in keyword_topic_appearances:
    keyword = keyword.translate(str.maketrans('', '', string.punctuation))
    if keyword in stopwords: continue
    if keyword in known_topics: continue
    # if len(topics) == 1: continue
    if keyword_frequency[keyword] < 2: continue
    print(keyword, keyword_frequency[keyword], topics)
    count += 1
    keywords.append(keyword)
print(keywords)
save_json(keywords, "../data/result/tmp/keywords.json")

觀光 41 {'環境生態', '整體經濟', '能源', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
民宿 36 {'環境生態', '整體經濟', '能源', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
遊客 44 {'環境生態', '能源', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
土地 22 {'環境生態', '能源', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
臺東 38 {'環境生態', '能源', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
潛水 38 {'環境生態', '整體經濟', '公有土地', '住屋', '災害', '醫療', '政府運作', '交通'}
政府 38 {'環境生態', '能源', '公有土地', '住屋', '災害', '政府運作', '交通'}
工作 21 {'環境生態', '能源', '公有土地', '住屋', '醫療', '政府運作', '交通'}
生態 32 {'環境生態', '能源', '公有土地', '災害', '醫療', '政府運作', '交通'}
規劃 12 {'環境生態', '能源', '公有土地', '住屋', '醫療', '政府運作', '交通'}
環境 24 {'環境生態', '整體經濟', '公有土地', '住屋', '災害', '政府運作', '交通'}
蘭嶼 17 {'環境生態', '能源', '公有土地', '住屋', '災害', '醫療', '交通'}
總體經濟 13 {'環境生態', '整體經濟', '能源', '公有土地', '住屋', '醫療', '交通'}
開發 16 {'環境生態', '公有土地', '住屋', '災害', '政府運作', '交通'}
居民 27 {'環境生態', '公有土地', '住屋', '醫療', '政府運作', '交通'}
縣政府 22 {'環境生態', '整體經濟', '公有土地', '住屋', '政府運作', '交通'}
投資 9 {'環境生態', '整體經濟', '能源', '公有土地', '住屋', '災害'}
賺錢 13 {'環境生態', '公有土地', '住屋', '災

In [18]:
import json
import glob
import re
candidate_keywords = json.load(open("../data/result/tmp/keywords.json"))
for interview_file in glob.glob("../data/result/chunk_summaries/*.json"):
    interview_data = json.load(open(interview_file))
    for index, chunk in enumerate(interview_data):
        keywords_str = chunk['raw_keywords']
        keywords = re.split(", |、", keywords_str)
        keywords = [keyword.replace("\"", "").replace('[', "") for keyword in keywords]
        keywords = [keyword for keyword in keywords if keyword in candidate_keywords]
        chunk['keywords'] = keywords
        interview_data[index] = chunk
    save_json(interview_data, interview_file)

In [68]:
keyword_embeddings = json.load(open("../data/result/keyword_coordinates.json"))
if "綠島" in keyword_embeddings:
    print("!!!")
    del keyword_embeddings["綠島"]
save_json(keyword_embeddings, "../data/result/keyword_coordinates.json")

!!!


In [19]:
keywords = json.load(open("../data/result/tmp/keywords.json"))
keyword_statistics = {}
for keyword, topics in keyword_topic_appearances:
    if keyword not in keywords: continue
    topic_freq = {}
    for topic in topics:
        topic_freq[topic] = topic_keywords[topic][keyword]
    print(keyword, keyword_frequency[keyword], topic_freq)
    keyword_statistics[keyword] = {
        "frequency": keyword_frequency[keyword],
        "frequency_by_topic": topic_freq
    }
save_json(keyword_statistics, "../data/result/keyword_statistics.json")

觀光 41 {'環境生態': 21, '整體經濟': 1, '能源': 1, '公有土地': 1, '住屋': 1, '災害': 1, '醫療': 2, '政府運作': 7, '交通': 6}
民宿 36 {'環境生態': 11, '整體經濟': 1, '能源': 1, '公有土地': 2, '住屋': 7, '災害': 1, '醫療': 2, '政府運作': 3, '交通': 8}
遊客 44 {'環境生態': 23, '能源': 3, '公有土地': 1, '住屋': 3, '災害': 2, '醫療': 3, '政府運作': 2, '交通': 7}
土地 22 {'環境生態': 4, '能源': 1, '公有土地': 2, '住屋': 5, '災害': 1, '醫療': 1, '政府運作': 5, '交通': 3}
臺東 38 {'環境生態': 14, '能源': 2, '公有土地': 3, '住屋': 4, '災害': 3, '醫療': 5, '政府運作': 3, '交通': 4}
潛水 38 {'環境生態': 20, '整體經濟': 1, '公有土地': 1, '住屋': 6, '災害': 2, '醫療': 1, '政府運作': 1, '交通': 6}
政府 38 {'環境生態': 17, '能源': 1, '公有土地': 1, '住屋': 2, '災害': 1, '政府運作': 11, '交通': 5}
工作 21 {'環境生態': 10, '能源': 1, '公有土地': 2, '住屋': 5, '醫療': 1, '政府運作': 1, '交通': 1}
生態 32 {'環境生態': 24, '能源': 1, '公有土地': 1, '災害': 1, '醫療': 1, '政府運作': 2, '交通': 2}
規劃 12 {'環境生態': 5, '能源': 1, '公有土地': 1, '住屋': 1, '醫療': 1, '政府運作': 2, '交通': 1}
環境 24 {'環境生態': 11, '整體經濟': 1, '公有土地': 2, '住屋': 1, '災害': 1, '政府運作': 5, '交通': 3}
蘭嶼 17 {'環境生態': 7, '能源': 1, '公有土地': 2, '住屋': 2, '災害': 1, '醫療': 1, '交通': 3}


In [21]:
keyword_embeddings = {}
for index, keyword in enumerate(keywords):
    print("{}/{}".format(index, len(keywords)))
    keyword_embedding = get_embedding(keyword)
    keyword_embeddings[keyword] = keyword_embedding
save_json(keyword_embeddings, "../data/result/keyword_embeddings.json")


0/1021
1/1021
2/1021
3/1021
4/1021
5/1021
6/1021
7/1021
8/1021
9/1021
10/1021
11/1021
12/1021
13/1021
14/1021
15/1021
16/1021
17/1021
18/1021
19/1021
20/1021
21/1021
22/1021
23/1021
24/1021
25/1021
26/1021
27/1021
28/1021
29/1021
30/1021
31/1021
32/1021
33/1021
34/1021
35/1021
36/1021
37/1021
38/1021
39/1021
40/1021
41/1021
42/1021
43/1021
44/1021
45/1021
46/1021
47/1021
48/1021
49/1021
50/1021
51/1021
52/1021
53/1021
54/1021
55/1021
56/1021
57/1021
58/1021
59/1021
60/1021
61/1021
62/1021
63/1021
64/1021
65/1021
66/1021
67/1021
68/1021
69/1021
70/1021
71/1021
72/1021
73/1021
74/1021
75/1021
76/1021
77/1021
78/1021
79/1021
80/1021
81/1021
82/1021
83/1021
84/1021
85/1021
86/1021
87/1021
88/1021
89/1021
90/1021
91/1021
92/1021
93/1021
94/1021
95/1021
96/1021
97/1021
98/1021
99/1021
100/1021
101/1021
102/1021
103/1021
104/1021
105/1021
106/1021
107/1021
108/1021
109/1021
110/1021
111/1021
112/1021
113/1021
114/1021
115/1021
116/1021
117/1021
118/1021
119/1021
120/1021
121/1021
122/1021
123

In [None]:
import numpy as np
import scipy.spatial as spatial
def distance_matrix(embeddings):
    embeddings = np.array(embeddings)
    return spatial.distance.cdist(embeddings, embeddings, metric='cosine')
D = distance_matrix(list(keyword_embeddings.values()))
import math
max = D.max()
D /= max
keyword_0_distances = D[0]
sorted_indices = sorted(range(len(keyword_0_distances)), key=lambda k: keyword_0_distances[k])
print(sorted_indices)

In [None]:
for i in sorted_indices:
    print(list(keyword_embeddings.keys())[i])

In [None]:
from minisom import MiniSom    
data = np.array(list(keyword_embeddings.values()))
som = MiniSom(30, 30, 1536, sigma=0.1, learning_rate=0.1) 
som.train(data, 2000) 

In [None]:
som.winner(data[2])
for keyword, embedding in keyword_embeddings.items():
    print(keyword, som.winner(embedding))

In [None]:
def remove_stop_words(text):
    stopwords = open('stopwords-master/all.txt', 'r', encoding='utf-8').read().split('\n')
    stopwords = [chinese_converter.to_traditional(word) for word in stopwords]
    for word in stopwords:
        text = text.replace(word, "")
    return text
def explain_keyword(keyword):
    messages = [
        {
            "role": "system",
            "content": """You are a Chinese phrase explainer. You explain the meaning of a Chinese phrase.
                The user will give you a Chinese phrase, please explain the meaning of the phrase in Traditional Chinese.
                Reply with Traditional Chinese.
              """
        },
        {
            "role": "user",
            "content": keyword
        }
    ]
    return request_gpt4(messages)
explanation_1 = explain_keyword("民宿")
explanation_2 = explain_keyword("溫泉")
explanation_3 = explain_keyword("貨櫃")
print("民宿", explanation_1)
print("溫泉", explanation_2)
print("貨櫃", explanation_3)
embedding_1 = get_embedding(explanation_1)
embedding_1_1 = get_embedding("民宿")
embedding_2 = get_embedding(explanation_2)
embedding_2_2 = get_embedding("溫泉")
embedding_3 = get_embedding(explanation_3)
embedding_3_3 = get_embedding("貨櫃")
print(cosine_similarity(embedding_1, embedding_2), cosine_similarity(embedding_1_1, embedding_2_2))
print(cosine_similarity(embedding_1, embedding_3), cosine_similarity(embedding_1_1, embedding_3_3))
print(cosine_similarity(embedding_2, embedding_3), cosine_similarity(embedding_2_2, embedding_3_3))