In [None]:
from openai import OpenAI
# Read the OpenAI key from the first line of the local "api_key" file.
# The handle is closed deterministically and surrounding whitespace is
# stripped: readline() keeps the trailing newline, which would otherwise be
# embedded in the "Authorization: Bearer ..." header built in get_embedding.
with open("api_key") as api_key_file:
    api_key = api_key_file.readline().strip()
client = OpenAI(api_key=api_key)
import json
import requests
import tiktoken
def save_json(data, filepath=r'new_data.json'):
    """Serialize `data` as JSON (4-space indent) into `filepath`.

    The target file is opened in text mode with UTF-8 encoding and is
    overwritten if it already exists.
    """
    with open(filepath, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(data, indent=4))

def request_gpt4(messages, response_format=None):
    """Send `messages` to the "gpt-4-1106-preview" chat model and return the reply text.

    Args:
        messages: list of chat message dicts, passed through unchanged.
        response_format: when equal to "json", the request asks for a
            `json_object` response; any other value sends a plain request.

    Returns:
        The `content` of the first choice's message.
    """
    request_kwargs = {
        "model": "gpt-4-1106-preview",
        "messages": messages,
    }
    if response_format == "json":
        request_kwargs["response_format"] = {"type": "json_object"}
    completion = client.chat.completions.create(**request_kwargs)
    return completion.choices[0].message.content
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector of `text` from the OpenAI embeddings endpoint.

    Args:
        text: the string to embed. It is shortened from the end, 100
            characters at a time, until it encodes to at most 8191 tokens.
        model: embedding model name; also used to pick the tiktoken encoding.

    Returns:
        The `embedding` field of the first item in the response's `data` list.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if the API does not answer within the timeout.
    """
    max_tokens = 8191  # presumably the model's input-token limit -- TODO confirm
    request_timeout_seconds = 60
    enc = tiktoken.encoding_for_model(model)
    while len(enc.encode(text)) > max_tokens:
        text = text[:-100]
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': "Bearer {}".format(api_key)
    }
    data = {
        "input": text,
        "model": model
    }
    # Without a timeout a stalled connection would hang the notebook forever.
    res = requests.post(url, headers=headers, json=data, timeout=request_timeout_seconds)
    # Fail with the HTTP status instead of a KeyError on 'data' when the API
    # returns an error payload (bad key, rate limit, ...).
    res.raise_for_status()
    return res.json()['data'][0]['embedding']

In [None]:
import glob
import json
from pprint import pprint
def get_division_dict():
    """Load the participant -> division index mapping from 'division_index.txt'.

    Each non-blank line must hold two whitespace-separated fields:
    a participant identifier and an index. Blank lines are skipped.

    Returns:
        dict mapping "N" + participant to the index, kept as a string with
        surrounding whitespace (including the line's newline) removed.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    res = {}
    with open('division_index.txt') as division_file:
        for division in division_file:
            if not division.strip():
                continue
            participant, index = division.split()
            res["N" + participant] = index
    return res
# Load the participant -> division index lookup once at cell execution time;
# the transcript-splitting loop in this cell reads it per participant.
division_dict = get_division_dict()

def divide_background_topics(transcript, division_index):
    """Split a transcript at the `division_index`-th interviewer question.

    The split point is located by position among the interviewer ('採訪者')
    messages rather than by comparing message text, so an earlier message
    with identical content cannot move the split.

    Args:
        transcript: list of message dicts with a 'speaker' key.
        division_index: index into the interviewer messages only.

    Returns:
        (background_messages, topic_messages): the messages before the
        chosen question, and the messages from that question onward.

    Raises:
        IndexError: if there is no interviewer message at `division_index`.
    """
    interviewer_positions = [
        position
        for position, message in enumerate(transcript)
        if message['speaker'] == '採訪者'
    ]
    split_at = interviewer_positions[division_index]
    background_messages = transcript[:split_at]
    topic_messages = transcript[split_at:]
    return background_messages, topic_messages

# Split every raw transcript into a background part and a topics part and
# write each part to its own JSON file under ../data/result/transcripts/.
for transcript_file in glob.glob('../data/raw/transcript/json/*.json'):
    with open(transcript_file) as transcript_fp:
        transcript = json.load(transcript_fp)
    # Participant key = file name without directory, without everything after
    # the first dot, and without the "_done" marker.
    participant = transcript_file.split('/')[-1].split('.')[0].replace("_done", "")
    background_messages, topic_messages = divide_background_topics(transcript, int(division_dict[participant]))
    save_json(background_messages, f'../data/result/transcripts/{participant}_background.json')
    save_json(topic_messages, f'../data/result/transcripts/{participant}_topics.json')

In [None]:
def infer_questions(answer):
    """Ask the chat model which interviewer question most likely led to `answer`.

    The system prompt (in Traditional Chinese) tells the model it is an
    interview recorder that must infer the missing question and reply with a
    question starting with “問題：”. `answer` is sent as the user message.

    Returns:
        Whatever `chatgpt(messages)` returns.
    """
    # Earlier English draft of the system message, kept for reference:
    # {
    #     "role": "system",
    #     "content": """You are an question inferer, you infer what questions are being asked based on the answers.
    #     You will be given an answer, please infer what questions are being asked.
    #     Reply in Traditional Chinese.
    #     """
    # },
    system_prompt = """
            你是一個訪談的記錄者，現在有一個受訪者的回答，但是沒有訪談者問的問題。你需要根據這個訪談的回答推斷出訪談者問的問題。
            請用問句回答，並以“問題：”開頭。
            """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{answer}"},
    ]
    # NOTE(review): `chatgpt` is not defined in the visible part of this
    # notebook (only `request_gpt4` is) -- confirm where it comes from.
    return chatgpt(messages)
import glob
# For every interview's chunk file, infer the question behind each chunk from
# the speaker-0 messages and print it next to the chunk index.
# NOTE(review): this cell globs 'result/chunks/' while a later cell uses
# '../data/result/chunks/' -- confirm which location is current.
interviews = glob.glob('result/chunks/*.json')
all_important_questions = []
for interview in interviews:
    with open(interview, 'r') as chunks_fp:
        chunks = json.load(chunks_fp)
    for index, chunk in enumerate(chunks):
        answers = [message['content'] for message in chunk if message['speaker'] == 0]
        question = infer_questions("\n\n\n".join(answers))
        print(index, question)
    print("====================================")

In [None]:
import glob
import json
from pprint import pprint
def cluster_question():
    """Print the speaker-1 messages that mention one of the candidate keywords.

    For every JSON file in ../data/result/chunks/ the file path is printed,
    then each speaker-1 message containing any of '10個', '十個' or '議題'
    together with its running index among that file's speaker-1 messages,
    then a separator line. Nothing is returned; despite the name, no
    clustering is performed here.
    """
    interviews = glob.glob('../data/result/chunks/*.json')
    candidates = ['10個', '十個', '議題']
    for interview in interviews:
        with open(interview, 'r') as chunks_fp:
            chunks = json.load(chunks_fp)
        print(interview)
        # Counts every speaker-1 message of the file, matching or not.
        index = 0
        for chunk in chunks:
            chunk_questions = [message['content'] for message in chunk if message['speaker'] == 1]
            for question in chunk_questions:
                if any(candidate in question for candidate in candidates):
                    print(index, question)
                index += 1
        print("===================================")
cluster_question()


In [None]:
from pprint import pprint
import glob
# Embed the interviewee side of every summarized chunk and store the result,
# keyed by chunk id, in one JSON file.
# interview_files = glob.glob("result/chunks/*_N10.json")
interview_files = glob.glob("../data/result/chunk_summaries/*.json")
chunk_embeddings = {}
for chunks_file in interview_files:
    with open(chunks_file, 'r') as chunks_fp:
        chunks = json.load(chunks_fp)
    for chunk in chunks:
        chunk_id = chunk['id']
        answers = [message['content'] for message in chunk['conversation'] if message['speaker'] == 'Interviewee']
        if not answers:
            # Chunks without interviewee text get the embedding of a fixed
            # placeholder sentence.
            embedding = get_embedding("No content")
        else:
            embedding = get_embedding(" ".join(answers))
        chunk_embeddings[chunk_id] = {
            "id": chunk_id,
            "conversation": chunk['conversation'],
            "embedding": embedding,
        }
save_json(chunk_embeddings, "../data/result/chunk_embeddings/all.json")

In [9]:
from pprint import pprint
import glob
# Collect the distinct keywords of all topics, embed each one and persist the
# keyword -> embedding mapping.
keyword_set = set()
with open("../data/result/topic_keywords.json") as topic_keywords_fp:
    topic_keywords = json.load(topic_keywords_fp)
for keywords in topic_keywords.values():
    keyword_set.update(keywords)
print(keyword_set)
keyword_embeddings = {}
for index, keyword in enumerate(keyword_set):
    print("{}/{}".format(index, len(keyword_set)))
    # The embedding must be computed per keyword here; with this call disabled
    # the loop stored whatever `embedding` was left over from another cell
    # (or raised NameError on a fresh kernel).
    embedding = get_embedding(keyword)
    keyword_embeddings[keyword] = embedding
# NOTE(review): this issues one API request per keyword and overwrites the
# output file on every run -- confirm that re-enabling the save is wanted.
save_json(keyword_embeddings, "../data/result/keyword_embeddings.json")

{'台東縣政府', '開挖', '山壁', '故鄉', '計劃', '精神錯亂', '小學', '30幾', '政策', '環境改變', '漁夫]', '槍枝', '油污', '大修', '醫生壓力', '當地店家', '誤會', '藏]', '農會', '死傷', '誤傳', '緊急', '島民心態', '醫師', '燈塔]', '老宅', '乾貨', '概念', '颱風', '全島', '減塑', '車輛', '喝', '蔡英文', '海溫計', '食魚教育', '差別', '過山蝦', '水電', '小隊長', '火警', '舞者', '讀賣', '家鄉', '厭煩', '床', '延繩釣', '摩托車店', '動物管理處', '人情', '多元能源', '電子產品', '脫貧', '不守法', '溪流', '清潔隊', '表演場所', '喬事情', '心機', '維護', '風險', '體力活', '填海', '中巴', '封港', '黃金期', '圖片介紹', '橡皮筋', '柴口', '成果', '網美', '墾丁', '花蓮航港局', '山豬老師', '乾旱', '觀察者', '小店家', '地方', '地理環境', '研究站', '克服', '總額管制', '供電穩定', '慢熟型', '漁業違規', '人力短缺]', '捕撈方式', '王陽明', '曝曬', '導航', '耐心', '船隻提升', '保護', '基地臺', '簡單]', '季風', '流馬溝', '用電', '救難單位', '海巡雷達', '存提款', '免稅店', '遊船', '職業', '地殼', '收入', '工地', '新研]', '可笑', '媒體', '學會', '政治犯', '流量管制', '加油站', '除油裝備', '建議]', '手排', '觀光景點', '海廢牆', '態度', 'GT大叔', '日常用品', '翅膀', '沖水', '圖卡', '有趣', '質量', '花生湯', '正門口', '礁石落差', '珊瑚礁協會', '阿曼', '吸管', '農作物', '公單位', '資產', '消費能力', '器具', '五專', '違反漁業', '蘭嶼樹騎', '汰舊換新', '長城]', '感冒', '打魚種田', '公告', '公所', '管理權', '

In [None]:
import jieba
import chinese_converter
from collections import defaultdict
# Filled on first use by _load_stopwords(); re-running this cell resets it.
_STOPWORDS_CACHE = None


def _load_stopwords():
    """Return the stopword set, reading and converting the list only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        # Read stopwords from stopwords-master/all.txt, one per line.
        with open('stopwords-master/all.txt', 'r', encoding='utf-8') as stopwords_fp:
            raw_stopwords = stopwords_fp.read().split('\n')
        raw_stopwords += ['说', '做', '讲', '东西', '真的', '事情', '是因为', '这件', '…']
        # Every entry is passed through to_traditional before matching.
        _STOPWORDS_CACHE = frozenset(
            chinese_converter.to_traditional(stopword) for stopword in raw_stopwords
        )
    return _STOPWORDS_CACHE


def remove_stopwords(text):
    """Segment `text` with jieba and drop every token that is a stopword.

    Args:
        text: the string to filter.

    Returns:
        The remaining tokens concatenated without a separator.
    """
    stopwords = _load_stopwords()
    return "".join(
        token for token in jieba.cut(text, cut_all=False) if token not in stopwords
    )

In [None]:
import glob
import json
# Embed the speaker-0 text of every chunk (after stopword removal) and store
# the result in one JSON file keyed by "<interview id>_<chunk index>".
interview_files = glob.glob("../data/result/chunks/v2_1029/*.json")
chunk_embeddings = {}
print(interview_files)
for chunks_file in interview_files:
    with open(chunks_file, 'r') as chunks_fp:
        chunks = json.load(chunks_fp)
    # Interview id = second "_"-separated field of the file name, with the
    # ".json" suffix removed.
    interview_id = chunks_file.split("/")[-1].split("_")[1].replace(".json", "")
    for chunk_index, chunk in enumerate(chunks):
        chunk_id = interview_id + "_" + str(chunk_index)
        answers = [message['content'] for message in chunk if message['speaker'] == 0]
        print(answers)
        if not answers:
            # Chunks without speaker-0 text get the embedding of a fixed
            # placeholder sentence.
            embedding = get_embedding("No content")
        else:
            embedding = get_embedding(remove_stopwords(" ".join(answers)))
        chunk_embeddings[chunk_id] = {
            "id": chunk_id,
            "conversation": chunk,
            "embedding": embedding,
        }
save_json(chunk_embeddings, "../data/result/chunk_embeddings/1129/all.json")

In [None]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from collections import defaultdict

# Load the stored chunk embeddings and build the square matrix of pairwise
# cosine distances between them.
# NOTE(review): this reads 'result/chunk_embeddings/all.json', while the cells
# that write embeddings save under '../data/result/chunk_embeddings/' --
# confirm the path.
with open('result/chunk_embeddings/all.json', 'r') as embeddings_fp:
    chunk_embeddings = json.load(embeddings_fp)
embeddings = [entry['embedding'] for entry in chunk_embeddings.values()]
# Entries stored as the literal string "No content" are replaced by an
# all-zero vector of length 1536 (presumably the embedding size -- TODO confirm).
# NOTE(review): cosine distance is undefined for an all-zero vector, so such
# rows may come out as NaN -- verify if this branch is ever taken.
embeddings = [[0 for _ in range(1536)] if embedding == "No content" else embedding for embedding in embeddings]
ids = [entry['id'] for entry in chunk_embeddings.values()]
embeddings = np.array(embeddings)
pairwise_distances = pdist(embeddings, metric='cosine')
distance_matrix = squareform(pairwise_distances)


In [None]:
# For every chunk, print its mean cosine similarity (1 - mean distance) to the
# chunks of each interviewee; the interviewee is the part of the chunk id
# before the first "_".
for anchor_id, distance_row in zip(ids, distance_matrix):
    per_interviewee = defaultdict(list)
    for other_id, distance in zip(ids, distance_row):
        per_interviewee[other_id.split("_")[0]].append(distance)
    for interviewee_id, distances in per_interviewee.items():
        print(anchor_id, interviewee_id, 1-np.mean(distances))
    print("---------------------------------")

In [None]:
def conversation_to_string(conversation):
    """Render a conversation as one "<Speaker>: <content>" line per message.

    Messages whose 'speaker' equals 0 are labelled "Interviewee"; every other
    speaker value is labelled "Interviewer". Each line ends with a newline.
    """
    rendered = []
    for turn in conversation:
        label = 'Interviewee' if turn['speaker'] == 0 else 'Interviewer'
        rendered.append(f"{label}: {turn['content']}\n")
    return "".join(rendered)
def prompt_connection(c1, c2):
    """Ask the chat model whether two conversations are about similar topics.

    The system prompt asks for a one-phrase topic summary in Traditional
    Chinese when the conversations are similar and for "Not similar"
    otherwise. `c1` and `c2` are sent as two separate user messages.

    Returns:
        Whatever `chatgpt(messages)` returns.
    """
    # Written line by line so the prompt's whitespace (including the trailing
    # space on the first sentence) is explicit.
    system_prompt = (
        "\n"
        "            You are a conversation similarity measurer. \n"
        "            The user will give you two conversations, first, decide if they about similar topics.\n"
        "            If they are, summarize the topic in one phrase.\n"
        '            Otherwise, reply with "Not similar".\n'
        "            Reply concisely with one phrase in Traditional Chinese.\n"
        "            "
    )
    messages = [{"role": "system", "content": system_prompt}]
    for conversation_text in (c1, c2):
        messages.append({"role": "user", "content": conversation_text})
    # NOTE(review): `chatgpt` is not defined in the visible part of this
    # notebook (only `request_gpt4` is) -- confirm where it comes from.
    return chatgpt(messages)

In [None]:
# Take the first chunk pair whose stored similarity exceeds 0.9, print both
# conversations and the model's verdict on whether they share a topic.
with open('../data/result/chunk_embeddings/1103/all_chunks.json') as chunks_fp:
    chunks = json.load(chunks_fp)
chunk_dict = {chunk['id']: chunk for chunk in chunks}
with open('../data/result/chunk_embeddings/1103/chunk_similarities.json') as similarities_fp:
    chunk_similarities = json.load(similarities_fp)
for c1, c2, sim in chunk_similarities:
    if sim > 0.9:
        conversation1 = conversation_to_string(chunk_dict[c1]['conversation'])
        conversation2 = conversation_to_string(chunk_dict[c2]['conversation'])
        print(conversation1)
        print(conversation2)
        res = prompt_connection(conversation1, conversation2)
        print(res)
        # Only the first qualifying pair is examined.
        break

In [None]:
def query_time(file_name):
    """Ask the chat model to extract a report's date from its title.

    `file_name` is sent as the user message; the request is made through
    `request_gpt4` with response_format="json".

    Returns:
        The model's reply as returned by `request_gpt4` (the raw message
        content, not a parsed object).
    """
    # Written line by line so the prompt's whitespace (including the trailing
    # space on the first sentence) is explicit.
    # NOTE(review): the doubled braces are sent to the model literally because
    # this string is never passed through str.format -- confirm that is intended.
    system_prompt = (
        "\n"
        "            You are a time extractor. \n"
        "            The user will give you a report titles, your job is to extract the time of the report from the title.\n"
        "            Reply with the following JSON format:\n"
        "            {{\n"
        "                'year': 2021,\n"
        "                'month': 12,\n"
        "                'day': 31\n"
        "            }}\n"
        "            "
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": file_name}
    return request_gpt4([system_message, user_message], response_format="json")
def extract_report_time(report_files):
    """Add a 'time' field to each report JSON file, in place.

    For every path in `report_files` the path itself is passed to
    `query_time`, the reply is stored unchanged under the 'time' key of the
    report's JSON object, and the file is rewritten with `save_json`.

    Args:
        report_files: iterable of paths to report JSON files.
    """
    for report_file in report_files:
        report_time = query_time(report_file)
        with open(report_file) as report_fp:
            report_data = json.load(report_fp)
        report_data['time'] = report_time
        print(report_file)
        print(report_time)
        save_json(report_data, report_file)
import glob
report_files = glob.glob('../data/raw/reports/representative_proposals/json/*.json')
extract_report_time(report_files)


In [None]:
# Copy each report's 'time' field into the proposal-embedding file that has
# the same base name.
for report_file in glob.glob('../data/raw/reports/representative_proposals/json/*.json'):
    with open(report_file) as report_fp:
        report_data = json.load(report_fp)
    # Base name = file name without directory and without everything after
    # the first dot.
    file_name = report_file.split('/')[-1].split('.')[0]
    report_embedding_file = f'../data/result/proposal_embeddings/{file_name}.json'
    with open(report_embedding_file) as embedding_fp:
        report_embedding = json.load(embedding_fp)
    report_embedding['time'] = report_data['time']
    save_json(report_embedding, report_embedding_file)

In [None]:
def parse_time_field(value):
    """Return `value` decoded from JSON when it is a string, else unchanged.

    Leaving non-string values untouched makes the conversion safe to repeat
    on files whose 'time' field has already been decoded.
    """
    if isinstance(value, str):
        return json.loads(value)
    return value


# Replace the JSON-encoded 'time' string in every proposal-embedding file with
# the decoded object. Re-running the cell no longer fails on converted files.
for report_file in glob.glob('../data/result/proposal_embeddings/*.json'):
    with open(report_file) as report_fp:
        report_data = json.load(report_fp)
    report_time = parse_time_field(report_data['time'])
    print(report_time)
    report_data['time'] = report_time
    save_json(report_data, report_file)