In [23]:
import openai
# Read the OpenAI API key from the first line of the local "api_key" file.
# The surrounding whitespace (notably the trailing newline that readline()
# keeps) is stripped so the key can be placed in an HTTP header verbatim, and
# the file handle is closed as soon as the key has been read.
with open("api_key", "r") as key_file:
    api_key = key_file.readline().strip()
openai.api_key = api_key
import json
import requests
import tiktoken
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    The target file is opened for writing with UTF-8 encoding, so any existing
    content is replaced. Nothing is returned.
    """
    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

def chatgpt(messages):
    """Send ``messages`` to the chat completion endpoint and return the reply text.

    Args:
        messages: the chat history, passed through unchanged as the
            ``messages`` argument of ``openai.ChatCompletion.create``.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k-0613",
        messages=messages,
    )
    first_choice = completion['choices'][0]
    return first_choice['message']['content']
def get_embedding(text, model="text-embedding-ada-002", max_tokens=8191, timeout=60):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: the string to embed. If it encodes to more than ``max_tokens``
            tokens it is shortened from the end, 100 characters at a time,
            until it fits.
        model: embedding model name; also selects the tiktoken encoding used
            to count tokens.
        max_tokens: largest number of tokens sent in one request.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the request does not complete within ``timeout``.
    """
    enc = tiktoken.encoding_for_model(model)
    # Trim by characters (not tokens) so the text sent is always a clean
    # prefix of the original string.
    while len(enc.encode(text)) > max_tokens:
        text = text[:-100]
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        # Strip so a key read with a trailing newline still forms a valid header.
        'Authorization': "Bearer {}".format(api_key.strip())
    }
    payload = {
        "input": text,
        "model": model
    }
    res = requests.post(url, headers=headers, json=payload, timeout=timeout)
    # Fail with the HTTP error itself rather than a KeyError on the error body.
    res.raise_for_status()
    return res.json()['data'][0]['embedding']

In [24]:
import glob
import json
from pprint import pprint
def get_division_dict():
    """Load the participant -> division index mapping from ``division_index.txt``.

    Each non-blank line of the file is expected to hold two
    whitespace-separated fields: a participant identifier and an index.

    Returns:
        A dict whose keys are ``"N" + participant`` and whose values are the
        index fields as strings, without surrounding whitespace.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    res = {}
    with open('division_index.txt') as division_file:
        for line in division_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            participant, index = fields
            res["N" + participant] = index
    return res
# Build the participant -> division-index lookup once, when this cell runs.
division_dict = get_division_dict()

def divide_background_topics(transcript, division_index):
    """Split a transcript at the interviewer's ``division_index``-th message.

    Args:
        transcript: list of message dicts, each with ``'speaker'`` and
            ``'content'`` keys.
        division_index: position, among the messages whose speaker is
            ``'採訪者'``, of the message that starts the second part.

    Returns:
        A ``(background_messages, topic_messages)`` tuple: the messages before
        the dividing message, and the messages from it onwards.

    Raises:
        IndexError: if there is no interviewer message at ``division_index``.
    """
    # Record where each interviewer message sits in the transcript so the
    # split point is found by position. Matching on message text instead would
    # pick the wrong message whenever the same text occurs earlier.
    question_positions = [
        position
        for position, message in enumerate(transcript)
        if message['speaker'] == '採訪者'
    ]
    split_at = question_positions[division_index]
    return transcript[:split_at], transcript[split_at:]

# Split every raw transcript into its background part and its topics part and
# save the two halves as separate JSON files.
for transcript_file in glob.glob('../data/raw/transcript/json/*.json'):
    # Read inside a context manager so the handle is closed before moving on.
    with open(transcript_file, encoding='utf-8') as fp:
        transcript = json.load(fp)
    # File name without directory, extension or "_done" marker; this is the
    # key used to look the participant up in division_dict.
    participant = transcript_file.split('/')[-1].split('.')[0].replace("_done", "")
    background_messages, topic_messages = divide_background_topics(transcript, int(division_dict[participant]))
    save_json(background_messages, f'../data/result/transcripts/{participant}_background.json')
    save_json(topic_messages, f'../data/result/transcripts/{participant}_topics.json')

In [None]:
def infer_questions(answer):
    """Ask the chat model which question the given answer was responding to.

    Args:
        answer: the answer text; it is formatted into the user message.

    Returns:
        Whatever ``chatgpt`` returns for the two-message conversation built
        here (a system instruction followed by the answer).
    """
    # Disabled alternative (English system prompt), kept for reference:
    #   "You are an question inferer, you infer what questions are being asked based on the answers.
    #    You will be given an answer, please infer what questions are being asked.
    #    Reply in Traditional Chinese."
    system_prompt = """
            你是一個訪談的記錄者，現在有一個受訪者的回答，但是沒有訪談者問的問題。你需要根據這個訪談的回答推斷出訪談者問的問題。
            請用問句回答，並以“問題：”開頭。
            """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": f"{answer}"}
    return chatgpt([system_message, user_message])
import glob
# For each chunked interview file, join the speaker-0 messages of every chunk
# and print the question the model infers for them.
# NOTE(review): speaker 0 is presumably the interviewee, since these messages
# are passed on as answers. Confirm against the chunk files.
interviews = glob.glob('result/chunks/*.json')
for interview in interviews:
    # Read inside a context manager so the handle is closed before moving on.
    with open(interview, 'r', encoding='utf-8') as fp:
        chunks = json.load(fp)
    for index, chunk in enumerate(chunks):
        answers = [message['content'] for message in chunk if message['speaker'] == 0]
        question = infer_questions("\n\n\n".join(answers))
        print(index, question)
    print("====================================")

In [None]:
import glob
import json
from pprint import pprint
def cluster_question():
    """Print the speaker-1 messages that mention one of a few fixed keywords.

    Every JSON file matching ``../data/result/chunks/*.json`` is loaded as a
    list of chunks, each chunk being a list of message dicts with
    ``'speaker'`` and ``'content'`` keys. For each file the path is printed,
    then every speaker-1 message containing any keyword is printed together
    with its running position among that file's speaker-1 messages, then a
    separator line.

    Returns:
        None. The results are only printed.
    """
    interviews = glob.glob('../data/result/chunks/*.json')
    candidates = ['10個', '十個', '議題']
    for interview in interviews:
        # Read inside a context manager so the handle is closed promptly.
        with open(interview, 'r', encoding='utf-8') as fp:
            chunks = json.load(fp)
        print(interview)
        # Counts speaker-1 messages across all chunks of this file.
        index = 0
        for chunk in chunks:
            chunk_questions = [message['content'] for message in chunk if message['speaker'] == 1]
            for question in chunk_questions:
                if any(candidate in question for candidate in candidates):
                    print(index, question)
                index += 1
        print("===================================")
# Run the keyword scan; matches are printed and the call returns nothing.
cluster_question()


In [None]:
# Records to summarize; left empty here, so the loop over it below does nothing.
dataset = []
def query_topic(datum):
    """Build the topic-summarizer system prompt for ``datum`` and return None.

    NOTE(review): this looks like an unfinished stub. The prompt is assembled
    but never sent anywhere, and ``datum`` is not used.
    """
    # topic prompt
    system_message = {
        "role": "system",
        "content": """
            You are a article topic summarizer. 
            The user will give you an article, your job is to summarize the topic of the article.
            Reply with no more than three sentences.
            """
    }
    messages = [system_message]
    return None
# Query a topic for each record. NOTE(review): query_topic currently returns
# None, and `topic` is overwritten on every iteration without being stored.
for datum in dataset:
    topic = query_topic(datum)


In [None]:
from pprint import pprint
import glob
# Embed the speaker-0 text of every chunk of every interview and save the
# result as one JSON object keyed by "<interview id>_<chunk index>".
# Alternative pattern that restricts the run to files ending in "_N10.json":
# interview_files = glob.glob("result/chunks/*_N10.json")
interview_files = glob.glob("result/chunks/*.json")
chunk_embeddings = {}
for chunks_file in interview_files:
    # Read inside a context manager so the handle is closed before moving on.
    with open(chunks_file, 'r', encoding='utf-8') as fp:
        chunks = json.load(fp)
    # Second "_"-separated field of the file name, minus the ".json" suffix.
    interview_id = chunks_file.split("/")[-1].split("_")[1].replace(".json", "")
    for chunk_index, chunk in enumerate(chunks):
        # Named chunk_id (not `id`) so the builtin id() is not shadowed.
        chunk_id = interview_id + "_" + str(chunk_index)
        answers = [message['content'] for message in chunk if message['speaker'] == 0]
        # Chunks without any speaker-0 message are embedded as the literal
        # placeholder text "No content".
        if len(answers) == 0:
            embedding = get_embedding("No content")
        else:
            embedding = get_embedding(" ".join(answers))
        chunk_embeddings[chunk_id] = {
            "id": chunk_id,
            "conversation": chunk,
            "embedding": embedding,
        }
save_json(chunk_embeddings, "result/chunk_embeddings/all.json")

In [None]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from collections import defaultdict

# Load the saved chunk embeddings and compute the full pairwise cosine
# distance matrix between them.
with open('result/chunk_embeddings/all.json', 'r', encoding='utf-8') as fp:
    chunk_embeddings = json.load(fp)
# A record whose embedding is the string "No content" is replaced by a
# 1536-dimensional zero vector.
# NOTE(review): cosine distance to an all-zero vector is presumably undefined
# (NaN); confirm whether such records can still occur in the saved file.
embeddings = [
    [0 for _ in range(1536)] if record['embedding'] == "No content" else record['embedding']
    for record in chunk_embeddings.values()
]
ids = [record['id'] for record in chunk_embeddings.values()]
embeddings = np.array(embeddings)
pairwise_distances = pdist(embeddings, metric='cosine')
# Expand the condensed distance vector into a square matrix whose rows and
# columns follow the order of `ids`.
distance_matrix = squareform(pairwise_distances)


In [None]:
# For every chunk, print its mean cosine similarity (1 - mean cosine distance)
# to the chunks of each interviewee.
# The interviewee of a chunk is the part of its id before the first "_".
# Column indices are grouped by interviewee once, up front, because the
# grouping is the same for every row.
columns_by_interviewee = defaultdict(list)
for column_index, chunk_id in enumerate(ids):
    columns_by_interviewee[chunk_id.split("_")[0]].append(column_index)

# NOTE(review): each chunk's own zero self-distance is included in the mean
# for its own interviewee. Confirm that this is intended.
for row_index, row_distances in enumerate(distance_matrix):
    for interviewee_id, column_indices in columns_by_interviewee.items():
        avg_distance = np.mean(row_distances[column_indices])
        print(ids[row_index], interviewee_id, 1-avg_distance)
    print("---------------------------------")