In [1]:
import openai
import json
from pprint import pprint
import requests
# Load the OpenAI API key from the local file named "api_key".
# The file is closed deterministically, and surrounding whitespace is removed
# because the key is later interpolated into an HTTP Authorization header
# (see get_embedding), where a trailing newline would corrupt the header value.
with open("api_key") as key_file:
    api_key = key_file.read().strip()
openai.api_key = api_key

def save_json(data, filepath=r'new_data.json'):
    """Serialise `data` as 4-space-indented JSON and write it to `filepath`.

    The target file is opened in text mode with UTF-8 encoding and is
    overwritten if it already exists.
    """
    with open(filepath, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4)

def request_chatgpt_gpt4(messages, model="gpt-3.5-turbo-16k-0613"):
    """Send a chat-completion request and return the first choice's text.

    Args:
        messages: list of chat message dicts passed straight through to
            `openai.ChatCompletion.create`.
        model: name of the chat model to use. Note that, despite this
            function's name, the default is a GPT-3.5 model; pass another
            model name explicitly to change it.

    Returns:
        The `content` string of the first returned choice.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
    )
    return response['choices'][0]['message']['content']

def get_embedding(text, model="text-embedding-ada-002", timeout=60):
    """Fetch the embedding vector for `text` from the OpenAI REST endpoint.

    The request is sent with `requests` directly to
    https://api.openai.com/v1/embeddings, authenticated with the
    module-level `api_key`.

    Args:
        text: the input to embed; sent as the request's "input" field.
        model: embedding model name.
        timeout: seconds to wait for the HTTP request before `requests`
            raises, so that a stalled connection cannot hang the notebook.

    Returns:
        The `embedding` value of the first item in the response's "data" list.

    Raises:
        RuntimeError: if the response body has no "data" field (for example
            an API error payload); the message carries the HTTP status and
            the returned body so the cause is visible.
    """
    url = 'https://api.openai.com/v1/embeddings'
    headers = {
        'Content-Type': 'application/json',
        # Strip so that a trailing newline from the key file never reaches
        # the header value.
        'Authorization': "Bearer {}".format(api_key.strip()),
    }
    payload = {
        "input": text,
        "model": model,
    }
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    body = response.json()
    if 'data' not in body:
        raise RuntimeError(
            "Embedding request failed (HTTP {}): {}".format(response.status_code, body)
        )
    return body['data'][0]['embedding']

In [None]:
# trying the traditional way: tokenize -> stop word removal -> word count
# conclusion: does not work well
import jieba
import chinese_converter
from collections import defaultdict
# read stopwords from stopwords-master/all.txt
# Load one stop word per line; the context manager closes the file afterwards.
with open('stopwords-master/all.txt', 'r', encoding='utf-8') as stopwords_file:
    stopwords = stopwords_file.read().split('\n')
# Extra stop words added on top of the list loaded from the file.
stopwords += ['说', '做', '讲', '东西', '真的', '事情', '是因为', '这件', '…']
def clean(messages):
    """Tokenise each message and drop stop words.

    Every message is converted with `chinese_converter.to_simplified`,
    segmented with `jieba.cut(..., cut_all=False)`, and filtered against the
    module-level `stopwords` collection.

    Args:
        messages: iterable of message strings.

    Returns:
        A list with one token list per input message, in input order.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list would rescan it for every token.
    stopword_set = set(stopwords)
    cleaned = []
    for message in messages:
        simplified = chinese_converter.to_simplified(message)
        cleaned.append([
            token
            for token in jieba.cut(simplified, cut_all=False)
            if token not in stopword_set
        ])
    return cleaned
def freq(tokens_list):
    """Count token occurrences across all token lists.

    Args:
        tokens_list: iterable of token lists.

    Returns:
        A dict mapping token -> count, ordered by count descending; tokens
        with equal counts keep the order in which they were first seen.
    """
    counts = {}
    for tokens in tokens_list:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
# Tokenise every interviewee message, then print one frequency table per
# message (freq is called with a single-message list each time).
# NOTE(review): `interviewee_messages` is not assigned in this cell or in any
# cell above it; in this notebook it is first assigned by the QA-pair cell
# further down, so this cell raises NameError on a fresh kernel.
tokens_list = clean(interviewee_messages)
for message in tokens_list:
    freq_dict = freq([message])
    print(freq_dict)

In [None]:
# final script: 1-3
# 1. transform transcript to qa pairs
def collect_interviewee_messages(transcripts, interviewee_id=None):
    """Group transcript turn indices into question/answer groups.

    A new group starts at every turn (other than the very first) whose
    speaker is not the interviewee. Each group therefore holds one
    non-interviewee turn followed by the interviewee turns that come after
    it; the first group simply starts at index 0, whoever speaks first.

    Args:
        transcripts: sequence of dicts, each with a 'speaker' key.
        interviewee_id: speaker value identifying the interviewee. When
            omitted, the module-level `participant_id` is used, which keeps
            existing one-argument calls working.

    Returns:
        A list of lists of integer indices into `transcripts`. For an empty
        transcript the result is `[[]]`.
    """
    qa_pairs = []
    current_group = []
    for position, turn in enumerate(transcripts):
        # Resolve the global lazily so an explicit argument never needs it.
        target_id = participant_id if interviewee_id is None else interviewee_id
        is_interviewer = turn['speaker'] != target_id
        if position > 0 and is_interviewer:
            qa_pairs.append(current_group)
            current_group = []
        current_group.append(position)
    qa_pairs.append(current_group)
    return qa_pairs
def qa_index_to_message(qa_pairs, transcripts):
    """Replace each transcript index in `qa_pairs` with that turn's text.

    Args:
        qa_pairs: list of lists of indices into `transcripts`.
        transcripts: sequence of dicts, each with a 'content' key.

    Returns:
        A list of lists with the same shape as `qa_pairs`, holding the
        'content' value of every referenced turn.
    """
    return [
        [transcripts[position]['content'] for position in group]
        for group in qa_pairs
    ]

# Step 1 driver: for participants N1..N19, load the transcript, group it into
# QA groups and save the grouped message texts.
participant_id = 'N6'  # immediately overwritten by the loop below
section = "topics"
for i in range(1, 20):
    participant_id = 'N{}'.format(i)
    # transcripts = json.load(open('../data/raw/transcript/json/{}_done.json'.format(participant_id)))
    # Use a context manager so the transcript file is closed on every iteration.
    with open('../data/result/transcripts/{}_{}.json'.format(participant_id, section)) as transcript_file:
        transcripts = json.load(transcript_file)
    # These two lists are not used inside the loop, but other cells read
    # `interviewee_messages` from the kernel, so they are kept; after the
    # loop they hold the values for the last participant.
    interviewee_messages = [t['content'] for t in transcripts if t['speaker'] == participant_id]
    interviewer_messages = [t['content'] for t in transcripts if t['speaker'] == '採訪者']
    qa_pairs = collect_interviewee_messages(transcripts)
    qa_messages = qa_index_to_message(qa_pairs, transcripts)
    save_json(qa_messages, '../data/result/tmp/v3_1029/qa_messages_{}_{}.json'.format(participant_id, section))
# print(len(qa_pairs))

In [None]:
# 2. classify interviewer messages with chatgpt
def classify_interviewer_message(question, answer):
    """Ask the chat model whether an interviewer message opens a new discussion.

    The prompt consists of a Chinese system instruction, three few-shot
    examples (sent as 'system' messages named 'example_user' /
    'example_system'), and the message to classify prefixed with "問：".

    Args:
        question: the interviewer message to classify.
        answer: accepted for call-site compatibility; it is not included in
            the prompt.

    Returns:
        The model's reply with surrounding whitespace removed. The prompt
        asks for '是' (yes) or '否' (no), but the reply is not validated
        here; callers should check it.
    """
    message = [
        # {
        #     'role': 'system',
        #     'content': """
        #         The user is dealing with a transcript of an interview.
        #         In the transcript, some interviewer messages are just repeating what the interviewee said, these messages are less interesting.
        #         Others are for invoking a new discussion, or asking about a new topic in the interview.
        #         You are a classification system that helps user decide if a sentence is invoking a new topic.
        #         Normally, if the sentence is long it should be yes.
        #         Reply with "yes" if the sentence is worth further examination, otherwise reply with "no".
        #     """
        # },
        {
            'role': 'system',
            'content': """
                你是一個訪談紀錄的分類系統。現在有一個很長的訪談紀錄，需要你幫忙分類。
                用戶的需求是：在訪談紀錄中，有些發言是引導新的討論，有些發言只是重複之前的討論。
                通常，如果發言很長或是有提到具體的名詞，就是引導新的討論, 請回答「是」。
                如果發言很短且只是語助詞，比如‘真的喔？’，‘的確’， ‘有趣’，就不太可能是在引導新的討論，請回答「否」。
                請判斷一個發言是否在引導新的討論，用「是」或「否」來回答。
            """
        },
        # example 1
        {
            'role': 'system',
            'name': 'example_user',
            'content': '問：這麼多喔。'

        },
        {
            'role': 'system',
            'name': 'example_system',
            'content': '否'
        },
        # example 2
        {
            'role': 'system',
            'name': 'example_user',
            'content': """
            問：的確，我們這邊就是社會，目前也還沒有說真的要針對哪一塊，但是我們是想要聊解現在這個狀況，就是綠島人民來說就是居民來說不就新居民就居民，他們現在重視有什麼樣的議題。\n
            """

        },
        {
            'role': 'system',
            'name': 'example_system',
            'content': '是'
        },
        # example 3
        {
            'role': 'system',
            'name': 'example_user',
            'content': """
            問：那您就是您覺得您在就是綠島工作，有沒有因為是綠島的緣故，就是讓您工作比較有挑戰性的地方？或者是有，就是特別特別喜歡的地方?\n
            """

        },
        {
            'role': 'system',
            'name': 'example_system',
            'content': '是'
        },
        {
            'role': 'user',
            'content': "問：{}".format(question)
        }
    ]
    response = request_chatgpt_gpt4(message)
    # Strip so that a reply such as ' 是\n' still matches an exact comparison
    # against '是' / '否' at the call site.
    return response.strip()

# Step 2 driver: label the first message of every QA group for participants
# N1..N19 and save one {'sentence', 'label'} record per group.
section = "topics"
for i in range(1, 20):
    participant_id = 'N{}'.format(i)
    # transcripts = json.load(open('../data/raw/transcript/json/{}_done.json'.format(participant_id)))
    # transcripts = json.load(open('../data/result/transcripts/{}_{}.json'.format(participant_id, section)))
    # Context manager so the file is closed on every iteration.
    with open('../data/result/tmp/v3_1029/qa_messages_{}_{}.json'.format(participant_id, section)) as qa_file:
        qa_pairs = json.load(qa_file)
    # interviewee_messages = [t['content'] for t in transcripts if t['speaker'] == participant_id]
    # interviewer_messages = [t['content'] for t in transcripts if t['speaker'] == '採訪者']
    res = []
    print("participant_id: {}".format(participant_id))
    for qa_index, qa_pair in enumerate(qa_pairs):
        if len(qa_pair) == 1:
            if qa_index == 0:
                # Opening group with a single message: use the placeholder
                # question ("interview begins") and treat the message as the answer.
                qa_pair = ['（訪談開始）', qa_pair[0]]
            else:
                # collect_interviewee_messages starts every later group at an
                # interviewer turn, so a lone message here is an unanswered
                # interviewer question: keep it as the sentence to classify
                # and pair it with an empty answer.
                qa_pair = [qa_pair[0], '']
            qa_pairs[qa_index] = qa_pair
        # Heuristic shortcut: a short message (< 50 chars) that follows a
        # short previous answer (< 30 chars) is labelled '否' without an API call.
        if len(qa_pair[0]) < 50 and qa_index > 0 and len(qa_pairs[qa_index-1][1]) < 30:
            label = '否'
        else:
            # Ask the model at most twice; fall back to '否' if neither reply
            # is exactly one of the two expected labels.
            label = '否'
            for _attempt in range(2):
                candidate = classify_interviewer_message(qa_pair[0], qa_pair[1]).strip()
                if candidate in ['是', '否']:
                    label = candidate
                    break
        print(label, len(qa_pair[0]), len(qa_pairs[max(0, qa_index-1)][1]),qa_pair)
        res.append({
            'sentence': qa_pair[0],
            'label': label
        })
    print("===========================")
    save_json(res, '../data/result/tmp/v3_1029/interviewer_message_classification_{}_{}.json'.format(participant_id, section))
    # break
        

In [None]:
# 3. chunk qa messages by topic 
def chunk_by_topic(qa_messages, speakers, interviewer_clf):
    """Merge consecutive QA groups into topic chunks.

    A new chunk is started in front of QA group g (g >= 1) when
    `interviewer_clf[g] == 1`, i.e. when the message that opens that group
    was labelled as introducing a new discussion.

    The labels are looked up by QA-group index. The previous implementation
    looked them up by the number of interviewer turns seen so far, which is
    the same index only when the transcript opens with an interviewer turn;
    when it opens with the interviewee every lookup was shifted by one group.

    Args:
        qa_messages: list of QA groups, each a list of message strings, in
            transcript order.
        speakers: flat list aligned with the messages of `qa_messages` in
            order; 1 marks an interviewer turn, anything else is emitted as 0.
        interviewer_clf: one 0/1 label per QA group. The label of group 0 is
            never consulted because the first chunk always starts there.
            This assumes every group after the first begins with an
            interviewer turn, which is how collect_interviewee_messages
            builds them.

    Returns:
        A list of chunks; each chunk is a list of
        {"speaker": 0 or 1, "content": message} dicts.
    """
    chunks = []
    current_chunk = []
    flat_index = 0  # position of the current message in `speakers`
    last_group = len(qa_messages) - 1
    for group_index, group in enumerate(qa_messages):
        for message in group:
            current_chunk.append({
                "speaker": 1 if speakers[flat_index] == 1 else 0,
                "content": message
            })
            flat_index += 1
        # Close the chunk when the next group opens a new topic.
        if group_index < last_group and interviewer_clf[group_index + 1] == 1:
            chunks.append(current_chunk)
            current_chunk = []
    chunks.append(current_chunk)
    return chunks
# Step 3 driver: for participants N1..N19, combine the saved QA groups with
# their labels and save the resulting topic chunks.
section = 'topics'
for i in range(1, 20):
    participant_id = 'N{}'.format(i)
    # transcripts = json.load(open('../data/raw/transcript/json/{}_done.json'.format(participant_id)))
    # Every input is read through a context manager so no file handle is
    # left open between iterations.
    with open('../data/result/transcripts/{}_{}.json'.format(participant_id, section)) as transcript_file:
        transcripts = json.load(transcript_file)
    print("participant_id: {}".format(participant_id))
    with open('../data/result/tmp/v3_1029//qa_messages_{}_{}.json'.format(participant_id, section)) as qa_file:
        qa_messages = json.load(qa_file)
    # 0: interviewee, 1: anyone else (interviewer)
    speakers = [0 if t['speaker'] == participant_id else 1 for t in transcripts]
    with open('../data/result/tmp/v3_1029/interviewer_message_classification_{}_{}.json'.format(participant_id, section)) as clf_file:
        intvwer_clf = json.load(clf_file)
    # '是' -> 1 (new topic), anything else -> 0
    intvwer_clf = [1 if record['label'] == '是' else 0 for record in intvwer_clf]
    chunks = chunk_by_topic(qa_messages, speakers, intvwer_clf)
    save_json(chunks, '../data/result/chunks/v2_1029/chunks_{}_{}.json'.format(participant_id, section))


In [5]:
# generate embeddings for each chunk
section = 'topics'
for i in range(1, 20):
    participant_id = 'N{}'.format(i)
    with open('../data/result/chunks/v2_1029/chunks_{}_{}.json'.format(participant_id, section)) as chunks_file:
        chunks = json.load(chunks_file)
    for chunk_index, chunk in enumerate(chunks):
        for message_index, message in enumerate(chunk):
            # Record where the message sits so it can be located later.
            message['chunk_index'] = chunk_index
            message['message_index'] = message_index
            try:
                message['embedding'] = get_embedding(message['content'])
            except Exception as exc:
                # Best effort: a failed message is saved without an
                # 'embedding' key. Report which one failed and why.
                # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                print(participant_id, chunk_index, message_index, repr(exc))
    save_json(chunks, '../data/result/chunk_embeddings/1029/chunks_{}_{}_with_embedding.json'.format(participant_id, section))

In [None]:
# create embeddings (dep)
# Embed every interviewee message one at a time, printing "<index>/<total>"
# before each request, then save the {content, embedding} records.
# `interviewee_messages` must already exist in the kernel.
embeddings = []
for index, message in enumerate(interviewee_messages):
    print('{}/{}'.format(index, len(interviewee_messages)))
    embedding = get_embedding(message)
    record = dict(content=message, embedding=embedding)
    embeddings.append(record)
save_json(embeddings, '../data/result/interviewee_embeddings.json')

In [None]:
# calculate sim matrix (dep)
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the saved {content, embedding} records (the file is closed after
# reading) and compute the pairwise cosine similarity of their vectors.
with open('../data/result/interviewee_embeddings.json') as embeddings_file:
    embeddings = json.load(embeddings_file)
vecs = [e['embedding'] for e in embeddings]
sim_matrix = cosine_similarity(np.array(vecs))

In [None]:
def chunk_by_embedding(sim_matrix, threshold=0.85, max_drop=0.04):
    """Greedily group consecutive items using similarity to the chunk head.

    Starting at head index i, following indices j = i+1, i+2, ... are added
    to the chunk while they stay similar to the head:

    * the immediate neighbour (j == i+1) joins when sim[i][j] > threshold;
    * a later item joins unless sim[i][j] < threshold or the similarity fell
      by more than `max_drop` relative to the previously joined item.

    The first item that does not join becomes the head of the next chunk.

    Compared with the earlier version, a final item that does not join the
    preceding chunk is now reported as its own singleton (it used to be
    omitted), and a 1x1 matrix yields one singleton instead of raising.

    Args:
        sim_matrix: square array-like with a `.shape` attribute, indexed as
            sim_matrix[i][j].
        threshold: minimum similarity to the chunk head.
        max_drop: largest allowed decrease from the previously joined item's
            similarity (not applied to the immediate neighbour).

    Returns:
        A list of (head, member, similarity) tuples in order. A head with no
        members is reported once as (head, head, 1).
    """
    chunks = []
    length = sim_matrix.shape[0]
    head = 0
    while head < length:
        prev_sim = 1
        member = head + 1
        while member < length:
            sim = sim_matrix[head][member]
            if member == head + 1:
                joins = sim > threshold
            else:
                joins = not (sim < threshold or prev_sim - sim > max_drop)
            if not joins:
                break
            prev_sim = sim
            chunks.append((head, member, prev_sim))
            member += 1
        if member == head + 1:
            # Nothing joined this head: record it as a singleton.
            chunks.append((head, head, 1))
        # The first item that did not join starts the next chunk.
        head = member
    return chunks
# Chunk the similarity matrix and show the (head, member, similarity) tuples.
# Requires `sim_matrix` from the similarity-matrix cell. Note that this
# rebinds `chunks`, which the topic-chunk cells above use for lists of
# message dicts.
chunks = chunk_by_embedding(sim_matrix)
pprint(chunks)

In [None]:
def collect_chunks(chunks_index, interviewee_messages):
    """Turn (head, member, similarity) tuples into lists of message texts.

    Args:
        chunks_index: iterable of (head, member, similarity) tuples; a tuple
            with head == member denotes a chunk holding only its head.
        interviewee_messages: sequence of messages indexed by head/member.

    Returns:
        One list of messages per distinct head, in order of first
        appearance; each list starts with the head's message followed by its
        members in the order given.
    """
    grouped = {}
    for head, member, _similarity in chunks_index:
        bucket = grouped.setdefault(head, [])
        if not bucket:
            # First time this head is seen: its own message leads the chunk.
            bucket.append(interviewee_messages[head])
        if head != member:
            bucket.append(interviewee_messages[member])
    return list(grouped.values())
# Resolve the index tuples in `chunks` to message texts and print every chunk
# from the sixth one onward. Requires `chunks` (index tuples from
# chunk_by_embedding) and `interviewee_messages` to exist in the kernel.
message_chunks = collect_chunks(chunks, interviewee_messages)
pprint(message_chunks[5:])

In [None]:
# plot sim matrix distribution
import matplotlib.pyplot as plt

def plot(data):
    """Draw `data` as a line chart against 1-based positions and show it.

    Args:
        data: sequence of values; the i-th value is plotted at x = i + 1.
    """
    positions = [offset + 1 for offset in range(len(data))]

    # Circle markers joined by a solid line.
    plt.plot(positions, data, linestyle='-', marker='o')
    plt.title('Line Chart of similarity')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.show()
def exclude(data, index):
    """Return `data` as a list with the element at `index` removed.

    Args:
        data: any iterable; it is materialised exactly once, so one-shot
            iterators are handled correctly.
        index: position to drop. Negative values count from the end.

    Returns:
        A new list. If `index` is out of range, all elements are returned.
    """
    items = list(data)
    if index < 0:
        # Normalise so that index + 1 never wraps around to the start.
        index += len(items)
    if not 0 <= index < len(items):
        return items
    return items[:index] + items[index + 1:]
# Plot rows 2 and 3 of the similarity matrix, each with its own diagonal
# entry (the item compared with itself) removed.
# Requires `sim_matrix` from the similarity-matrix cell.
plot(exclude(sim_matrix[2], 2))
plot(exclude(sim_matrix[3], 3))

In [None]:
# Summarise every chunk in chunks_N1.json with the chat model.
# `interviewee_messages` must already exist in the kernel (used only for the
# size printout below).
with open('../data/result/chunks_N1.json') as chunks_file:
    response_chunks = json.load(chunks_file)
print(len(interviewee_messages), len(response_chunks))

# The system instruction is the same for every chunk, so build it once.
# NOTE(review): "local residence" in the prompt looks like a typo for
# "local resident"; left unchanged because editing the prompt can change
# the model's output.
summary_system_message = {
    'role': 'system',
    'content': """
                You are a reporter in Taiwan. 
                You interviewed a person who is a local residence in Lyudao (綠島), a small island near Taiwan. 
                Please summarize the interview. Reply in traditional Chinese.
            """
}

summaries = []
for chunk in response_chunks:
    # "\n".join(chunk) requires each chunk to be a list of strings.
    messages = [
        summary_system_message,
        {
            'role': 'user',
            'content': 'Interview: \n' + "\n".join(chunk)
        }
    ]
    res = request_chatgpt_gpt4(messages)
    summaries.append(res)
    print(res)
    print("---------------------------")

In [None]:
# Persist the summaries produced by the previous cell; requires `summaries`
# to exist in the kernel.
save_json(summaries, '../data/result/chunk_summaries.json')