In [None]:
from openai import OpenAI
import json
import gpt as gpt
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as pretty-printed UTF-8 JSON.

    Args:
        data: Any object ``json.dump`` can serialize.
        filepath: Destination file. Parent directories are created when missing.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file must be read back as UTF-8.
    """
    import os  # local import so this helper does not depend on other cells

    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        # This notebook saves into nested folders (e.g. tmp/extracted/) that
        # may not exist on a fresh checkout; open() alone would raise.
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, indent=4, ensure_ascii=False)

# openai
# The API key is read from a local file named "openai_api_key" (kept out of the notebook).
# `with` closes the handle deterministically, and strip() removes the trailing
# newline/whitespace that editors usually append so it is not passed on as part of the key.
with open("openai_api_key", encoding="utf-8") as key_file:
    openai_api_key = key_file.read().strip()
openai_client = OpenAI(api_key=openai_api_key)

In [None]:
def toString(segment):
    """Render a sequence of dialogue turns as a speaker-labelled transcript.

    Each turn is a dict with 'speaker' and 'content' keys. A turn whose speaker
    is 'interviewer' is prefixed with "Interviewer: "; any other speaker is
    prefixed with "Interviewee: ". Every turn is terminated by a newline.

    Returns:
        The concatenated transcript (empty string for an empty segment).
    """
    rendered_turns = []
    for turn in segment:
        label = "Interviewer: " if turn['speaker'] == 'interviewer' else "Interviewee: "
        rendered_turns.append(label + turn['content'] + "\n")
    return ''.join(rendered_turns)

def extract_keywords(paragraph):
    """Build the chat messages that ask the model to extract keywords.

    Despite its name, this function does not call the model; it only returns
    the prompt, which the caller is expected to submit.

    Args:
        paragraph: The monologue text, sent unchanged as the user message.

    Returns:
        A two-item list of role/content dicts: the system instructions followed
        by the user message. The system prompt requests a JSON object of the
        form {"keywords": [...]}.
    """
    messages = [
        {
            "role": "system",
            # This is a plain string (no .format / f-string), so the JSON example
            # uses single braces; doubled braces would be shown to the model literally.
            "content": """You are a keyword extraction system that extracts keywords from a monologue. 
            The monologue is about a person who is talking about their life.
            The keywords should be the most important words in the monologue.
            Use the exact words that the person uses in the monologue.
            Reply with a list of keywords in the following JSON format:
            {
                "keywords": ["keyword1", "keyword2", "keyword3", ...]
            }
            """
        },
        {
            "role": "user",
            "content": paragraph
        }
    ]
    return messages
def extract_title(conversation):
    """Assemble the chat messages that request a short title for a conversation.

    Only the prompt is built here; no model call is made.

    Args:
        conversation: Transcript text, passed through as the user message.

    Returns:
        ``[system_message, user_message]`` as role/content dicts.
    """
    # Spelled out line by line so the whitespace inside the prompt is explicit.
    system_prompt = (
        "You are a title analysis system. \n"
        "            You are given a conversation between two people: Interviewer and Interviewee. \n"
        "            Give a concise title for the conversation with no more than 10 words.\n"
        "        "
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": conversation}
    return [system_message, user_message]


In [None]:
import glob
import os

# --- Step 1: reformat -------------------------------------------------------
# Wrap every raw segment in a record that carries a "<participant>_<index>" id.
os.makedirs("tmp/segments_formatted", exist_ok=True)
# sorted(): glob order is filesystem-dependent; sorting keeps runs reproducible.
for segmented_file in sorted(glob.glob("segmented_transcripts/*.json")):
    # basename/splitext rather than split("/")/split("."): handles Windows path
    # separators and keeps participant names that themselves contain a dot.
    participant_id = os.path.splitext(os.path.basename(segmented_file))[0]
    # Explicit UTF-8 (assumed for the input transcripts — TODO confirm) so reading
    # does not depend on the platform's default encoding.
    with open(segmented_file, encoding="utf-8") as fp:
        segments = json.load(fp)
    reformat_segments = [
        {
            "id": "{}_{}".format(participant_id, segment_index),
            "conversation": segment,
        }
        for segment_index, segment in enumerate(segments)
    ]
    save_json(reformat_segments, 'tmp/segments_formatted/{}.json'.format(participant_id))

# --- Step 2: extract keywords and a title for every segment -----------------
os.makedirs("tmp/extracted", exist_ok=True)
os.makedirs("result", exist_ok=True)
all_chunks = []
for interview_file in sorted(glob.glob("tmp/segments_formatted/*.json")):
    pid = os.path.splitext(os.path.basename(interview_file))[0]
    # save_json writes UTF-8 with ensure_ascii=False, so read it back as UTF-8.
    with open(interview_file, encoding="utf-8") as fp:
        interview_data = json.load(fp)
    print(interview_file)
    keyword_prompts = []
    title_prompts = []
    for segment in interview_data:
        # Keywords are extracted from the interviewee's words only; the title
        # prompt sees the whole labelled conversation.
        interviewee_messages = "\n".join([message['content'] for message in segment['conversation'] if message['speaker'] == "interviewee"])
        segment_messages = toString(segment['conversation'])
        keyword_prompts.append(extract_keywords(interviewee_messages))
        title_prompts.append(extract_title(segment_messages))

    # NOTE(review): the zip below relies on gpt.multithread_prompts returning one
    # response per prompt, in prompt order — confirm against the gpt module.
    segment_keywords = gpt.multithread_prompts(openai_client, keyword_prompts, temperature=0, response_format="json")
    segment_keywords = [json.loads(keyword)['keywords'] for keyword in segment_keywords]
    segment_titles = gpt.multithread_prompts(openai_client, title_prompts, temperature=1.0)
    result = []
    for segment, keywords, title in zip(interview_data, segment_keywords, segment_titles):
        segment['raw_keywords'] = keywords
        segment['title'] = title
        result.append(segment)
    save_json(result, "tmp/extracted/{}.json".format(pid))
    # Previously all_chunks was never filled, so result/chunks.json was always [].
    # NOTE(review): collecting every annotated segment is the presumed intent — confirm.
    all_chunks.extend(result)
save_json(all_chunks, "result/chunks.json")

In [None]:
# Build prompts that ask for keyword explanations; the explanations are later
# used to generate embeddings for the keywords.
def explain_prompt(keyword):
    """Return the chat messages asking for a definition of ``keyword``.

    Args:
        keyword: The term to explain; it is interpolated into the user message.

    Returns:
        ``[system_message, user_message]`` as role/content dicts.
    """
    # Spelled out line by line so the whitespace inside the prompt is explicit.
    system_prompt = (
        "You are a dictionary system.\n"
        "            The user wants to know the meaning of a specific word in the context of environment, ecology, an related topics.\n"
        "            Reply with the definition of the word.\n"
        "        "
    )
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": f"Please explain {keyword}"}
    return [system_message, user_message]


In [10]:
from collections import defaultdict
import os

# Count how many times each keyword was extracted across all segments of all interviews.
keyword_freq = defaultdict(int)
# sorted(): glob order is filesystem-dependent; sorting keeps the keyword order
# (and therefore the saved files) reproducible between runs.
for interview_file in sorted(glob.glob("tmp/extracted/*.json")):
    print(interview_file)
    # These files are written by save_json as UTF-8 with ensure_ascii=False, so
    # read them back as UTF-8 and close the handle promptly.
    with open(interview_file, encoding="utf-8") as fp:
        interview_data = json.load(fp)
    for segment in interview_data:
        keywords = segment['raw_keywords']
        for k in keywords:
            keyword_freq[k] += 1

# filter out keywords that only appear once
all_keywords = [k for k in keyword_freq.keys() if keyword_freq[k] > 1]

print("Number of keywords: ", len(all_keywords))
# collect statistics
keyword_statistics = [{"keyword": k, "frequency": keyword_freq[k]} for k in all_keywords]

# get explanations
explain_prompts = [explain_prompt(keyword) for keyword in all_keywords]

# get embeddings from explanations
# NOTE(review): the zip below relies on both gpt helpers returning one item per
# input, in input order — confirm against the gpt module.
explanations = gpt.multithread_prompts(openai_client, explain_prompts)
embeddings = gpt.multithread_embeddings(openai_client, explanations)

# save
result = [
    {
        "keyword": keyword,
        "explanation": explanation,
        "embeddings": embedding
    }
    for keyword, explanation, embedding in zip(all_keywords, explanations, embeddings)
]
# Make sure the output folder exists before writing into it.
os.makedirs("keyword", exist_ok=True)
save_json(result, "keyword/keywords.json")
save_json(keyword_statistics, "keyword/keyword_statistics.json")


tmp/extracted/Tavin Wrynn.json
tmp/extracted/Renar Krynn.json
tmp/extracted/Edrick Vallis.json
tmp/extracted/Cendra Brax.json
tmp/extracted/Koryx Darnell.json
tmp/extracted/Ferin Goss.json
tmp/extracted/Dessa Talvek.json
tmp/extracted/Joran Fell.json
tmp/extracted/Mila Thark.json
tmp/extracted/Selina Morvale.json
tmp/extracted/Kyla Horth.json
tmp/extracted/Vek Marlow.json
Number of keywords:  203


100%|██████████| 203/203 [00:09<00:00, 21.27it/s]
  0%|          | 0/203 [00:00<?, ?it/s]

10098
97
105
100
108
118
309

102
111
109
178
204
100
99
103
133
95
201
108
86
113
231
109
201
101
122
84
175
95
205
115
251
104
110
108
104
134
153
86
153
85
212
94
111
108
166
99
103
104
99
85
111
103
155
101
103
91
121
102
113
155
131
150
89
100
101
148
124
96
87
97
90
297
102
183
117
86
177
94
155
122
99
88
87
112
130
101
100
97
88
97
106
84
87
118
103
127
78
103


  3%|▎         | 7/203 [00:00<00:11, 17.08it/s]

92
82
186
215
95
197
108
102
107
107
101
95
78
88


 15%|█▌        | 31/203 [00:00<00:02, 60.15it/s]

106
123
108
116
87
104
86
100
241
152
137
73
99
92
113
94
108
136
169
115
71
78
102
205
92
118
105
130
103
109


 30%|███       | 61/203 [00:00<00:01, 95.44it/s]

103
160
102
84
96
101
90
135
275
122
105
106
113
98
94
84
88
125
88
90
94
79
93
169
164
96
102
84
122
103
95
83
101
82
107
96
111
82
117


 44%|████▍     | 90/203 [00:01<00:00, 118.49it/s]

119
116
240
238
235
99
102
169
95
99
107
116


 59%|█████▊    | 119/203 [00:01<00:00, 119.49it/s]

103
98
97
101
133
99
118
87


100%|██████████| 203/203 [00:03<00:00, 52.38it/s] 
