In [None]:
from openai import OpenAI
import json
import gpt as gpt
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as pretty-printed UTF-8 JSON.

    Args:
        data: Any object ``json.dump`` can serialize.
        filepath: Destination path. Any missing parent directories are
            created first, so callers can target e.g. ``transcripts/x.json``
            on a fresh checkout without preparing the folder by hand.

    The file is written with ``indent=4`` and ``ensure_ascii=False``, so
    non-ASCII characters are stored verbatim rather than as ``\\uXXXX`` escapes.
    """
    import os  # local import keeps this helper self-contained within its cell

    parent_dir = os.path.dirname(filepath)
    if parent_dir:  # empty string means "current directory" - nothing to create
        os.makedirs(parent_dir, exist_ok=True)
    with open(filepath, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, indent=4, ensure_ascii=False)

# openai
# The API key is read from a local file named "openai_api_key" instead of being
# hardcoded in the notebook. The context manager closes the file handle, and
# strip() removes surrounding whitespace (typically a trailing newline) that
# would otherwise become part of the key string.
with open("openai_api_key") as key_file:
    openai_api_key = key_file.read().strip()
openai_client=OpenAI(api_key=openai_api_key)

In [None]:
# Scenario text describing the interview setting; it is interpolated into the
# user message of every transcript-generation prompt.
background = """
Zaun is a heavily industrialized city characterized by unregulated technological development and alchemical innovation. 
Its rapid growth has resulted in significant environmental degradation, including severe air and water pollution. 
Emissions from factories and laboratories produce a persistent smog, known as the "Gray Wind," leading to widespread respiratory issues among residents. 
The city's waterways are heavily contaminated with industrial waste, impacting aquatic ecosystems and public health. 
These environmental challenges highlight the urgent need for sustainable practices and effective regulatory frameworks to balance Zaun's industrial progress with environmental and human well-being.
"""
# Persona and goals of the interviewer; also interpolated into the user message
# of every transcript-generation prompt.
interviewer_description = """
An environmental scientist is conducting a community-based study in Zaun to identify actionable solutions to the city’s environmental issues. 
Through interviews with residents, the scientist aims to understand the lived experiences of those most affected by air and water pollution. 
The study focuses on gathering insights into health impacts, coping strategies, and potential mitigation measures. 
By integrating local knowledge with scientific analysis, the initiative seeks to develop practical and community-informed solutions to improve environmental and public health outcomes in Zaun.
"""
# NOTE(review): not referenced by any cell visible here - presumably a target
# number of segments per transcript; confirm it is still needed.
desired_segments = 10
# JSON skeleton shown to the model as the required reply shape. The reply is
# later parsed with json.loads and indexed by the "transcript" key, so the
# example must be well-formed: the top-level key is quoted and every turn
# uses the same `"speaker": ...` / `"content": ...` pair. `xxx` and `...` are
# placeholders for the model to fill in.
reply_format = """
{
    "transcript": [
            {
                "speaker": "interviewer",
                "content": xxx,
            },
            {
                "speaker": "interviewee",
                "content": xxx,
            },
            {
                "speaker": "interviewer",
                "content": xxx,
            },
            {
                "speaker": "interviewee",
                "content": xxx,
            },
            ...
    ]
}"""

In [None]:
# generate transcripts
# Load the interviewee profiles. The file is opened explicitly as UTF-8 (the
# encoding save_json writes) and closed deterministically by the context manager.
with open("interviewees.json", encoding="utf-8") as interviewees_file:
    interviewees = json.load(interviewees_file)

# Build one chat prompt (system + user message) per interviewee.
generation_prompts = []
for interviewee in interviewees:
    prompt = [
        {
            "role": "system",
            "content": f"""You are a transcript generator.  According to the user provided information, generate the transcript mimicking real-world conversations.
            The interviewer should first ask about the interviewee's background, ask about their personal experiences, and then gradually move on to the main topic of the interview.
            The generated transcript should have back-and-forth conversations between the interviewer and the interviewee.
            The interviewer should occasionally ask follow-up questions or provide feedback to keep the conversation engaging.
            Try to make the conversations natural and realistic, like they are transcribed from a recording.
            You can be creative and add details based on the provided context.
            There should be at least 70 turns of conversation in the transcript.
            Reply with the following JSON format:
            {reply_format} 
            """
        },
        {
            "role": "user",
            "content": f"""
            Background of the interview: {background}
            Description of the interviewer: {interviewer_description}
            Description of the interviewee: {json.dumps(interviewee)}
            """
        }
    ]
    generation_prompts.append(prompt)

# NOTE(review): the zip below assumes gpt.multithread_prompts returns responses
# in the same order as the prompts - confirm against the helper.
responses = gpt.multithread_prompts(openai_client, generation_prompts, temperature=1.0, response_format="json")
# Each response is a JSON string; keep only its "transcript" list.
responses = [json.loads(response)['transcript'] for response in responses]
# Persist one transcript file per interviewee, named after the profile's "name" field.
for response, interviewee in zip(responses, interviewees):
    save_json(response, f"transcripts/{interviewee['name']}.json")

In [None]:
# segmentation
def transcript_str_generator(t):
    """Render a transcript as text, one ``<index> - <speaker>: <content>`` line per turn.

    Args:
        t: Sequence of turns, each a mapping with "speaker" and "content" keys.

    Returns:
        The numbered lines joined with newlines; indices start at 0. An empty
        transcript yields an empty string.
    """
    numbered_lines = []
    for position, entry in enumerate(t):
        numbered_lines.append(f"{position} - {entry['speaker']}: {entry['content']}")
    return "\n".join(numbered_lines)
def segmentation_prompt_generator(transcript_str):
    """Build the two-message chat prompt that asks the model to segment a transcript.

    Args:
        transcript_str: The indexed transcript text to place in the user message.

    Returns:
        A list of two message dicts: a fixed system instruction describing the
        task and the expected ``{"segments": [[start, end], ...]}`` reply
        shape, followed by a user message carrying ``transcript_str``.
    """
    system_message = {
        "role": "system",
        "content": """You are a transcript segmenter. The user will give you a indexed transcript. 
        Your task is to divide the provided transcript into segments that are coherent and meaningful.
        In each segment, the conversation should flow naturally and cover a specific topic or subtopic.
        Reply with the start and end index of each segment with the following JSON format:
        {
            "segments": [
                [start_index_1, end_index_1],
                [start_index_2, end_index_2],
                ...
             ] 
        }
        """
    }
    user_message = {
        "role": "user",
        "content": transcript_str
    }
    return [system_message, user_message]
def segment_list(data, segment_indices):
    """Cut ``data`` into slices described by inclusive ``[start, end]`` index pairs.

    Args:
        data: A sliceable sequence (e.g. the list of transcript turns).
        segment_indices: Iterable of index pairs; only positions 0 (start)
            and 1 (end) of each entry are read.

    Returns:
        A list with one slice per pair, each covering ``start`` through
        ``end`` inclusive, in the order the pairs were given.
    """
    # Index instead of unpacking so entries with extra trailing items still work.
    return [data[bounds[0]:bounds[1] + 1] for bounds in segment_indices]

import glob
from pathlib import Path

# Parallel lists, one entry per transcript file: the segmentation prompt, the
# interviewee name recovered from the filename, and the parsed transcript.
segmentation_prompts = []
interviewees = []
transcripts = []
# sorted() makes the processing order reproducible; glob's order is arbitrary.
for transcript_file in sorted(glob.glob("transcripts/*.json")):
    # Transcripts are written by save_json as UTF-8 with non-ASCII characters
    # kept verbatim, so they must be read back as UTF-8 as well.
    with open(transcript_file, encoding="utf-8") as transcript_fp:
        transcript = json.load(transcript_fp)
    transcript_str = transcript_str_generator(transcript)
    prompt = segmentation_prompt_generator(transcript_str)
    segmentation_prompts.append(prompt)
    # Path.stem drops only the final ".json" suffix and is separator-agnostic,
    # so names containing dots (e.g. "Dr. Smith") are preserved intact.
    interviewees.append(Path(transcript_file).stem)
    transcripts.append(transcript)
# NOTE(review): the zip below assumes gpt.multithread_prompts returns responses
# in the same order as the prompts - confirm against the helper.
responses = gpt.multithread_prompts(openai_client, segmentation_prompts, temperature=0, response_format="json")
# Each response is a JSON string; keep only its "segments" list of [start, end] pairs.
responses = [json.loads(response)['segments'] for response in responses]
for response, interviewee, transcript in zip(responses, interviewees, transcripts):
    segmented_transcript = segment_list(transcript, response)
    save_json(segmented_transcript, f"segmented_transcripts/{interviewee}.json")
