In [37]:
import tiktoken
def count_tokens_in_messages(messages, model="gpt-4o-mini"):
    """Count tokens in messages using tiktoken"""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    total_tokens = 0
    for message in messages:
        # Count tokens for role and content
        total_tokens += len(encoding.encode(message.get("role", "")))
        total_tokens += len(encoding.encode(message.get("content", "")))
        # Add extra tokens for message formatting (estimated)
        total_tokens += 4

    return total_tokens

# model = "gpt-4o-mini"
# for name, content in transcripts.items():
#     encoding = tiktoken.encoding_for_model(model)
#     current_tokens = len(encoding.encode(content))
#     print(current_tokens)

In [38]:
import concurrent
from tqdm import tqdm
import tiktoken
import json
import requests
from collections import defaultdict

# from . import prompts
from openai import RateLimitError, APITimeoutError
import time
from pydantic import BaseModel
from typing import Dict, List
def request_gpt(
    client, messages, model="gpt-4o-mini", temperature=0.5, format=None, seed=None
):
    try:
        if format == "json":
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                response_format={"type": "json_object"},
                temperature=temperature,
                seed=seed,
            )
        else:
            response = client.chat.completions.create(
                model=model, messages=messages, temperature=temperature, seed=seed
            )
        return response.choices[0].message.content
    except RateLimitError as e:
        print("RateLimitError")
        print(e)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)
    except APITimeoutError as e:
        print("APITimeoutError")
        print(messages)
        time.sleep(5)
        return request_gpt(client, messages, model, temperature, format)

In [39]:
import re

def parseTranscript(content: str):
    messages = []
    lines = content.split('\n')
    current_message = None

    for i, line in enumerate(lines):
        line = line.strip()
        
        # Check if line matches speaker pattern: [Speaker Name] hh:mm:ss
        speaker_match = re.match(r'^\[([^\]]+)\]\s+(\d{1,2}:\d{2}:\d{2})$', line)
        
        if speaker_match:
            # If we were building a previous message, save it
            if current_message:
                messages.append(current_message)
            
            # Start a new message
            current_message = {
                'speaker': speaker_match.group(1),
                'timestamp': speaker_match.group(2),
                'content': ""
            }
        elif current_message and line != '':
            # Add content line to current message
            current_message['content'] += line + "\n"
        elif current_message and line == '':
            # Empty line - could be end of message or just spacing
            # We'll keep building the current message until we hit a new speaker
            current_message['content'] += "\n"
    
    # Don't forget to add the last message if it exists
    if current_message:
        messages.append(current_message)
    
    return messages

def messages_to_string(messages):
    messages_str = ""
    for index, message in enumerate(messages):
        messages_str += f"{index}: [{message['speaker']}] {message['content']}"
    return messages_str

# Test the function with the Alex transcript
# if 'Alex' in transcripts:
#     parsed_messages = parseTranscript(transcripts['Alex'])
#     messages_str = messages_to_string(parsed_messages)
#     print(messages_str)

In [40]:
def segmentation_prompt(messages_str):
    return [
        {
            "role": "system",
            "content": """You are a helpful assistant that segment transcripts.
            The user will give you a transcript with indices for each message, and the criteria for segmentation.
            You will follow the criteria to segment the transcript into sections, providing the start and end indices for each segment.
            Reply in the following JSON format:
            {
                "segments": [
                    {
                        "start_index": <int>,
                        "end_index": <int>,
                        "title": "<str>"
                    },
                    ...
                ]
            }
            """
        },
        {
            "role": "user",
            "content": """
            This transcript is from a user study. The study is divided into an introduction session, three scenario/task sessions, each followed by a brief questionnaire, and then a final interview session. 
            Here is the transcript:
            {transcript}
            
            Please segment the transcript into sections based on the following criteria:
            - The first segment is the introduction, where one speaker introduces the topic and procedure.
            - The second segment is the first scenario/task session with its questionnaire.
            - The third segment is the second scenario/task session with its questionnaire.
            - The fourth segment is the third scenario/task session with its questionnaire.
            - The final segment is the interview session.
            Return the segments in the specified JSON format.
            """.format(transcript=messages_str)
        }
    ]

In [35]:
import os
import glob
from openai import OpenAI
# read txt files from transcripts/**/*.txt
transcript_files = glob.glob("transcripts/Cars/*.txt") + glob.glob("transcripts/Movies/*.txt")
transcripts = {}

for file_path in transcript_files:
    print(f"Reading transcript file: {file_path}")
    # Extract filename without extension to use as key
    filename = os.path.splitext(os.path.basename(file_path))[0]
    
    # Read the content of each file
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        transcripts[filename] = content

print(f"Found {len(transcripts)} transcript files:")
for name in transcripts.keys():
    print(f"- {name}")

Reading transcript file: transcripts/Cars/Shasha.txt
Reading transcript file: transcripts/Cars/Chris.txt
Reading transcript file: transcripts/Cars/Sage.txt
Reading transcript file: transcripts/Cars/Sebastian.txt
Reading transcript file: transcripts/Cars/Ederson.txt
Reading transcript file: transcripts/Cars/Sarah.txt
Reading transcript file: transcripts/Cars/Advait.txt
Reading transcript file: transcripts/Cars/Alex.txt
Reading transcript file: transcripts/Cars/Ezekiel.txt
Reading transcript file: transcripts/Cars/Ian.txt
Reading transcript file: transcripts/Movies/Shay.txt
Reading transcript file: transcripts/Movies/Zoe.txt
Reading transcript file: transcripts/Movies/Jacob.txt
Reading transcript file: transcripts/Movies/Maria.txt
Reading transcript file: transcripts/Movies/Hailey.txt
Reading transcript file: transcripts/Movies/Atharva.txt
Reading transcript file: transcripts/Movies/Crosby.txt
Reading transcript file: transcripts/Movies/Ronny.txt
Reading transcript file: transcripts/Movi

In [52]:
client = OpenAI()
responses = []
for name, content in transcripts.items():
    messages = parseTranscript(content)
    messages_to_str = messages_to_string(messages)
    prompt = segmentation_prompt(messages_to_str)
    segmentation_response = request_gpt(client, prompt, model="gpt-4o-mini", temperature=0, format="json")
    responses.append((name, messages, segmentation_response))

In [48]:
responses[0][1]

[{'speaker': 'Chifang Chou',
  'timestamp': '15:14:07',
  'content': 'Hello. Can you hear me? I can hear your voice. Hello.\n\n'},
 {'speaker': 'Chifang Chou',
  'timestamp': '15:14:19',
  'content': 'Seems like your mic is muted.\n\n'},
 {'speaker': 'iPhone (50)',
  'timestamp': '15:14:29',
  'content': 'Okay. Hi. Okay.\n\n'},
 {'speaker': 'Chifang Chou',
  'timestamp': '15:14:29',
  'content': 'Hello? Yeah, I can… Hi. Um, so, um, are you joining through iPhone, is it?\n\n'},
 {'speaker': 'iPhone (50)',
  'timestamp': '15:14:39',
  'content': "Yeah, because I use my, uh… Windows system made sure that I can't open the, uh, camera, so I use the iPhone.\n\n"},
 {'speaker': 'Chifang Chou',
  'timestamp': '15:14:48',
  'content': "Okay, so, um, cause, uh, later I'll provide you the, like, the account, the files, the data set through.\n\n"},
 {'speaker': 'Chifang Chou',
  'timestamp': '15:14:56',
  'content': 'Zoom chat, so… Is it possible for you to do it through the… computer, your laptop

In [49]:
segmentation_response = json.loads(responses[0][2])
segmentation_response

{'segments': [{'start_index': 0, 'end_index': 31, 'title': 'Introduction'},
  {'start_index': 32, 'end_index': 82, 'title': 'First Scenario/Task Session'},
  {'start_index': 101,
   'end_index': 301,
   'title': 'Second Scenario/Task Session'},
  {'start_index': 302,
   'end_index': 373,
   'title': 'Third Scenario/Task Session'},
  {'start_index': 374, 'end_index': 376, 'title': 'Interview Session'}]}

In [53]:
def parse_index(segment, messages):
    start_index = segment["start_index"]
    end_index = segment["end_index"]
    segment_messages = messages[start_index:end_index + 1]
    segment["messages"] = segment_messages
    return segment
for name, messages, segmentation_response_str in responses:
    segmentation_list = json.loads(segmentation_response_str)
    print(segmentation_list)
    # create directory segmented/{name}
    os.makedirs(f"segmented/{name}", exist_ok=True)
    for index, segment_response in enumerate(segmentation_list["segments"]):
        segmented_with_messages = parse_index(segment_response, messages)
        print(f"Transcript: {name}")
        with open(f"segmented/{name}/{index}.json", "w") as f:
            json.dump(segmented_with_messages, f, indent=4)

{'segments': [{'start_index': 0, 'end_index': 31, 'title': 'Introduction'}, {'start_index': 32, 'end_index': 83, 'title': 'First Scenario/Task Session with Questionnaire'}, {'start_index': 268, 'end_index': 301, 'title': 'Second Scenario/Task Session with Questionnaire'}, {'start_index': 302, 'end_index': 373, 'title': 'Third Scenario/Task Session with Questionnaire'}, {'start_index': 374, 'end_index': 376, 'title': 'Interview Session'}]}
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
Transcript: Shasha
{'segments': [{'start_index': 0, 'end_index': 19, 'title': 'Introduction'}, {'start_index': 20, 'end_index': 66, 'title': 'Scenario 1 Task Session and Questionnaire'}, {'start_index': 188, 'end_index': 201, 'title': 'Scenario 2 Task Session and Questionnaire'}, {'start_index': 232, 'end_index': 275, 'title': 'Scenario 3 Task Session and Questionnaire'}, {'start_index': 278, 'end_index': 356, 'title': 'Final Interview Session'}]}
Transcript: Chris
Transcript:

In [32]:
from boxsdk import Client, OAuth2
token = "OAafdBbbgjfAI70MV1QspEgxMbAS0dLH"
oauth = OAuth2(client_id="tw3cv4gyun93nwxwep4b4lxtbuppc9ub", client_secret="E31ZiBvLuU0DGjJi7Is2arCu7r2YshQI", access_token=token)
client = Client(oauth)
shared_link_url = "https://ucdavis.app.box.com/s/xonzrpx7fo1rw7sfw3ov7x49yw7xy2cw/folder/350818283216"  # Replace with actual URL
items = client.folder('0').get_items()
# If it's a web_link, you need to get the shared link URL first
for item in items:
    if item.type == 'web_link':
        print(f"Found shared folder: {item.name}")
        # Get the web link object
        web_link = client.web_link(item.id).get()
        print(f"Shared link URL: {web_link.url}")
        
        # If you have the actual shared link URL, you can access it like this:
        # shared_link_url = web_link.url  # or use the direct URL if you know it
        
        # Create a client for the shared link
        try:
            # You may need to use the shared link to access the folder
            shared_folder = client.get_shared_item(web_link.url)
            shared_items = shared_folder.get_items()
            
            print("Items in shared folder:")
            for shared_item in shared_items:
                print(f"{shared_item.type}: {shared_item.name} (ID: {shared_item.id})")
                
        except Exception as e:
            print(f"Error accessing shared folder: {e}")
            print("You may need the password or different permissions")


  client = Client(oauth)


Found shared folder: Shared Folder - AI-generated visualization User study data
Shared link URL: https://ucdavis.app.box.com/s/xonzrpx7fo1rw7sfw3ov7x49yw7xy2cw/folder/350818283216


[31m"GET https://api.box.com/2.0/shared_items" 404 158
{'date': 'Mon, 08 Dec 2025 20:08:32 GMT', 'content-type': 'application/json', 'x-envoy-upstream-service-time': '102', 'box-request-id': '09adec2b28ed67638ed39316e7744d84b', 'cache-control': 'no-cache, no-store', 'strict-transport-security': 'max-age=31536000', 'via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'}
{'code': '---ound',
 'help_url': 'http://developers.box.com/docs/#errors',
 'message': 'Not Found',
 'request_id': 'v64atti844tp27g8',
 'status': 404,
 'type': 'error'}
[0m


Error accessing shared folder: Message: Not Found
Status: 404
Code: not_found
Request ID: v64atti844tp27g8
Headers: {'date': 'Mon, 08 Dec 2025 20:08:32 GMT', 'content-type': 'application/json', 'x-envoy-upstream-service-time': '102', 'box-request-id': '09adec2b28ed67638ed39316e7744d84b', 'cache-control': 'no-cache, no-store', 'strict-transport-security': 'max-age=31536000', 'via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'Transfer-Encoding': 'chunked'}
URL: https://api.box.com/2.0/shared_items
Method: GET
Context Info: None
You may need the password or different permissions


In [58]:
de_id_list = open("de_id", 'r').read().splitlines()
print(de_id_list)
de_id_dict = {}
for (original_name, de_id_name) in zip(de_id_list[:20], de_id_list[20:]):
    print(original_name, "-", de_id_name)
    de_id_dict[original_name] = de_id_name





['Esther Biden', 'Malik Ashan', 'Jaylon Cooper', 'Ronny', 'Chris Martin ', 'Zoe Cheng', 'Advait ', 'Shay Drake', 'Alex Marasigan', 'Atharva Harshe', 'jacob adams', 'jack ederson', 'Ina Chou', 'Sebastian', 'Ian Wong', 'Maria Rodriguez', 'Jian Chen', 'Sarah Odinma ', 'Crosby Cox', 'Sage Zope', 'Maya', 'Oliver', 'Serena', 'Devin', 'Lina', 'Jasper', 'Naomi', 'Theo', 'Carmen', 'Felix', 'Aria', 'Marcus', 'Tessa', 'Gabriel', 'Rhea', 'Daniel', 'Ivy', 'Victor', 'Elinor', 'Nolan']
Esther Biden - Maya
Malik Ashan - Oliver
Jaylon Cooper - Serena
Ronny - Devin
Chris Martin  - Lina
Zoe Cheng - Jasper
Advait  - Naomi
Shay Drake - Theo
Alex Marasigan - Carmen
Atharva Harshe - Felix
jacob adams - Aria
jack ederson - Marcus
Ina Chou - Tessa
Sebastian - Gabriel
Ian Wong - Rhea
Maria Rodriguez - Daniel
Jian Chen - Ivy
Sarah Odinma  - Victor
Crosby Cox - Elinor
Sage Zope - Nolan


In [59]:
import shutil
import os

# Create the de_id_segmented directory
os.makedirs("de_id_segmented", exist_ok=True)

# Get list of folders in segmented directory
segmented_folders = [item for item in os.listdir("segmented") if os.path.isdir(os.path.join("segmented", item))]
print("Original folders in segmented directory:", segmented_folders)

# Create a mapping from lowercase original names to de-id names
lowercase_de_id_dict = {}
for original_name, de_id_name in de_id_dict.items():
    lowercase_de_id_dict[original_name.lower().strip()] = de_id_name

print("\nLowercase de_id mapping:")
for orig, de_id in lowercase_de_id_dict.items():
    print(f"  '{orig}' -> '{de_id}'")

# Copy folders with name replacement
for folder in segmented_folders:
    folder_lower = folder.lower().strip()
    
    # Find matching de-id name (case-insensitive)
    de_id_folder_name = None
    for orig_name_lower, de_id_name in lowercase_de_id_dict.items():
        if folder_lower == orig_name_lower:
            de_id_folder_name = de_id_name
            break
    
    if de_id_folder_name:
        # Copy the entire folder with new name
        src_path = os.path.join("segmented", folder)
        dst_path = os.path.join("de_id_segmented", de_id_folder_name)
        shutil.copytree(src_path, dst_path)
        print(f"Copied '{folder}' -> '{de_id_folder_name}'")
    else:
        print(f"Warning: No de-id mapping found for folder '{folder}'")

# Also copy the README.md file if it exists
readme_src = "segmented/README.md"
readme_dst = "de_id_segmented/README.md"
if os.path.exists(readme_src):
    shutil.copy2(readme_src, readme_dst)
    print("Copied README.md")

print("\nDe-identification completed!")
print("Contents of de_id_segmented directory:")
for item in sorted(os.listdir("de_id_segmented")):
    print(f"  {item}")

Original folders in segmented directory: ['Alex', 'Zoe', 'Ian', 'Shay', 'Shasha', 'Jian', 'Hailey', 'Advait', 'Ina', 'Jacob', 'Ezekiel', 'Ronny', 'Jaylon', 'Chris', 'Crosby', 'Maria', 'Sebastian', 'Sarah', 'Ederson', 'Atharva', 'Sage']

Lowercase de_id mapping:
  'esther biden' -> 'Maya'
  'malik ashan' -> 'Oliver'
  'jaylon cooper' -> 'Serena'
  'ronny' -> 'Devin'
  'chris martin' -> 'Lina'
  'zoe cheng' -> 'Jasper'
  'advait' -> 'Naomi'
  'shay drake' -> 'Theo'
  'alex marasigan' -> 'Carmen'
  'atharva harshe' -> 'Felix'
  'jacob adams' -> 'Aria'
  'jack ederson' -> 'Marcus'
  'ina chou' -> 'Tessa'
  'sebastian' -> 'Gabriel'
  'ian wong' -> 'Rhea'
  'maria rodriguez' -> 'Daniel'
  'jian chen' -> 'Ivy'
  'sarah odinma' -> 'Victor'
  'crosby cox' -> 'Elinor'
  'sage zope' -> 'Nolan'
Copied 'Advait' -> 'Naomi'
Copied 'Ronny' -> 'Devin'
Copied 'Sebastian' -> 'Gabriel'
Copied README.md

De-identification completed!
Contents of de_id_segmented directory:
  Devin
  Gabriel
  Naomi
  README.

In [60]:
# Remove the previous incomplete attempt
if os.path.exists("de_id_segmented"):
    shutil.rmtree("de_id_segmented")

# Create the de_id_segmented directory
os.makedirs("de_id_segmented", exist_ok=True)

# Get list of folders in segmented directory
segmented_folders = [item for item in os.listdir("segmented") if os.path.isdir(os.path.join("segmented", item))]
print("Original folders in segmented directory:", segmented_folders)

print("\nDe_id_dict mapping:")
for orig, de_id in de_id_dict.items():
    print(f"  '{orig}' -> '{de_id}'")

# Copy folders with name replacement using partial matching
for folder in segmented_folders:
    folder_lower = folder.lower().strip()
    
    # Find matching de-id name by checking if folder name appears in the original names
    de_id_folder_name = None
    for orig_name, de_id_name in de_id_dict.items():
        orig_name_lower = orig_name.lower().strip()
        
        # Check if folder name matches the beginning of original name or is contained in it
        if (folder_lower in orig_name_lower or 
            orig_name_lower.startswith(folder_lower) or
            any(word.startswith(folder_lower) for word in orig_name_lower.split())):
            de_id_folder_name = de_id_name
            print(f"Matched '{folder}' with '{orig_name}' -> '{de_id_name}'")
            break
    
    if de_id_folder_name:
        # Copy the entire folder with new name
        src_path = os.path.join("segmented", folder)
        dst_path = os.path.join("de_id_segmented", de_id_folder_name)
        shutil.copytree(src_path, dst_path)
        print(f"Copied '{folder}' -> '{de_id_folder_name}'")
    else:
        print(f"Warning: No de-id mapping found for folder '{folder}'")

# Also copy the README.md file if it exists
readme_src = "segmented/README.md"
readme_dst = "de_id_segmented/README.md"
if os.path.exists(readme_src):
    shutil.copy2(readme_src, readme_dst)
    print("Copied README.md")

print("\nDe-identification completed!")
print("Contents of de_id_segmented directory:")
for item in sorted(os.listdir("de_id_segmented")):
    print(f"  {item}")

Original folders in segmented directory: ['Alex', 'Zoe', 'Ian', 'Shay', 'Shasha', 'Jian', 'Hailey', 'Advait', 'Ina', 'Jacob', 'Ezekiel', 'Ronny', 'Jaylon', 'Chris', 'Crosby', 'Maria', 'Sebastian', 'Sarah', 'Ederson', 'Atharva', 'Sage']

De_id_dict mapping:
  'Esther Biden' -> 'Maya'
  'Malik Ashan' -> 'Oliver'
  'Jaylon Cooper' -> 'Serena'
  'Ronny' -> 'Devin'
  'Chris Martin ' -> 'Lina'
  'Zoe Cheng' -> 'Jasper'
  'Advait ' -> 'Naomi'
  'Shay Drake' -> 'Theo'
  'Alex Marasigan' -> 'Carmen'
  'Atharva Harshe' -> 'Felix'
  'jacob adams' -> 'Aria'
  'jack ederson' -> 'Marcus'
  'Ina Chou' -> 'Tessa'
  'Sebastian' -> 'Gabriel'
  'Ian Wong' -> 'Rhea'
  'Maria Rodriguez' -> 'Daniel'
  'Jian Chen' -> 'Ivy'
  'Sarah Odinma ' -> 'Victor'
  'Crosby Cox' -> 'Elinor'
  'Sage Zope' -> 'Nolan'
Matched 'Alex' with 'Alex Marasigan' -> 'Carmen'
Copied 'Alex' -> 'Carmen'
Matched 'Zoe' with 'Zoe Cheng' -> 'Jasper'
Copied 'Zoe' -> 'Jasper'
Matched 'Ian' with 'Sebastian' -> 'Gabriel'
Copied 'Ian' -> 'Gabr

FileExistsError: [Errno 17] File exists: 'de_id_segmented/Gabriel'

In [61]:
# Remove the previous incomplete attempt
if os.path.exists("de_id_segmented"):
    shutil.rmtree("de_id_segmented")

# Create the de_id_segmented directory
os.makedirs("de_id_segmented", exist_ok=True)

# Get list of folders in segmented directory
segmented_folders = [item for item in os.listdir("segmented") if os.path.isdir(os.path.join("segmented", item))]
print("Original folders in segmented directory:", segmented_folders)

print("\nDe_id_dict mapping:")
for orig, de_id in de_id_dict.items():
    print(f"  '{orig}' -> '{de_id}'")

# Keep track of used de_id names to avoid duplicates
used_de_id_names = set()

# Copy folders with name replacement using better matching
for folder in segmented_folders:
    folder_lower = folder.lower().strip()
    
    # Find the best matching de-id name
    best_match = None
    best_score = 0
    
    for orig_name, de_id_name in de_id_dict.items():
        orig_name_lower = orig_name.lower().strip()
        
        # Skip if this de_id name is already used
        if de_id_name in used_de_id_names:
            continue
            
        # Calculate match score
        score = 0
        
        # Exact match gets highest score
        if folder_lower == orig_name_lower:
            score = 100
        # Check if folder name is a word in the original name
        elif folder_lower in orig_name_lower.split():
            score = 90
        # Check if folder name starts the original name
        elif orig_name_lower.startswith(folder_lower):
            score = 80
        # Check if folder name is contained in original name
        elif folder_lower in orig_name_lower:
            score = 70
            
        if score > best_score:
            best_score = score
            best_match = (orig_name, de_id_name)
    
    if best_match and best_score > 0:
        orig_name, de_id_folder_name = best_match
        used_de_id_names.add(de_id_folder_name)
        
        # Copy the entire folder with new name
        src_path = os.path.join("segmented", folder)
        dst_path = os.path.join("de_id_segmented", de_id_folder_name)
        shutil.copytree(src_path, dst_path)
        print(f"Copied '{folder}' (matched with '{orig_name}') -> '{de_id_folder_name}'")
    else:
        print(f"Warning: No de-id mapping found for folder '{folder}'")

# Also copy the README.md file if it exists
readme_src = "segmented/README.md"
readme_dst = "de_id_segmented/README.md"
if os.path.exists(readme_src):
    shutil.copy2(readme_src, readme_dst)
    print("Copied README.md")

print("\nDe-identification completed!")
print("Contents of de_id_segmented directory:")
for item in sorted(os.listdir("de_id_segmented")):
    print(f"  {item}")

Original folders in segmented directory: ['Alex', 'Zoe', 'Ian', 'Shay', 'Shasha', 'Jian', 'Hailey', 'Advait', 'Ina', 'Jacob', 'Ezekiel', 'Ronny', 'Jaylon', 'Chris', 'Crosby', 'Maria', 'Sebastian', 'Sarah', 'Ederson', 'Atharva', 'Sage']

De_id_dict mapping:
  'Esther Biden' -> 'Maya'
  'Malik Ashan' -> 'Oliver'
  'Jaylon Cooper' -> 'Serena'
  'Ronny' -> 'Devin'
  'Chris Martin ' -> 'Lina'
  'Zoe Cheng' -> 'Jasper'
  'Advait ' -> 'Naomi'
  'Shay Drake' -> 'Theo'
  'Alex Marasigan' -> 'Carmen'
  'Atharva Harshe' -> 'Felix'
  'jacob adams' -> 'Aria'
  'jack ederson' -> 'Marcus'
  'Ina Chou' -> 'Tessa'
  'Sebastian' -> 'Gabriel'
  'Ian Wong' -> 'Rhea'
  'Maria Rodriguez' -> 'Daniel'
  'Jian Chen' -> 'Ivy'
  'Sarah Odinma ' -> 'Victor'
  'Crosby Cox' -> 'Elinor'
  'Sage Zope' -> 'Nolan'
Copied 'Alex' (matched with 'Alex Marasigan') -> 'Carmen'
Copied 'Zoe' (matched with 'Zoe Cheng') -> 'Jasper'
Copied 'Ian' (matched with 'Ian Wong') -> 'Rhea'
Copied 'Shay' (matched with 'Shay Drake') -> 'The

In [62]:
import json
import glob

# Create a reverse mapping from de-id names back to original names
original_to_de_id = {}
for orig_name, de_id_name in de_id_dict.items():
    original_to_de_id[orig_name.lower().strip()] = de_id_name

print("Original to de-id mapping:")
for orig, de_id in original_to_de_id.items():
    print(f"  '{orig}' -> '{de_id}'")

# Get all JSON files in de_id_segmented directory
json_files = glob.glob("de_id_segmented/*/*.json")
print(f"\nFound {len(json_files)} JSON files to process")

# Process each JSON file
for json_file in json_files:
    print(f"\nProcessing: {json_file}")
    
    # Extract folder name (which is the de-identified name)
    folder_name = os.path.basename(os.path.dirname(json_file))
    
    # Read the JSON file
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Track changes for logging
    changes_made = 0
    
    # Process each message
    for message in data.get('messages', []):
        original_speaker = message.get('speaker', '')
        
        # Skip if speaker is already "Chifang Chou"
        if original_speaker == "Chifang Chou":
            continue
            
        # Find if this speaker should be replaced with the folder name
        speaker_lower = original_speaker.lower().strip()
        should_replace = False
        
        # Check if this speaker corresponds to the folder's original name
        for orig_name_lower, de_id_name in original_to_de_id.items():
            if de_id_name == folder_name:
                # Check if the speaker matches this original name
                if (speaker_lower == orig_name_lower or 
                    speaker_lower in orig_name_lower or 
                    orig_name_lower.split()[0] == speaker_lower.split()[0]):
                    should_replace = True
                    break
        
        # Replace speaker name with folder name if it should be replaced
        if should_replace:
            message['speaker'] = folder_name
            changes_made += 1
            print(f"  Changed '{original_speaker}' -> '{folder_name}'")
    
    # Write back the updated JSON file if changes were made
    if changes_made > 0:
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        print(f"  Updated {changes_made} speaker entries in {json_file}")
    else:
        print(f"  No changes needed for {json_file}")

print("\nSpeaker de-identification completed!")

Original to de-id mapping:
  'esther biden' -> 'Maya'
  'malik ashan' -> 'Oliver'
  'jaylon cooper' -> 'Serena'
  'ronny' -> 'Devin'
  'chris martin' -> 'Lina'
  'zoe cheng' -> 'Jasper'
  'advait' -> 'Naomi'
  'shay drake' -> 'Theo'
  'alex marasigan' -> 'Carmen'
  'atharva harshe' -> 'Felix'
  'jacob adams' -> 'Aria'
  'jack ederson' -> 'Marcus'
  'ina chou' -> 'Tessa'
  'sebastian' -> 'Gabriel'
  'ian wong' -> 'Rhea'
  'maria rodriguez' -> 'Daniel'
  'jian chen' -> 'Ivy'
  'sarah odinma' -> 'Victor'
  'crosby cox' -> 'Elinor'
  'sage zope' -> 'Nolan'

Found 90 JSON files to process

Processing: de_id_segmented/Devin/0.json
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -> 'Devin'
  Changed 'Ronny' -

In [None]:
# Verification: Check speaker names in a few sample files
sample_files = [
    "de_id_segmented/Carmen/0.json",
    "de_id_segmented/Jasper/1.json", 
    "de_id_segmented/Rhea/2.json"
]

print("Verification - Sample speaker names after de-identification:")
for file_path in sample_files:
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        speakers = set()
        for message in data.get('messages', []):
            speakers.add(message.get('speaker', ''))
        
        print(f"\n{file_path}:")
        for speaker in sorted(speakers):
            print(f"  - {speaker}")

# Count total files processed
total_files = len(glob.glob("de_id_segmented/*/*.json"))
print(f"\nTotal JSON files processed: {total_files}")
print("All speaker names should now be either 'Chifang Chou' or the folder name (de-identified name)")