# =============================================================================
# Step 1: Import Necessary Libraries
# =============================================================================

In [2]:
# 'os' is used for interacting with the operating system, like creating file paths.
import os
# 'json' is used for parsing JSON strings and files.
import json
# 'pickle' is used for serializing and de-serializing Python object structures (saving our data).
import pickle

# =============================================================================
# Step 2: Configure Folder Paths and Output Filename
# =============================================================================

In [36]:
# Absolute locations of the four category folders that hold the JSON files.
# The shared parent directory is written once as a raw string (r"...") so its
# backslashes are taken literally; each category folder name is then appended
# with an explicit backslash separator.
_DATA_ROOT = r"C:\Users\pfeil\My Drive\Studys\CAS - NLB - Uni Bern\Projects\RAG - API connection\Data"
FOLDER_PATHS = [
    _DATA_ROOT + "\\" + category
    for category in (
        "automotive_inbound_50",
        "home_service_inbound_50",
        "insurance_outbound_50",
        "medical_equipment_outbound_50",
    )
]

In [37]:
# Filename of the pickle file that receives the repackaged data.
# It has no directory component, so it is resolved relative to the current
# working directory when the file is opened.
OUTPUT_PICKLE_FILE = "repackaged_transcript_data.pkl"

# =============================================================================
# Step 3: Find and Sort All JSON Files from Each Folder
# =============================================================================

In [38]:
# Build a list of lists: one inner list of sorted JSON filenames per folder,
# in the same order as FOLDER_PATHS, so files can later be paired by index.
all_sorted_files = []

# Number of JSON files each folder is expected to contain.
EXPECTED_FILES_PER_FOLDER = 50

# Loop through each folder path we defined earlier.
for folder in FOLDER_PATHS:
    try:
        # Keep the try body minimal: only the directory listing is guarded.
        files_in_folder = os.listdir(folder)
    except FileNotFoundError:
        # The folder path is incorrect or the folder doesn't exist.
        print(f"ERROR: The folder was not found: {folder}")
        # Keep one (empty) entry per folder so indices stay aligned with FOLDER_PATHS.
        all_sorted_files.append([])
        continue
    except OSError as err:
        # Any other listing failure (e.g. the path is a file, or access is
        # denied) is reported and treated like a missing folder instead of
        # aborting the whole run.
        print(f"ERROR: The folder could not be read: {folder} ({err})")
        all_sorted_files.append([])
        continue

    # Keep only regular files ending in '.json'; a sub-directory that happens
    # to be named '*.json' would otherwise fail later when it is opened.
    # IMPORTANT: the names are sorted as plain strings, so '01.json' comes
    # before '02.json', but un-padded names would order '10.json' before '2.json'.
    json_files = sorted(
        f for f in files_in_folder
        if f.endswith('.json') and os.path.isfile(os.path.join(folder, f))
    )
    # Add the sorted list of filenames for this folder to our main list.
    all_sorted_files.append(json_files)
    # A quick check to make sure we found the expected number of files.
    if len(json_files) != EXPECTED_FILES_PER_FOLDER:
        print(f"Warning: Found {len(json_files)} JSON files in {os.path.basename(folder)}, not {EXPECTED_FILES_PER_FOLDER}.")


# =============================================================================
# Step 4: Process Files and Repackage Data
# =============================================================================

In [39]:
# This is the main list where we will store all the repackaged data.
# Each element is one row: a list with one entry per folder, holding either
# the value of the "text" key from that folder's i-th JSON file or None.
repackaged_data = []

# Number of rows to build, i.e. how many file indices are read from each folder.
ROWS_TO_BUILD = 50

print("Starting data extraction and repackaging process...")
for i in range(ROWS_TO_BUILD):
    # This list will hold the data for one row (one entry from each folder).
    current_row_data = []

    # Walk the folders and their sorted filename lists in lockstep.
    for folder_path, file_list in zip(FOLDER_PATHS, all_sorted_files):
        # None is the placeholder whenever the text cannot be obtained.
        text_content = None
        try:
            # Get the filename at the current index 'i' (IndexError if the folder is short).
            filename = file_list[i]
            # Construct the full path to the JSON file.
            full_file_path = os.path.join(folder_path, filename)

            # Open the JSON file for reading. 'with open' handles closing the file automatically.
            with open(full_file_path, 'r', encoding='utf-8') as f:
                # Parse the file's content into a Python object.
                data = json.load(f)
            # Extract the value associated with the "text" key.
            text_content = data["text"]

        except (FileNotFoundError, IndexError):
            # 1. FileNotFoundError: the file disappeared or the path is wrong.
            # 2. IndexError: this folder has fewer files than the requested index.
            print(f"⚠️ Warning: File not found or missing at index {i} in folder {os.path.basename(folder_path)}. Inserting 'None'.")
        except (OSError, json.JSONDecodeError, UnicodeDecodeError, KeyError, TypeError) as e:
            # 1. OSError: the file exists but cannot be opened/read (e.g. permissions).
            # 2. json.JSONDecodeError / UnicodeDecodeError: not valid UTF-8 JSON.
            # 3. KeyError: the JSON object has no "text" field.
            # 4. TypeError: the JSON top level is not an object (e.g. a list),
            #    so it cannot be indexed with the "text" key.
            # 'filename' is always bound here: only the IndexError handled above
            # can occur before it is assigned.
            print(f"⚠️ Warning: Error processing file '{filename}' ({e}). Inserting 'None'.")

        # Exactly one entry per folder, so every row has the same column layout.
        current_row_data.append(text_content)

    # After processing one file from each folder, add the complete row to our main data list.
    repackaged_data.append(current_row_data)

print("Data processing complete.")

Starting data extraction and repackaging process...
Data processing complete.


# =============================================================================
# Step 5: Save the Repackaged Data to a Pickle File
# =============================================================================

In [40]:
# Persist the 'repackaged_data' list to disk.
# pickle produces bytes, hence the binary write mode ('wb'); the 'with'
# statement closes the file once the dump has finished.
with open(OUTPUT_PICKLE_FILE, 'wb') as pickle_out:
    # Serialize the Python object straight into the open file.
    pickle.dump(repackaged_data, pickle_out)

print(f"\nSuccess! All data has been repackaged and saved to '{OUTPUT_PICKLE_FILE}'")


Success! All data has been repackaged and saved to 'repackaged_transcript_data.pkl'


# =============================================================================
# Step 6: Verify the Output
# =============================================================================

In [41]:
# Read the pickle file back in and report on its contents as a sanity check.
print("\n--- Verifying the output file ---")
with open(OUTPUT_PICKLE_FILE, 'rb') as pickle_in:
    # Deserialize the stored object into 'loaded_data'.
    loaded_data = pickle.load(pickle_in)

# The data is in memory now, so the reporting happens after the file is closed.
row_total = len(loaded_data)
# Show how many rows were stored.
print(f"Total number of rows in the pickle file: {row_total}")
# Show the first row as a sample, provided there is at least one row.
if row_total:
    print("Sample of the first row:")
    print(loaded_data[0])


--- Verifying the output file ---
Total number of rows in the pickle file: 50
Sample of the first row:
["Thank you for calling [ORGANIZATION] [ORGANIZATION] [ORGANIZATION], [ORGANIZATION]. This is [PERSON_NAME] speaking. How can I help you? Hi, Mr. [PERSON_NAME], please speak to Mr. [PERSON_NAME], please. This is [PERSON_NAME] [PERSON_NAME]. Let's see here. [PERSON_NAME] [PERSON_NAME]? Or do you need [PERSON_NAME] [PERSON_NAME]? I believe it's [PERSON_NAME] [PERSON_NAME]. He's the [OCCUPATION], I believe. Yes, ma' am. Let me see if I can put you on a brief hold and get a hold of him. Okay, thank you. It alrighty. I'll get you transferred right to him. Okay. Yes, thank you. You're [ORGANIZATION]. Thanks for calling [ORGANIZATION] [ORGANIZATION] [ORGANIZATION]. [ORGANIZATION], how may I help you? Hi, this is [PERSON_NAME] again. I'm so sorry to keep bothering you. I just got off the phone. I just got off the phone with my [OCCUPATION] and he asked me just in case I get pulled over, is i

In [None]:
# Print all transcripts for every category.
# The category label of each column is the last path component of its folder.
category_names = [os.path.basename(path) for path in FOLDER_PATHS]
# '\n' (a real newline escape) puts a blank line before each heading; the
# doubled backslash used previously printed the two characters '\' and 'n'.
print('\n--- Full transcript listing by category ---')
for idx, row in enumerate(loaded_data, start=1):
    print(f'\n=== Transcript {idx} ===')
    # Pair each entry of the row with the category label of its column.
    for cat, text in zip(category_names, row):
        print(f'[{cat}]')
        # Missing transcripts are stored as None; show them explicitly.
        print(text if text is not None else 'None')
        print()


In [42]:
# Table of whitespace-separated word counts with the same row/column layout
# as 'loaded_data'.
# A missing transcript (None) counts as 0 words; converting it with str()
# would yield the text "None" and be miscounted as one word.
# str() is still applied to other values so non-string entries do not raise.
size_table = [
    [0 if s is None else len(str(s).split()) for s in row]
    for row in loaded_data
]


In [43]:
import heapq

# Flatten size_table (a list of lists of counts) into (value, row, column)
# triples, dropping any cell that is not a number.
vals = [
    (count, row_idx, col_idx)
    for row_idx, counts in enumerate(size_table)
    for col_idx, count in enumerate(counts)
    if isinstance(count, (int, float))
]

# Report at most ten cells, or fewer when the table is smaller than that.
k = min(10, len(vals))
# Largest values first; only the value part of each triple is compared.
topk = heapq.nlargest(k, vals, key=lambda cell: cell[0])

for rank, (v, i, j) in enumerate(topk, start=1):
    print(f"{rank:2d}. value={v} at cell [{i}][{j}]")

 1. value=499 at cell [43][3]
 2. value=496 at cell [45][3]
 3. value=494 at cell [20][3]
 4. value=488 at cell [33][3]
 5. value=487 at cell [21][3]
 6. value=486 at cell [13][3]
 7. value=481 at cell [25][3]
 8. value=478 at cell [0][3]
 9. value=477 at cell [7][3]
10. value=477 at cell [23][3]
