# Formatting transcripts as instructions for training 

In [1]:
import pandas as pd
import random, re
from utils.constants import Paths

In [2]:
# Load the manually cleaned 100-transcript subset of the McCray collection.
df = pd.read_excel(
    Paths.mccray_folder
    + r"manually_cleaned/McCray (1940s, 100 transcript subset).xlsx"
)
print(list(df.columns))

# Sanity check: echo the transcript column's name when the sheet contains it.
for column_name in df.columns:
    if column_name == "Cleaned Transcript":
        print(column_name)
        

['Subset Index', 'Original Index', 'Title', 'Creator', 'Contributors', 'Date', 'Approximate Date', 'Source', 'Subject', 'Local Subject', 'S.C. County', 'Description', 'Extent', 'Digital Collection', 'Website', 'Contributing Institution', 'Rights', 'Time Period', 'Geographic Location', 'Language', 'Digitization Specifications', 'Date Digital', 'Type', 'Format', 'Media Type', 'Identifier', 'Note', 'Digital Assistant', 'Cleaned Transcript', 'OCLC number', 'Date created', 'Date modified', 'Reference URL', 'CONTENTdm number', 'CONTENTdm file name', 'CONTENTdm file path', 'Year']
Cleaned Transcript


## Transcript-completion fine-tuning examples to improve model understanding
### Basic missing text completion
- Input: 'Fill in the missing information from the transcript for {Title}\n {first 1/3}... [this text is missing]... {final 1/3}'
- Output: 'This is the missing text from the transcript for "{Title}"\n\n {middle 1/3}'

- Input: 'Fill in the first half of the missing transcript for {Title}\n ...{final 1/2}'
- Output: 'Here is the rest of the missing text from the transcript for "{Title}"\n\n {first 1/2}'

<!-- - Input: 'Complete the transcript for {Title}\n {first 1/2}...'
- Output: 'Here is the rest of the missing text from the transcript for "{Title}"\n\n {final 1/2}' -->

### Entire transcript missing
- Input: 'Give the complete transcript for {Title}.'
- Output: 'This is the complete transcript for "{Title}"\n\n {transcript}'

### Contextual completion
- Input: 'Based on the date {date} and title {title}, complete this transcript:\n{first 1/2}...'
- Output: 'Here is the rest of the missing text...\n\n {final 1/2}'

In [3]:
# Example draw: a single split point, expressed as a fraction of the document,
# sampled uniformly between 40% and 70% of the way through.
mid_split = random.uniform(0.4, 0.7)
print(mid_split)

def three_split():
    """
    Draw random fractions describing a three-way split of a document.

    Returns a tuple ``(beginning, middle_gap, end_start)`` where
    ``beginning`` is sampled from 0.15-0.25, ``middle_gap`` from 0.30-0.50,
    and ``end_start`` is their sum plus an extra buffer sampled from 0.05-0.15.
    """
    head_fraction = random.uniform(0.15, 0.25)
    gap_fraction = random.uniform(0.30, 0.50)
    buffer_fraction = random.uniform(0.05, 0.15)

    # Summed left to right so the floating-point result matches head + gap + buffer.
    tail_start = head_fraction + gap_fraction + buffer_fraction
    return head_fraction, gap_fraction, tail_start


0.48747193377999887


In [4]:
# Show only the third element of a fresh draw: the fraction at which the final section starts.
print(three_split()[2])

0.6616793079727393


In [5]:
def split_transcript_smart(transcript, split_ratio):
    """
    Cut a transcript in two at a sentence boundary picked by ``split_ratio``.

    A sentence is taken to end at ``.``, ``!`` or ``?`` followed by whitespace;
    that whitespace stays attached to the sentence it follows, so the two
    returned pieces always concatenate back to the original text.

    The ratio is applied to the number of sentences (not characters) and the
    resulting index is clamped: anything at or past the last sentence becomes
    "before the last sentence", and anything at or below zero becomes "after
    the first sentence".

    Returns a ``(first, second)`` tuple of strings.
    """
    # Offsets at which a new sentence begins (the end of each whitespace run).
    boundary_offsets = [
        hit.end() for hit in re.finditer(r'(?<=[.!?])\s+', transcript)
    ]
    sentence_count = len(boundary_offsets) + 1

    cut_index = int(sentence_count * split_ratio)
    if cut_index >= sentence_count:
        cut_index = sentence_count - 1
    elif cut_index <= 0:
        cut_index = 1

    # cut_offsets[k] is the character position after the first k sentences.
    cut_offsets = [0, *boundary_offsets, len(transcript)]
    cut_at = cut_offsets[cut_index]
    return transcript[:cut_at], transcript[cut_at:]

In [6]:
def create_completion_tasks(df, title_col='Title', transcript_col='Cleaned Transcript', date_col='Year', idx_col='Original Index', min_section_words=5, min_full_length=20, seed=None):
    """
    Build instruction-style completion tasks from a DataFrame of transcripts.

    For each usable row, four input/output pairs are generated:

    * ``missing_middle`` - the transcript with a middle span removed; the
      output is that span. The first section ends at a random word index
      between 20% and 35% of the words, and the removed span is between
      ``min_section_words`` words and 50% of the words long.
    * ``complete_first_half`` - only the tail is shown (split at 40-55% of the
      words); the output is the head.
    * ``complete_full`` - only the title is shown; the output is the whole
      (stripped) transcript.
    * ``contextual_completion`` - year, title and the head are shown (split at
      45-55% of the words); the output is the tail.

    Partial sections are rebuilt by joining whitespace-separated words with a
    single space, so original line breaks survive only in ``complete_full``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must contain the four columns named below.
    title_col, transcript_col, date_col, idx_col : str
        Column names for the title, transcript text, year and document id.
    min_section_words : int
        Minimum number of words any generated section must contain.
    min_full_length : int
        Transcripts with fewer words than this are skipped.
    seed : int or None
        When given, a private ``random.Random(seed)`` drives every split so the
        output is reproducible. When ``None`` the module-level ``random``
        generator is used.

    Returns
    -------
    pandas.DataFrame
        One row per task with columns ``Original Index``, ``input``,
        ``output`` and ``task_type``. The columns are present even when no
        task could be generated.

    Notes
    -----
    Rows are skipped (with a printed message) when the transcript is missing
    or not a string, is too short, or cannot accommodate a valid middle
    section. An unexpected error while building a document's tasks is printed
    and processing continues with the next row; tasks already appended for
    that document are kept.
    """
    # A dedicated generator gives repeatable output without touching global state.
    rng = random if seed is None else random.Random(seed)
    task_columns = ['Original Index', 'input', 'output', 'task_type']
    completion_data = []

    for _, row in df.iterrows():
        title = row[title_col]
        transcript = row[transcript_col]
        date = row[date_col]
        idx = row[idx_col]

        # Empty spreadsheet cells arrive as NaN (a float); calling .strip() on
        # them would raise and abort the whole run, so skip such rows instead.
        if not isinstance(transcript, str):
            print(f"Skipping document {idx}: transcript is missing or not text")
            continue

        # Clean and prepare transcript
        transcript = transcript.strip()
        words = transcript.split()
        total_words = len(words)

        # Skip very short transcripts
        if total_words < min_full_length:
            print(f"Skipping document {idx}: too short ({total_words} words)")
            continue

        try:
            # --- Task 1: missing middle -------------------------------------
            # Leave room for a middle and a final section of at least
            # min_section_words words each after the first section.
            max_first = min(int(total_words * 0.35), total_words - 2 * min_section_words)
            min_first = max(min_section_words, int(total_words * 0.20))

            if max_first <= min_first:
                print(f"Skipping document {idx}: cannot create valid middle section")
                continue

            first_end = rng.randint(min_first, max_first)
            max_middle_len = min(int(total_words * 0.50), total_words - first_end - min_section_words)
            min_middle_len = min_section_words

            if max_middle_len <= min_middle_len:
                print(f"Skipping document {idx}: cannot create valid middle section")
                continue

            middle_len = rng.randint(min_middle_len, max_middle_len)
            middle_end = first_end + middle_len

            first_part = ' '.join(words[:first_end])
            middle_part = ' '.join(words[first_end:middle_end])
            final_part = ' '.join(words[middle_end:])

            # Same guard as counting the words of final_part, without re-splitting.
            if total_words - middle_end >= min_section_words:
                completion_data.append({
                    'Original Index': idx,
                    'input': f'Fill in the missing information from the transcript for "{title}":\n{first_part}... [this text is missing]... {final_part}',
                    'output': f'This is the missing text from the transcript for "{title}":\n\n{middle_part}',
                    'task_type': 'missing_middle'
                })

            # --- Task 2: recover the first half from the second -------------
            half_split = rng.randint(
                max(min_section_words, int(total_words * 0.40)),
                min(total_words - min_section_words, int(total_words * 0.55))
            )

            first_half = ' '.join(words[:half_split])
            second_half = ' '.join(words[half_split:])

            completion_data.append({
                'Original Index': idx,
                'input': f'Fill in the first half of the missing transcript for "{title}":\n...{second_half}',
                'output': f'Here is the rest of the missing text from the transcript for "{title}":\n\n{first_half}',
                'task_type': 'complete_first_half'
            })

            # --- Task 3: full transcript from the title alone ---------------
            completion_data.append({
                'Original Index': idx,
                'input': f'Give the complete transcript for "{title}".',
                'output': f'This is the complete transcript for "{title}":\n\n{transcript}',
                'task_type': 'complete_full'
            })

            # --- Task 4: continue the transcript given year and title -------
            context_split = rng.randint(
                max(min_section_words, int(total_words * 0.45)),
                min(total_words - min_section_words, int(total_words * 0.55))
            )

            context_first = ' '.join(words[:context_split])
            context_second = ' '.join(words[context_split:])

            completion_data.append({
                'Original Index': idx,
                'input': f'Based on the year, {date}, and title, "{title}", complete this transcript:\n{context_first}...',
                'output': f'Here is the rest of the missing text...\n\n{context_second}',
                'task_type': 'contextual_completion'
            })

        except Exception as e:
            # Best effort: report the problem and move on to the next document.
            print(f"Error processing document {idx}: {e}")
            continue

    # Fixing the columns keeps the schema stable even when no task was produced.
    return pd.DataFrame(completion_data, columns=task_columns)

In [7]:
# Generate the completion tasks for every transcript in df using the default column names and limits.
completion_df = create_completion_tasks(df)

In [8]:
# Display the generated task table for a quick visual check.
completion_df

Unnamed: 0,Original Index,input,output,task_type
0,1834,Fill in the missing information from the trans...,This is the missing text from the transcript f...,missing_middle
1,1834,Fill in the first half of the missing transcri...,Here is the rest of the missing text from the ...,complete_first_half
2,1834,"Give the complete transcript for ""Memorandum t...","This is the complete transcript for ""Memorandu...",complete_full
3,1834,"Based on the year, 1946, and title, ""Memorandu...",Here is the rest of the missing text...\n\nAt ...,contextual_completion
4,2436,Fill in the missing information from the trans...,This is the missing text from the transcript f...,missing_middle
...,...,...,...,...
399,5964,"Based on the year, 1945, and title, ""Letter to...","Here is the rest of the missing text...\n\nC.,...",contextual_completion
400,5977,Fill in the missing information from the trans...,This is the missing text from the transcript f...,missing_middle
401,5977,Fill in the first half of the missing transcri...,Here is the rest of the missing text from the ...,complete_first_half
402,5977,"Give the complete transcript for ""Letter to Mr...","This is the complete transcript for ""Letter to...",complete_full


In [9]:
# Save the tasks as completion_df.xlsx under Paths.mccray_folder, leaving out the DataFrame index.
completion_df.to_excel(Paths.mccray_folder + "completion_df.xlsx", index=False)