In [6]:
# Combine manually cleaned transcripts into "Clean Transcripts" column
## Replacing automated Cleaned Transcripts

In [7]:
import pandas as pd
import numpy as np

In [14]:

# File paths - UPDATE THESE WITH ACTUAL FILE PATHS
main_csv_file = "/home/steven-dindl/Downloads/McCray+ (test1).csv"
excel_file = "/home/steven-dindl/Downloads/McCray (1940s, 100 transcript subset)(1).xlsx"

output_file = "output-data/full dataset w manual fixes.csv"  

# Read the files
main_df = pd.read_csv(main_csv_file)
excel_df = pd.read_excel(excel_file)

# Check if required columns exist
if 'Clean Transcript' not in main_df.columns:
    print("Warning: 'Clean Transcripts' column not found in main CSV. Creating new column.")
    main_df['Clean Transcripts'] = ''

if 'Clean Transcript' not in excel_df.columns:
    raise ValueError("'Clean Transcripts' column not found in Excel file")

if 'Title' not in main_df.columns:
    raise ValueError("'title' column not found in main CSV file")

if 'Title' not in excel_df.columns:
    raise ValueError("'Title' column not found in Excel file")

# Display column information
print(f"\nMain CSV columns: {list(main_df.columns)}")
print(f"Excel file columns: {list(excel_df.columns)}")

# Count non-empty manual transcripts
manual_count = excel_df['Clean Transcript'].notna().sum()
print(f"\nFound {manual_count} manual transcripts in Excel file")

# Create a mapping from Excel file using title as the key
print("Creating mapping based on 'Title' column...")
manual_mapping = excel_df.set_index('Title')['Clean Transcript'].dropna()
print(f"Created mapping for {len(manual_mapping)} titles")

# Create a mapping from Excel file using Title as the key
print("Creating mapping based on 'Title' column...")
# Filter out rows with NaN titles and transcripts first
excel_clean = excel_df.dropna(subset=['Title', 'Clean Transcript'])
# Convert titles to strings to avoid indexing issues
manual_mapping = dict(zip(excel_clean['Title'].astype(str), excel_clean['Clean Transcript']))
print(f"Created mapping for {len(manual_mapping)} titles")

# Update main dataframe with manual transcripts
print("Updating main dataset with manual transcripts...")
updated_count = 0
for idx, row in main_df.iterrows():
    title = row['Title']
    if pd.notna(title) and title in manual_mapping:
        main_df.loc[idx, 'Clean Transcript'] = manual_mapping[title]
        updated_count += 1
        if updated_count % 10 == 0:  # Progress indicator
            print(f"  Updated {updated_count} records...")

print(f"Successfully updated {updated_count} records with manual transcripts")


Main CSV columns: ['Unnamed: 0.1', 'Unnamed: 0', 'Title', 'Creator', 'Description', 'Reference URL', 'Year', 'Original Transcript', 'Semi-clean Transcript', 'Corrections', 'Clean Transcript']
Excel file columns: ['Subset Index', 'Original Index', 'Title', 'Creator', 'Contributors', 'Date', 'Approximate Date', 'Source', 'Subject', 'Local Subject', 'S.C. County', 'Description', 'Extent', 'Digital Collection', 'Website', 'Contributing Institution', 'Rights', 'Time Period', 'Geographic Location', 'Language', 'Digitization Specifications', 'Date Digital', 'Type', 'Format', 'Media Type', 'Identifier', 'Note', 'Digital Assistant', 'Clean Transcript', 'OCLC number', 'Date created', 'Date modified', 'Reference URL', 'CONTENTdm number', 'CONTENTdm file name', 'CONTENTdm file path', 'Year', 'corrected_transcript']

Found 101 manual transcripts in Excel file
Creating mapping based on 'Title' column...
Created mapping for 101 titles
Creating mapping based on 'Title' column...
Created mapping for 9

In [None]:

# Display summary statistics
total_records = len(main_df)
total_transcripts = main_df['Clean Transcripts'].notna().sum()
empty_transcripts = (main_df['Clean Transcripts'].isna() | 
                    (main_df['Clean Transcripts'] == '')).sum()

print(f"\nSummary:")
print(f"Total records in main dataset: {total_records}")
print(f"Records with transcripts: {total_transcripts}")
print(f"Records with empty transcripts: {empty_transcripts}")
print(f"Records updated with manual transcripts: {updated_count}")

# Show some examples of updated transcripts
print(f"\nSample of updated transcripts:")
updated_mask = main_df['title'].isin(manual_mapping.index)
sample_updated = main_df[updated_mask][['title', 'Clean Transcripts']].head(3)
for idx, row in sample_updated.iterrows():
    print(f"Title: {row['title']}")
    print(f"Transcript: {row['Clean Transcripts'][:100]}..." if len(str(row['Clean Transcripts'])) > 100 else f"Transcript: {row['Clean Transcripts']}")
    print("-" * 50)

# Check for titles in Excel that weren't found in main CSV
excel_titles = set(excel_df['title'].dropna())
main_titles = set(main_df['title'].dropna())
missing_titles = excel_titles - main_titles

if missing_titles:
    print(f"\nWarning: {len(missing_titles)} titles from Excel file were not found in main CSV:")
    for title in list(missing_titles)[:5]:  # Show first 5
        print(f"  - {title}")
    if len(missing_titles) > 5:
        print(f"  ... and {len(missing_titles) - 5} more")

In [16]:

# Save the updated dataframe
print(f"\nSaving updated dataset...")
main_df.to_csv(output_file, index=False)
print(f"Updated file saved as: {output_file}")

# Final verification
print(f"\nFinal verification:")
print(f"Original automated transcripts have been replaced with manual transcripts for {updated_count} records")
print(f"File saved successfully to: {output_file}")


Saving updated dataset...
Updated file saved as: output-data/full dataset w manual fixes.csv

Final verification:
Original automated transcripts have been replaced with manual transcripts for 138 records
File saved successfully to: output-data/full dataset w manual fixes.csv
