In [None]:
import pandas as pd
import re

# Load the Whisper transcription spreadsheet. With read_excel's defaults the
# first sheet row is consumed as the header, so `df` holds only the data rows.
file_path = "whisper_transcription_large.xlsx"
df = pd.read_excel(file_path)

# Split the first column on its first two commas only (n=2), producing three
# pieces: start time, end time and transcription. Any further commas stay
# inside the transcription text.
df[['Start Time', 'End Time', 'Transcription']] = df.iloc[:, 0].str.split(',', n=2, expand=True)

# Remove surrounding whitespace and double quotes from the transcription column
df['Transcription'] = df['Transcription'].str.strip().str.strip('"')

# Keep only the leading rows for analysis. `.copy()` detaches the subset from
# the full frame so the column assignments made afterwards write to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
# NOTE(review): 199 data rows are kept although the surrounding comments and
# printed messages say "first 200" — presumably the header row is counted as
# line 1; confirm before changing the number.
df = df.head(199).copy()

# Define a function for tokenization (removes punctuation, keeps words)
def tokenize_text(text):
    """Split a transcription into lowercase word tokens.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "don't" becomes "dont". The
    remaining text is split on runs of whitespace.

    Args:
        text: Value to tokenize. Missing values (None, NaN, pd.NA) produce
            no tokens. Non-string values such as numbers are converted with
            ``str()`` before tokenizing.

    Returns:
        list[str]: Tokens in their original order; an empty list for
        missing or blank input.
    """
    if pd.isna(text):
        return []
    # Non-string values (e.g. a numeric cell) have no .lower(); coerce first
    # so they are tokenized instead of raising AttributeError.
    normalized = str(text).lower()
    normalized = re.sub(r"[^\w\s]", "", normalized)
    return normalized.split()

# Tokenize every transcription, then record how many tokens each row produced
df['Tokens'] = df['Transcription'].map(tokenize_text)
df['Token Count (N)'] = df['Tokens'].map(len)

# Write the transcription, its token list and the token count to a CSV file
df.to_csv(
    "tokenized_transcript.csv",
    columns=['Transcription', 'Tokens', 'Token Count (N)'],
    index=False,
    encoding="utf-8",
)

# Add up the per-row token counts over the rows retained for analysis
total_word_count = df['Token Count (N)'].sum()

# Report the overall word total and where the tokenized data was written
print(f"\n✅ Total words in first 200 lines: {total_word_count}")
print("📁 Tokenized data saved as 'tokenized_transcript.csv'")


✅ Total words in first 200 lines: 1945
📁 Tokenized data saved as 'tokenized_transcript.csv'


In [None]:
import pandas as pd
import re

# Load the AssemblyAI spreadsheet. With read_excel's defaults the first sheet
# row is consumed as the header, so the frame holds only the data rows.
assemblyai_file_path = "AssemblyAI_2.xlsx"
df_assemblyai = pd.read_excel(assemblyai_file_path)

# Keep only the leading rows for analysis. `.copy()` detaches the subset from
# the full frame so the column assignments made afterwards write to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
# NOTE(review): 199 data rows are kept although the surrounding comments and
# printed messages say "first 200" — presumably the header row is counted as
# line 1; confirm before changing the number.
df_assemblyai = df_assemblyai.head(199).copy()

# Define a function for tokenization (removes punctuation, keeps words)
def tokenize_assemblyai_text(text):
    """Split one sentence value into lowercase word tokens.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "don't" becomes "dont". The
    remaining text is split on runs of whitespace.

    Args:
        text: Value to tokenize. Missing values (None, NaN, pd.NA) produce
            no tokens. Non-string values such as numbers are converted with
            ``str()`` before tokenizing.

    Returns:
        list[str]: Tokens in their original order; an empty list for
        missing or blank input.
    """
    if pd.isna(text):
        return []
    # Cells read from a spreadsheet are not guaranteed to be strings (a
    # sentence consisting only of digits may arrive as a number); coerce so
    # .lower() cannot raise AttributeError.
    normalized = str(text).lower()
    normalized = re.sub(r"[^\w\s]", "", normalized)
    return normalized.split()

# Tokenize every value of the "Sentence" column, then record the token count per row
df_assemblyai['Tokens_AssemblyAI'] = df_assemblyai['Sentence'].map(tokenize_assemblyai_text)
df_assemblyai['Token_Count_AssemblyAI'] = df_assemblyai['Tokens_AssemblyAI'].map(len)

# Write the sentence, its token list and the token count to a CSV file for verification
df_assemblyai.to_csv(
    "tokenized_assemblyai.csv",
    columns=['Sentence', 'Tokens_AssemblyAI', 'Token_Count_AssemblyAI'],
    index=False,
    encoding="utf-8",
)

# Add up the per-row token counts over the rows retained for analysis
total_word_count_assemblyai = df_assemblyai['Token_Count_AssemblyAI'].sum()

# Report the overall word total and where the tokenized data was written
print(f"\n✅ Total words in first 200 lines (AssemblyAI file): {total_word_count_assemblyai}")
print("📁 Tokenized data saved as 'tokenized_assemblyai.csv'")



✅ Total words in first 200 lines (AssemblyAI file): 1664
📁 Tokenized data saved as 'tokenized_assemblyai.csv'
