In [2]:
!pip install -U assemblyai

Collecting assemblyai
  Downloading assemblyai-0.37.0-py3-none-any.whl.metadata (29 kB)
Downloading assemblyai-0.37.0-py3-none-any.whl (44 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: assemblyai
Successfully installed assemblyai-0.37.0


In [3]:
import assemblyai as aai
import os
import pandas as pd
from tqdm import tqdm

# Read the AssemblyAI API key from the environment instead of embedding it in
# the notebook: a key saved in a notebook is exposed to everyone who can read it.
# When the variable is unset, fall back to an interactive prompt (input is not echoed).
_api_key = os.environ.get("ASSEMBLYAI_API_KEY")
if not _api_key:
    from getpass import getpass  # only needed for the interactive fallback
    _api_key = getpass("AssemblyAI API key: ")
aai.settings.api_key = _api_key

# Folder containing your MP3 files
INPUT_FOLDER = "/content/drive/MyDrive/MP3"
# Destination CSV for the per-utterance transcription rows
OUTPUT_CSV = "/content/sample_data/transcriptions.csv"

def transcribe_file(file_path):
    """Transcribe one audio file and return its utterances as row dicts.

    The file is submitted through ``aai.Transcriber`` with speaker labels
    enabled. Each utterance of the resulting transcript becomes one dict with
    the keys ``file_name``, ``speaker``, ``text``, ``start``, ``end`` and
    ``confidence``.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        A list of utterance dicts (empty when the transcript carries no
        utterances), or ``None`` when the transcript has an error status or
        an exception is raised while processing it. Failures are printed,
        not raised, so a batch run can continue with the next file.
    """
    transcriber = aai.Transcriber()
    config = aai.TranscriptionConfig(speaker_labels=True)

    try:
        transcript = transcriber.transcribe(file_path, config=config)
        if transcript.status == aai.TranscriptStatus.error:
            print(f"Error processing {file_path}: {transcript.error}")
            return None

        file_name = os.path.basename(file_path)
        # `utterances` can be None (e.g. nothing was diarized); treat that as
        # "no rows" instead of letting the loop raise a TypeError.
        utterances = transcript.utterances or []

        return [
            {
                "file_name": file_name,
                "speaker": utterance.speaker,
                "text": utterance.text,
                "start": utterance.start,
                "end": utterance.end,
                "confidence": utterance.confidence,
            }
            for utterance in utterances
        ]

    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        print(f"Exception processing {file_path}: {str(e)}")
        return None

def process_folder(folder_path):
    """Transcribe every MP3 file directly inside a folder.

    Args:
        folder_path: Directory to scan. Only entries whose name ends in
            ``.mp3`` (case-insensitive) are processed; subfolders are not
            searched.

    Returns:
        A single flat list holding the utterance dicts of all files that
        ``transcribe_file`` returned data for, in sorted filename order.
        Files that yield nothing are skipped.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs process the files, and emit their rows, in the same order.
    mp3_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.mp3')
    )

    print(f"Found {len(mp3_files)} MP3 files to process...")

    all_data = []
    for filename in tqdm(mp3_files, desc="Processing files"):
        file_data = transcribe_file(os.path.join(folder_path, filename))

        # None (failure) and [] (no utterances) both contribute no rows
        if file_data:
            all_data.extend(file_data)

    return all_data

# Entry point: transcribe the input folder and persist the rows as CSV
if __name__ == "__main__":
    # One dict per utterance, gathered across every MP3 in INPUT_FOLDER
    transcription_data = process_folder(INPUT_FOLDER)

    if not transcription_data:
        # Nothing came back (no files, or every file failed / was empty)
        print("No transcription data was generated.")
    else:
        # Build the table once from the collected rows, then write it out
        df = pd.DataFrame(transcription_data)
        df.to_csv(OUTPUT_CSV, index=False)
        print(f"Successfully saved transcriptions to {OUTPUT_CSV}")
        print(f"Total utterances: {len(df)}")

Found 39 MP3 files to process...


Processing files: 100%|██████████| 39/39 [10:26<00:00, 16.05s/it]

Successfully saved transcriptions to /content/sample_data/transcriptions.csv
Total utterances: 1106





In [4]:
import pandas as pd

# Read the saved transcription rows back from disk and preview the first five
df = pd.read_csv(filepath_or_buffer="/content/sample_data/transcriptions.csv")
df.head(n=5)

Unnamed: 0,file_name,speaker,text,start,end,confidence
0,Brian_Cox_on_quantum_computing_and_black_hole_...,A,There's an engineering challenge in building q...,10320,390630,0.960721
1,Brian_Cox_on_quantum_computing_and_black_hole_...,B,Want to dive deeper? Become a Big Think member...,394890,401650,0.962061
2,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,A,One of the more interesting philosophical find...,80,82200,0.958051
3,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,B,"Well, you know that ChatGPT tried to copy itse...",82320,89176,0.895572
4,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,A,I'm not sure what this is. What is this?,89368,91816,0.875787


In [5]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import pandas as pd


# Fetch the VADER lexicon resource before creating the analyzer
nltk.download('vader_lexicon')

# Sentiment analyzer shared by get_sentiment
sia = SentimentIntensityAnalyzer()

# Per-utterance transcription rows written by the transcription step
df = pd.read_csv("/content/sample_data/transcriptions.csv")

# Function to get sentiment
def get_sentiment(text):
    """Return the analyzer's polarity scores for one piece of text.

    Args:
        text: Text to score. Missing values (``None``, or the NaN pandas
            produces for an empty CSV cell) are scored as the empty string;
            any other non-string value is converted with ``str`` first.

    Returns:
        Whatever ``sia.polarity_scores`` returns for the text; the
        following cell reads its ``'compound'`` entry.
    """
    if not isinstance(text, str):
        # Passing a float NaN or None straight to the analyzer raises, which
        # would abort the whole `.apply` over the column.
        text = "" if pd.isna(text) else str(text)
    return sia.polarity_scores(text)

# Score every utterance; each cell holds the dict returned by get_sentiment
df['Sentiment'] = df['text'].apply(get_sentiment)

# Pull the overall (compound) score out into its own column
df['Compound'] = df['Sentiment'].apply(lambda scores: scores['compound'])

# Bucket the compound score: below -0.05 is negative, above 0.05 is positive,
# anything in between is neutral
df['Sentiment_Label'] = df['Compound'].apply(
    lambda score: 'Negative' if score < -0.05 else ('Positive' if score > 0.05 else 'Neutral')
)

# Persist the scored utterances
df.to_csv("speaker_sentiment.csv", index=False)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [6]:
import pandas as pd

# Reload the scored utterances to check what was written to disk
df = pd.read_csv(filepath_or_buffer="speaker_sentiment.csv")
df.head(n=5)

Unnamed: 0,file_name,speaker,text,start,end,confidence,Sentiment,Compound,Sentiment_Label
0,Brian_Cox_on_quantum_computing_and_black_hole_...,A,There's an engineering challenge in building q...,10320,390630,0.960721,"{'neg': 0.026, 'neu': 0.912, 'pos': 0.062, 'co...",0.9843,Positive
1,Brian_Cox_on_quantum_computing_and_black_hole_...,B,Want to dive deeper? Become a Big Think member...,394890,401650,0.962061,"{'neg': 0.0, 'neu': 0.844, 'pos': 0.156, 'comp...",0.3612,Positive
2,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,A,One of the more interesting philosophical find...,80,82200,0.958051,"{'neg': 0.038, 'neu': 0.724, 'pos': 0.239, 'co...",0.9928,Positive
3,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,B,"Well, you know that ChatGPT tried to copy itse...",82320,89176,0.895572,"{'neg': 0.0, 'neu': 0.909, 'pos': 0.091, 'comp...",0.2732,Positive
4,Mark_Pushes_Back_On_Joe_s_Quantum_Computing_A....,A,I'm not sure what this is. What is this?,89368,91816,0.875787,"{'neg': 0.197, 'neu': 0.803, 'pos': 0.0, 'comp...",-0.2411,Negative


In [7]:


import pandas as pd

# Read the scored utterances and keep only the text and its sentiment label
df = pd.read_csv("speaker_sentiment.csv").loc[:, ["text", "Sentiment_Label"]]

# Write the two-column table to its own CSV file
df.to_csv("modified_speaker_sentiment.csv", index=False)

# Preview the first rows of the reduced table
df.head(n=5)


Unnamed: 0,text,Sentiment_Label
0,There's an engineering challenge in building q...,Positive
1,Want to dive deeper? Become a Big Think member...,Positive
2,One of the more interesting philosophical find...,Positive
3,"Well, you know that ChatGPT tried to copy itse...",Positive
4,I'm not sure what this is. What is this?,Negative
