In [None]:
# Cell 1 — Imports
import os
import io
import time
import json
import pickle
import requests
from io import StringIO

# Visualization
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

# Panel / hvplot
import panel as pn
pn.extension()
import param
import hvplot.pandas
import pandas as pd
import numpy as np

# API Secret
from lib.API_KEY import API_KEY


In [None]:
# Cell 2 — Local Audio File
audio_file = "audio.mp3"

# Check for the input once, report it, and stop here if it is missing.
audio_present = os.path.exists(audio_file)
print("Exists:", audio_present)
if not audio_present:
    raise FileNotFoundError(f"{audio_file} not found. Place your audio file correctly.")
print("Size (bytes):", os.path.getsize(audio_file))


In [None]:
# Cell 3 — Upload to AssemblyAI
upload_url_endpoint = "https://api.assemblyai.com/v2/upload"
headers_upload = {"authorization": API_KEY}

def upload_file_to_assemblyai(filename, timeout=(10, 300)):
    """Upload a local file to AssemblyAI and return the URL it was stored under.

    Parameters
    ----------
    filename : str
        Path of the file to send. It is opened in binary mode and streamed as
        the request body.
    timeout : float or tuple of (float, float), optional
        Forwarded to ``requests.post`` as its ``timeout`` argument
        (connect, read) in seconds. Without it a stalled connection would
        block the notebook indefinitely.

    Returns
    -------
    str
        The ``upload_url`` value from the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the connection or response exceeds ``timeout``.
    """
    with open(filename, "rb") as audio_stream:
        resp = requests.post(
            upload_url_endpoint,
            headers=headers_upload,
            data=audio_stream,
            timeout=timeout,
        )
    resp.raise_for_status()
    return resp.json()["upload_url"]

print("Uploading...")
uploaded_audio_url = upload_file_to_assemblyai(audio_file)
print("Uploaded URL:", uploaded_audio_url)


In [None]:
# Cell 4 — Submit transcription job
endpoint = "https://api.assemblyai.com/v2/transcript"

# Request the transcript plus the extra analyses the later cells read.
json_payload = {
    "audio_url": uploaded_audio_url,
    "auto_chapters": True,
    "sentiment_analysis": True,
    "auto_highlights": True,
    "iab_categories": True
}
headers_json = {"authorization": API_KEY, "content-type": "application/json"}

# The timeout keeps a stalled connection from hanging the notebook forever.
resp = requests.post(endpoint, json=json_payload, headers=headers_json, timeout=30)
resp.raise_for_status()
response_json = resp.json()
transcript_id = response_json["id"]

print("Transcript ID:", transcript_id)


In [None]:
# Cell 5 — Poll AssemblyAI for status
result_endpoint = f"{endpoint}/{transcript_id}"
headers_auth = {"authorization": API_KEY}

POLL_INTERVAL_SECONDS = 3
MAX_WAIT_SECONDS = 60 * 60  # stop polling after an hour instead of looping forever

poll_deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    poll_resp = requests.get(result_endpoint, headers=headers_auth, timeout=30)
    # Surface HTTP failures (bad key, unknown id, ...) right away: their JSON
    # body carries no "status", which would otherwise keep this loop spinning.
    poll_resp.raise_for_status()
    r = poll_resp.json()
    status = r.get("status")
    print("Status:", status)

    if status == "completed":
        print("Transcription Completed.")
        break

    if status == "error":
        # One formatted message; passing two positional args would render as a tuple.
        raise RuntimeError(f"Transcription error: {r.get('error')}")

    if time.monotonic() >= poll_deadline:
        raise TimeoutError(
            f"Transcription still '{status}' after {MAX_WAIT_SECONDS} seconds."
        )

    time.sleep(POLL_INTERVAL_SECONDS)


In [None]:
# Cell 6 — Save transcript
# Fetch the finished result; fail loudly on an HTTP error so an error payload
# is never written to disk as if it were a transcript.
final_resp = requests.get(result_endpoint, headers=headers_auth, timeout=30)
final_resp.raise_for_status()
final_result = final_resp.json()

# Full API response, human-readable.
with open("transcript.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, ensure_ascii=False, indent=2)

# Plain transcript text; "or" also covers a "text" key that is present but null.
with open("transcript.txt", "w", encoding="utf-8") as f:
    f.write(final_result.get("text") or "")

# Pickled copy that the later cells reload.
with open("speech_data.pkl", "wb") as f:
    pickle.dump(final_result.copy(), f)

print("Saved transcript.json, transcript.txt, speech_data.pkl")


In [None]:
# Cell 7 — Quick preview
# "or" fallbacks keep the slices below working when a field is missing or null.
transcript_text = final_result.get("text") or ""
print("Text preview:\n", transcript_text[:500])
print("Chapters:", final_result.get("chapters"))
print("Highlights sample:", final_result.get("auto_highlights_result"))
print("Sentiment sample:", (final_result.get("sentiment_analysis_results") or [])[:3])


In [None]:
# Cell 8 — Load saved data
# Read the whole audio file and wrap the bytes in an in-memory stream.
with open(audio_file, "rb") as audio_handle:
    raw_audio = audio_handle.read()
audio_bytes = io.BytesIO(raw_audio)

# Reload the pickled transcription result.
# NOTE(review): pickle.load runs arbitrary code from the file; only load a
# speech_data.pkl that this notebook wrote.
with open("speech_data.pkl", "rb") as pickle_handle:
    data = pickle.load(pickle_handle)


In [None]:
# Cell 9 — Audio + Download
# In-memory transcript for the download button. Seeding StringIO directly
# leaves the read position at 0, and "or" turns a missing or null "text"
# into an empty string instead of a TypeError on write.
buffer = StringIO(data.get("text") or "")

transcript_download = pn.widgets.FileDownload(file=buffer, filename="transcript.txt", button_type="success")
audio_play = pn.pane.Audio(audio_bytes, name="Audio", autoplay=False)

pn.Row(transcript_download, audio_play)


In [None]:
# Cell 10 — Sentiment DataFrame + Bar Plot
sentiment = data.get("sentiment_analysis_results") or []
sentiment_df = pd.DataFrame(sentiment)

# With no results the frame has no columns at all. Add an empty "sentiment"
# column so every lookup below yields an empty result instead of a KeyError.
if "sentiment" not in sentiment_df.columns:
    sentiment_df = sentiment_df.assign(sentiment=pd.Series(dtype="object"))

sentiment_counts = sentiment_df["sentiment"].value_counts()

if not sentiment_counts.empty:
    sentiment_plot = sentiment_counts.hvplot.bar(title="Sentences by Sentiment")
else:
    sentiment_plot = pn.pane.Markdown("No sentiment data available.")

# One sub-frame per sentiment label, shown in its own tab.
positive_df = sentiment_df[sentiment_df["sentiment"] == "POSITIVE"]
negative_df = sentiment_df[sentiment_df["sentiment"] == "NEGATIVE"]
neutral_df  = sentiment_df[sentiment_df["sentiment"] == "NEUTRAL"]

sentiment_tabs = pn.Tabs(
    ("Overview", sentiment_plot),
    ("Positive", pn.widgets.DataFrame(positive_df, width=700, height=300)),
    ("Negative", pn.widgets.DataFrame(negative_df, width=700, height=300)),
    ("Neutral",  pn.widgets.DataFrame(neutral_df, width=700, height=300))
)

sentiment_tabs


In [None]:
# Cell 11 — Word Cloud
# Use a placeholder sentence when the transcript is missing or empty.
text = data.get("text", "") or "No transcript available."

# Lowercase every whitespace-separated token and rejoin into one string.
stopwords = set(STOPWORDS)
words = list(map(str.lower, text.split()))
all_words = " ".join(words)

# Build the cloud from the normalised text.
cloud_options = dict(
    background_color="black",
    stopwords=stopwords,
    max_words=100,
    collocations=False,
)
wc = WordCloud(**cloud_options).generate(all_words)

# Render the cloud image with Plotly, hiding the axis tick labels.
fig = (
    px.imshow(wc)
    .update_xaxes(showticklabels=False)
    .update_yaxes(showticklabels=False)
    .update_layout(
        title="Word Cloud (Top 100 Words)",
        margin={"l": 0, "r": 0, "t": 40, "b": 0},
    )
)

fig.show()


In [None]:
# Cell 12 — Chapter summary + audio jumping
# "or []" also covers a "chapters" key that is present but null.
chapters = data.get("chapters") or []
chapters_layout = pn.Column("### Auto Chapter Summary")

class ButtonAudio:
    """Pair a button with an audio pane positioned at a chapter start.

    Parameters
    ----------
    start : int or float
        Chapter start offset. It is divided by 1000 and used both for the
        button label (suffixed "s") and as the audio pane's ``time``.
    """

    def __init__(self, start):
        self.start = start
        self.button = pn.widgets.Button(name=f"{start/1000:.0f}s", button_type="primary", width=60)
        # Each pane gets its own BytesIO built from a copy of the audio bytes.
        self.audio = pn.pane.Audio(io.BytesIO(audio_bytes.getvalue()), time=start/1000)
        self.button.on_click(self.jump)

    def jump(self, event):
        """Click callback: move the audio pane back to the chapter start."""
        self.audio.time = self.start / 1000

for ch in chapters:
    widget = pn.widgets.StaticText(value=ch.get("summary", ""), width=1000)
    # Fall back to 0 when "start" is missing or null, so start/1000 cannot fail.
    btn = ButtonAudio(ch.get("start") or 0)
    chapters_layout.append(pn.Row(btn.button, btn.audio, widget))

chapters_layout


In [None]:
# Re-read the transcript text from the loaded result (empty string if absent)
text = data["text"] if "text" in data else ""

print("Transcript length:", len(text))


In [None]:
import nltk

# Fetch the stopword corpus only when it is not installed yet, so re-running
# the notebook does not attempt a download every time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")


In [None]:
# ---- WORD FREQUENCY BAR CHART ----
import pandas as pd
from collections import Counter
import plotly.express as px
import re
from nltk.corpus import stopwords

# Refuse to chart an empty transcript.
if not text:
    raise ValueError("‚ùå ERROR: `text` is empty. Reload your transcript first.")

# Lowercase, then turn every character that is not an ASCII letter or a space
# into a space before splitting on whitespace.
clean_text = re.sub(r"[^a-zA-Z ]", " ", text.lower())
words = clean_text.split()

# Keep tokens longer than two characters that are not English stopwords.
stop = set(stopwords.words("english"))
filtered_words = list(filter(lambda tok: len(tok) > 2 and tok not in stop, words))

print("Filtered word count:", len(filtered_words))

# The twenty most common tokens as a two-column frame.
word_counts = Counter(filtered_words).most_common(20)
df_wc = pd.DataFrame(word_counts, columns=["word", "frequency"])

# Bar chart with slanted x tick labels.
bar_options = dict(
    x="word",
    y="frequency",
    title="Top 20 Most Frequent Words",
    labels={"word": "Word", "frequency": "Count"},
)
fig = px.bar(df_wc, **bar_options).update_layout(xaxis_tickangle=45)
fig.show()


In [None]:

# Load saved data (pkl, audio, transcript)
audio_file = "audio.mp3"

# Pickled transcription result.
with open("speech_data.pkl", "rb") as pickle_handle:
    data = pickle.load(pickle_handle)

# Raw audio bytes wrapped in an in-memory stream.
with open(audio_file, "rb") as audio_handle:
    audio_bytes = io.BytesIO(audio_handle.read())

# Pull out the three fields the dashboard sections read.
text, chapters, sentiment = (
    data.get("text", ""),
    data.get("chapters", []),
    data.get("sentiment_analysis_results", []),
)


# ---------------------------------------------------
# 1) AUDIO + DOWNLOAD
# ---------------------------------------------------
# Write the transcript into an in-memory text stream, then rewind it so a
# later read starts at the beginning.
buffer = StringIO()
buffer.write(text)
buffer.seek(0)

download_options = dict(filename="transcript.txt", button_type="success", width=200)
transcript_download = pn.widgets.FileDownload(file=buffer, **download_options)

audio_player = pn.pane.Audio(audio_bytes, name="Audio Player", autoplay=False)

# Heading on top, download button and player side by side below it.
controls_row = pn.Row(transcript_download, audio_player)
audio_section = pn.Column("## üéµ Audio & Transcript", controls_row)


# ---------------------------------------------------
# 2) SENTIMENT BAR CHART
# ---------------------------------------------------
sentiment_df = pd.DataFrame(sentiment)

# Show a placeholder note when there is nothing to count; otherwise plot the
# number of rows per sentiment label.
if sentiment_df.empty:
    sentiment_plot = pn.pane.Markdown("No sentiment data available.")
else:
    sentiment_counts = sentiment_df["sentiment"].value_counts()
    bar_options = dict(
        title="Sentences by Sentiment",
        ylabel="Count",
        xlabel="Sentiment",
        height=350,
        width=500,
    )
    sentiment_plot = sentiment_counts.hvplot.bar(**bar_options)

sentiment_section = pn.Column("## üòÄ Sentiment Analysis", sentiment_plot)


# ---------------------------------------------------
# 3) WORD CLOUD
# ---------------------------------------------------
# "or" keeps .split() working when the transcript text is missing or null.
words = (text or "").split()
stopwords = set(STOPWORDS)

# A cloud cannot be built from zero words, so fall back to the same
# placeholder sentence the standalone word-cloud cell uses.
cloud_text = " ".join(words) or "No transcript available."

wc = WordCloud(
    background_color="black",
    stopwords=stopwords,
    max_words=100,
    collocations=False
).generate(cloud_text)

# Render the cloud image with Plotly, hiding the axis tick labels.
fig_wc = px.imshow(wc)
fig_wc.update_xaxes(showticklabels=False)
fig_wc.update_yaxes(showticklabels=False)
fig_wc.update_layout(height=350, margin=dict(l=0, r=0, t=30, b=0))

wordcloud_section = pn.Column(
    "## ‚òÅÔ∏è Word Cloud",
    pn.pane.Plotly(fig_wc, height=350)
)


# ---------------------------------------------------
# 4) WORD FREQUENCY BAR CHART
# ---------------------------------------------------
# Lowercase, blank out everything except ASCII letters and spaces, then split.
clean_text = re.sub(r"[^a-zA-Z ]", " ", text.lower())
word_tokens = clean_text.split()

# Keep tokens longer than two characters that are not in `stopwords`.
filtered = list(filter(lambda tok: len(tok) > 2 and tok not in stopwords, word_tokens))

# The twenty most common tokens as a two-column frame.
word_counts = Counter(filtered).most_common(20)
df_wc = pd.DataFrame(word_counts, columns=["word", "frequency"])

fig_freq = px.bar(df_wc, x="word", y="frequency", title="Top 20 Most Frequent Words")
fig_freq.update_layout(xaxis_tickangle=45)

freq_chart = pn.pane.Plotly(fig_freq, height=350)
freq_section = pn.Column("## üî† Word Frequency", freq_chart)


# ---------------------------------------------------
# 5) CHAPTER SUMMARY + AUDIO JUMPS
# ---------------------------------------------------
class ChapterButton:
    """A button plus an audio pane that both point at one chapter start.

    ``start`` is stored unchanged. Divided by 1000 it labels the button
    (with an "s" suffix) and sets the pane's initial ``time``.
    """

    def __init__(self, start):
        self.start = start
        offset_seconds = start / 1000
        self.button = pn.widgets.Button(
            name=f"{offset_seconds:.0f}s", button_type="primary", width=60
        )
        # The pane reads from its own stream over a copy of the audio bytes.
        pane_stream = io.BytesIO(audio_bytes.getvalue())
        self.audio = pn.pane.Audio(pane_stream, time=offset_seconds)
        self.button.on_click(self.jump)

    def jump(self, event):
        """Click callback: set the pane's time back to the chapter start."""
        self.audio.time = self.start / 1000

chapter_layout = pn.Column("## üìå Auto-Generated Chapters")

# One row per chapter: jump button, audio pane, then the summary text.
for ch in chapters:
    summary, start = ch.get("summary", ""), ch.get("start", 0)
    btn = ChapterButton(start)
    summary_widget = pn.widgets.StaticText(value=summary, width=700)
    chapter_layout.append(pn.Row(btn.button, btn.audio, summary_widget))


# ---------------------------------------------------
# FINAL DASHBOARD LAYOUT
# ---------------------------------------------------
# Sections top to bottom, with a divider after each except the last.
dashboard_parts = [
    "# üéôÔ∏è Audio Transcription & NLP Dashboard",
    audio_section,
    pn.layout.Divider(),
    pn.Row(sentiment_section, wordcloud_section),
    pn.layout.Divider(),
    freq_section,
    pn.layout.Divider(),
    chapter_layout,
]
dashboard = pn.Column(*dashboard_parts, sizing_mode="stretch_width")

dashboard
