In [None]:
# Install Required Libraries
!pip install openai-whisper transformers googletrans==4.0.0-rc1 nltk

In [None]:
# Import Libraries
import whisper
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from googletrans import Translator
import nltk
nltk.download('punkt_tab')  # Updated to punkt_tab

In [None]:
# Whisper Transcription with Translation
model = whisper.load_model("tiny")
translator = Translator()


def transcribe_audio(audio_file_path):
    """Transcribe an audio file with Whisper and return the transcript in English.

    ``language=None`` is passed to ``model.transcribe`` so no language is
    forced; the language reported back in the result decides whether the
    transcript is sent through the translator.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The transcript text. When the reported language is anything other
        than "en" (including a missing language), the text is translated to
        English first. A blank transcript is returned unchanged.

    Raises:
        ValueError: If the transcription result is None or has no "text" entry.
    """
    result = model.transcribe(audio_file_path, language=None)
    if result is None or "text" not in result:
        raise ValueError("Transcription failed - no text returned")

    transcript = result["text"]
    # Blank output (e.g. silent audio) has nothing to translate; return it
    # as-is instead of making a pointless request to the translation service.
    if not transcript or not transcript.strip():
        return transcript

    # Treat a missing language as non-English so the text still gets translated.
    detected_lang = result.get("language", "unknown")
    if detected_lang != "en":
        transcript = translator.translate(transcript, dest="en").text
    return transcript

In [None]:
# Sentiment Analysis with Hugging Face
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")


def get_sentiment(text, neutral_threshold=0.6):
    """Classify ``text`` and return a ``(label, signed_polarity)`` pair.

    Args:
        text: The text to classify.
        neutral_threshold: Minimum classifier confidence required to keep the
            predicted label; anything below it is reported as neutral.
            Defaults to 0.6.

    Returns:
        ``("NEUTRAL", 0.0)`` when the confidence is below ``neutral_threshold``,
        ``("POSITIVE", score)`` for a confident positive prediction, and
        ``("NEGATIVE", -score)`` otherwise, so the polarity already carries
        the sign of the sentiment.
    """
    # truncation=True lets the tokenizer cut over-long inputs down to the
    # model's maximum sequence length; word-count based chunking upstream
    # cannot guarantee the token count stays within that limit.
    result = sentiment_analyzer(text, truncation=True)[0]
    score, label = result["score"], result["label"]

    if score < neutral_threshold:
        return "NEUTRAL", 0.0
    if label == "POSITIVE":
        return "POSITIVE", score
    return "NEGATIVE", -score

In [None]:
# Link WAV Audio Files from Google Drive
from google.colab import drive

drive.mount('/content/drive')
audio_folder = "/content/drive/My Drive/Colab Notebooks/Project3/clipped_audios"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of the results) the same on every run.
clipped_audio_files = sorted(
    os.path.join(audio_folder, f)
    for f in os.listdir(audio_folder)
    if f.endswith('.wav')
)
print("Audio files:", clipped_audio_files)

In [None]:
# Process Audio Files and Create DataFrame with Sentence Splitting
league_map = {
    "premierleague": "Premier League",
    "seriea": "Serie A",
    "bundesliga": "Bundesliga",
    "laliga": "La Liga"
}

# Loop-invariant settings, defined once instead of once per sentence.
max_length = 500  # maximum number of words per chunk sent to get_sentiment
sentiment_weight = {"POSITIVE": 1, "NEGATIVE": -1, "NEUTRAL": 0}
result_columns = ["League", "Sentence", "Sentiment", "Polarity", "Sentiment Score"]


def _league_for(audio_path):
    """Return the league display name for an audio file path.

    The file stem, with trailing digits removed and lower-cased, is looked up
    in ``league_map``; unmatched names map to "Unknown League".
    """
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    return league_map.get(stem.rstrip("0123456789").lower(), "Unknown League")


def _analyze_sentence(sentence):
    """Return ``(sentiment, polarity, sentiment_score)`` for one sentence.

    The sentence is split into chunks of at most ``max_length`` words; each
    chunk is classified with ``get_sentiment``. The sentence label is the most
    frequent chunk label and the polarity is the mean chunk polarity.
    """
    words = sentence.split()
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]
    results = [get_sentiment(chunk) for chunk in chunks]
    if not results:
        # A sentence with no words produces no chunks.
        return "NEUTRAL", 0.0, 0.0

    sentiments = [label for label, _ in results]
    polarities = [polarity for _, polarity in results]

    # Iterate the list (not a set) so ties resolve to the label that occurs
    # first, independent of set iteration order.
    final_sentiment = max(sentiments, key=sentiments.count)
    final_polarity = sum(polarities) / len(polarities)

    # get_sentiment already returns negative polarity for NEGATIVE text, so
    # multiplying the signed value by -1 would flip it to positive. Use the
    # magnitude and let the label's weight supply the sign.
    sentiment_score = abs(final_polarity) * sentiment_weight[final_sentiment]
    return final_sentiment, final_polarity, sentiment_score


data = []
for audio_file in clipped_audio_files:
    league_name = _league_for(audio_file)

    try:
        transcript = transcribe_audio(audio_file)
        sentences = nltk.sent_tokenize(transcript)
    except Exception as e:
        # Best effort: keep going with the remaining files and record a
        # placeholder row for this one.
        # NOTE: placeholder rows are labelled NEUTRAL and are therefore
        # counted as neutral sentences by anything that aggregates `df`.
        print(f"Error on {audio_file}: {e}")
        data.append({
            "League": league_name,
            "Sentence": "Transcription failed",
            "Sentiment": "NEUTRAL",
            "Polarity": 0.0,
            "Sentiment Score": 0.0
        })
        continue

    for sentence in sentences:
        try:
            final_sentiment, final_polarity, sentiment_score = _analyze_sentence(sentence)
        except Exception as e:
            # A failure on one sentence should not discard the rest of the
            # file or be reported as a transcription failure.
            print(f"Error on {audio_file} (sentiment analysis): {e}")
            continue

        data.append({
            "League": league_name,
            "Sentence": sentence,
            "Sentiment": final_sentiment,
            "Polarity": final_polarity,
            "Sentiment Score": sentiment_score
        })

# Passing the column list keeps the expected columns present even when no
# rows were produced.
df = pd.DataFrame(data, columns=result_columns)
print(df.head())

In [None]:
# Save Results to Google Drive
# Drive is already mounted at /content/drive by the cell that lists the audio
# files, and `df` cannot exist unless that cell has run, so the import and
# mount are not repeated here.
output_csv_path = "/content/drive/My Drive/Colab Notebooks/Project3/var_sentiment_audio_2024.csv"
df.to_csv(output_csv_path, index=False)
print(f"Saved to {output_csv_path}")

In [None]:
# Visualizations: Pie Charts per League, Bar Chart, and Box Plot
leagues = df["League"].unique()

# One fixed colour per sentiment label. value_counts() orders labels by
# frequency, so a positional colour list would give the same sentiment
# different colours in different pies.
sentiment_colors = {"POSITIVE": "#99ff99", "NEGATIVE": "#ff9999", "NEUTRAL": "#66b3ff"}

# Size the grid from the number of leagues actually present: the data can
# contain "Unknown League" in addition to the four mapped leagues, which
# would not fit a fixed 2x2 layout.
n_cols = 2
n_rows = max(1, (len(leagues) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
pie_axes = axes.ravel()

# One pie chart per league
for ax, league in zip(pie_axes, leagues):
    league_data = df.loc[df["League"] == league, "Sentiment"].value_counts()
    league_data.plot.pie(
        ax=ax,
        autopct="%1.1f%%",
        colors=[sentiment_colors.get(label, "#cccccc") for label in league_data.index],
        startangle=90,
        title=f"{league} Sentiment"
    )
    ax.set_ylabel("")

# Hide grid cells that have no league to show.
for ax in pie_axes[len(leagues):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

# Bar Chart
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="League", hue="Sentiment", palette="viridis", ax=ax)
ax.set_title("Sentiment by League")
ax.set_xlabel("League")
ax.set_ylabel("Count")
plt.show()

# Box Plot
fig, ax = plt.subplots(figsize=(10, 6))
# Passing `palette` without `hue` is deprecated in recent seaborn releases;
# assigning the x variable to hue keeps one colour per league. dodge=False
# keeps each box centred on its tick.
sns.boxplot(data=df, x="League", y="Polarity", hue="League", palette="coolwarm", dodge=False, ax=ax)
# Some seaborn versions add a legend that merely repeats the x-axis labels.
box_legend = ax.get_legend()
if box_legend is not None:
    box_legend.remove()
ax.set_title("Polarity by League")
ax.set_xlabel("League")
ax.set_ylabel("Polarity")
plt.show()