# In [None]:
# ================================
# Project Report (Markdown View in Jupyter)
# ================================

from IPython.display import Markdown, display

# Markdown source of the project report; it is rendered (not printed as raw
# text) by the display() call that follows the literal.
report = """
# Project Report: Audio Transcription and Grammar Evaluation Pipeline

## Objective

This notebook implements a pipeline that:
1. Transcribes audio files into text using OpenAI’s **Whisper** model.
2. Evaluates the **grammatical correctness** of the transcribed text using **LanguageTool**.
3. Outputs a grammar **score** and **error count** for each audio file.
4. Saves the final results into CSV files for both training and testing datasets.

---

## Approach & Workflow

### 1. Preprocessing
- Collect audio files from:
  - `audios/train`- training set
  - `audios/test` - testing set
- Supported formats: `.mp3`, `.wav`, `.m4a`

### 2. Pipeline Architecture

| Step                | Tool Used              | Output                        |
|---------------------|------------------------|-------------------------------|
| Audio Input         | OS + File Check        | List of valid audio paths     |
| Transcription       | `whisper`              | Text per audio file           |
| Grammar Check       | `language_tool_python` | Score (0-5), error count      |
| Result Aggregation  | `pandas`               | DataFrame → CSV               |

---

## Evaluation & Output

Outputs:
- `train_results.csv`
- `test_results.csv`

Each row:
- `file`: Audio filename
- `transcription`: Whisper-generated text
- `grammar_errors`: Grammar issues detected
- `grammar_score`: Score from 0 to 5

---

## Notes

- Requires **FFmpeg** (for audio processing)
- Requires **Java** (for default LanguageTool)
- Use `LanguageToolPublicAPI` if Java is not installed
- Whisper model size is configurable (`tiny`, `base`, `small`, `medium`, `large`)
"""

# Wrap the text in IPython's Markdown object so the notebook front end renders
# it as formatted Markdown in the cell output.
display(Markdown(report))

# ==========================
# Import Required Modules
# ==========================
import os
import sys
import subprocess
import pandas as pd
import whisper
import language_tool_python

# ==========================
# Check FFmpeg Installation
# ==========================
def check_ffmpeg():
    """Ensure FFmpeg is installed on the system.

    Runs ``ffmpeg -version`` and discards its output. If the executable
    cannot be launched, or it exits with a non-zero status, an installation
    hint is printed and the process is terminated with exit status 1.

    Raises:
        SystemExit: If FFmpeg is missing or fails to run.
    """
    try:
        # The output is never inspected, so send it to DEVNULL rather than
        # buffering it in pipes.
        subprocess.run(
            ["ffmpeg", "-version"],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError: the binary is missing or cannot be executed.
        # CalledProcessError: the binary ran but returned a non-zero status.
        print("FFmpeg not found. Please install it and add it to your system PATH.")
        sys.exit(1)

# Verify FFmpeg up front; check_ffmpeg() exits the process if it is unavailable,
# so nothing below runs without it.
check_ffmpeg()

# ==========================
# Load Whisper Model
# ==========================
print("Loading Whisper model...")
# Module-level model instance; transcribe_audio() reads this global.
model = whisper.load_model("base")  # Change to 'small', 'medium', or 'large' if needed

# ==========================
# Load Grammar Checker
# ==========================
print("Loading Grammar Checker...")
# Module-level checker instance; grammar_score() reads this global.
try:
    tool = language_tool_python.LanguageTool('en-US')
except Exception:
    # Fallback if Java is not available.
    # NOTE(review): this handler catches any Exception, so the message below is
    # also printed for failures unrelated to Java - confirm that is intended.
    print("Java not found. Using LanguageToolPublicAPI as fallback.")
    tool = language_tool_python.LanguageToolPublicAPI('en-US')

# ==========================
# Transcribe Audio File
# ==========================
def transcribe_audio(audio_path):
    """Run the Whisper model on one audio file and return the recognised text.

    A progress line is printed before transcription starts. Any exception is
    reported on stdout and results in an empty string instead of propagating.
    """
    print(f"Transcribing: {audio_path}")
    try:
        output = model.transcribe(audio_path)
        # .get() tolerates a result that lacks the 'text' key.
        transcript = output.get('text', '')
    except Exception as exc:
        print(f"Error transcribing {audio_path}: {exc}")
        transcript = ""
    return transcript

# ==========================
# Evaluate Grammar Score
# ==========================
def grammar_score(text, checker=None):
    """Evaluate grammatical correctness and return a score from 0 to 5.

    The score is derived from the error density, i.e. the number of matches
    reported by the checker per 100 whitespace-separated words:

        <= 2 -> 5, <= 5 -> 4, <= 10 -> 3, <= 20 -> 2, <= 30 -> 1, else 0

    Args:
        text: Text to analyse. ``None``, empty, or whitespace-only text is
            scored 0 without invoking the checker.
        checker: Optional object exposing ``check(text)`` that returns a
            sized collection of matches. Defaults to the module-level
            ``tool``.

    Returns:
        A ``(score, num_errors)`` tuple of ints. If the checker raises, the
        error is printed and ``(0, 0)`` is returned, so a checker failure
        looks the same as blank input to the caller.
    """
    if not text or not text.strip():
        return 0, 0

    # (maximum errors per 100 words, score), evaluated in ascending order.
    bands = ((2, 5), (5, 4), (10, 3), (20, 2), (30, 1))

    try:
        if checker is None:
            checker = tool
        num_errors = len(checker.check(text))
    except Exception as e:
        print(f"Error analyzing grammar: {e}")
        return 0, 0

    # Non-blank text always splits into at least one word, so the division
    # below cannot hit zero.
    num_words = len(text.split())

    # Multiply before dividing: a single rounding step keeps densities that
    # sit exactly on a band limit from drifting across it.
    errors_per_100_words = 100 * num_errors / num_words

    for limit, score in bands:
        if errors_per_100_words <= limit:
            return score, num_errors
    return 0, num_errors

# ==========================
# Process Audio Files
# ==========================
def process_audio_files(audio_files):
    """Process a list of audio files and return a DataFrame with results.

    Each path that is an existing regular file is transcribed and grammar
    checked; paths that are not are reported on stdout and skipped.

    Args:
        audio_files: Iterable of audio file paths.

    Returns:
        A DataFrame with one row per processed file and the columns
        ``file`` (base name), ``transcription``, ``grammar_errors`` and
        ``grammar_score``. The columns are present even when no file was
        processed, so a CSV written from the result always has a header.
    """
    columns = ["file", "transcription", "grammar_errors", "grammar_score"]
    results = []
    for audio_path in audio_files:
        if not os.path.isfile(audio_path):
            print(f"File not found: {audio_path}")
            continue

        text = transcribe_audio(audio_path)
        score, errors = grammar_score(text)

        results.append({
            "file": os.path.basename(audio_path),
            "transcription": text,
            "grammar_errors": errors,
            "grammar_score": score,
        })

    # Pin the schema explicitly: without it an empty result list produces a
    # DataFrame with no columns at all.
    return pd.DataFrame(results, columns=columns)

# ==========================
# Collect Audio Files
# ==========================
def collect_audio_files(folder_path):
    """Collect all supported audio files from a folder.

    Only the folder's direct entries are examined. An entry is selected when
    its name ends with ``.mp3``, ``.wav`` or ``.m4a`` (case-insensitive).

    Args:
        folder_path: Directory to scan.

    Returns:
        A list of ``folder_path``-joined paths sorted by entry name, so the
        order is reproducible across runs and platforms. An empty list is
        returned (after printing a message) when the folder is missing,
        invalid, or cannot be listed.
    """
    if not folder_path or not os.path.isdir(folder_path):
        print(f"Folder not found or invalid: {folder_path}")
        return []

    supported_formats = ('.mp3', '.wav', '.m4a')

    # Only the directory listing can fail here, so keep the try body minimal.
    try:
        entries = os.listdir(folder_path)
    except OSError as e:
        print(f"Error listing files in {folder_path}: {e}")
        return []

    # os.listdir() returns names in arbitrary order; sort them for stable output.
    # The isinstance() guard skips the bytes names produced for a bytes path.
    return [
        os.path.join(folder_path, entry)
        for entry in sorted(entries)
        if isinstance(entry, str) and entry.lower().endswith(supported_formats)
    ]

# ==========================
# Main Execution Block
# ==========================
# Script entry point: collect the train and test audio files, process each
# non-empty set, and write one CSV per set into the current working directory.
if __name__ == "__main__":
    # Define audio dataset paths (relative to the current working directory)
    train_folder = os.path.join("audios", "train")
    test_folder = os.path.join("audios", "test")

    # Collect file paths; each call returns [] when its folder is missing
    train_files = collect_audio_files(train_folder)
    test_files = collect_audio_files(test_folder)

    # Report empty datasets up front, before any processing output is printed
    if not train_files:
        print("No audio files found in 'train' folder.")
    if not test_files:
        print("No audio files found in 'test' folder.")

    # Process training data (skipped entirely when no files were found)
    if train_files:
        print("\nProcessing training files...")
        train_results = process_audio_files(train_files)
        # index=False keeps the DataFrame's row index out of the CSV
        train_results.to_csv("train_results.csv", index=False)
        print("Saved training results to 'train_results.csv'.")

    # Process testing data (skipped entirely when no files were found)
    if test_files:
        print("\nProcessing testing files...")
        test_results = process_audio_files(test_files)
        # index=False keeps the DataFrame's row index out of the CSV
        test_results.to_csv("test_results.csv", index=False)
        print("Saved testing results to 'test_results.csv'.")
