<a href="https://colab.research.google.com/github/SriKrishnaMishra/-Grammar-Scoring-Engine-for-Voice/blob/main/Copy_of_grammar_scoring_engine_interview_presentation_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Environment Setup & Data Exploration

In [None]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

List all files in the dataset

In [None]:
# File extensions (lower-case) that are treated as audio recordings
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')


def scan_dataset_files(base_path, audio_extensions=AUDIO_EXTENSIONS):
    """Walk ``base_path`` and collect audio and CSV file paths.

    Extension matching is case-insensitive, so ``clip.WAV`` and
    ``labels.CSV`` are picked up as well. Directories and files are visited
    in sorted order, which makes the result (and therefore ``csv_files[0]``)
    reproducible across runs and file systems.

    Args:
        base_path: Directory to scan recursively. A missing directory simply
            yields two empty lists.
        audio_extensions: Iterable of lower-case extensions counted as audio.

    Returns:
        A ``(audio_paths, csv_paths)`` tuple of lists of full file paths.
    """
    audio_suffixes = tuple(audio_extensions)
    found_audio, found_csv = [], []

    for dirname, dirnames, filenames in os.walk(base_path):
        # Sorting in place steers os.walk into a deterministic traversal order
        dirnames.sort()
        for filename in sorted(filenames):
            filepath = os.path.join(dirname, filename)
            print(filepath)

            lowered = filename.lower()
            if lowered.endswith(audio_suffixes):
                found_audio.append(filepath)
            elif lowered.endswith('.csv'):
                found_csv.append(filepath)

    return found_audio, found_csv


print("üìÇ Dataset Structure:")
print("="*60)

# Define the base directory where audio and CSV files might be found
# In Colab, you typically upload files to /content/
base_dataset_path = '/content/audio_data' # Suggest a new path for user to upload files

# Check if the directory exists before walking through it
if os.path.exists(base_dataset_path):
    audio_files, csv_files = scan_dataset_files(base_dataset_path)
else:
    audio_files, csv_files = [], []
    print(f"‚ö†Ô∏è Base dataset path '{base_dataset_path}' does not exist. Please upload your files or specify the correct path.")

print("\n" + "="*60)
print(f"‚úÖ Found {len(audio_files)} audio files")
print(f"‚úÖ Found {len(csv_files)} CSV files")

üìÇ Dataset Structure:
‚ö†Ô∏è Base dataset path '/content/audio_data' does not exist. Please upload your files or specify the correct path.

‚úÖ Found 0 audio files
‚úÖ Found 0 CSV files


Load CSV data if available

In [None]:
# Only the first discovered CSV is treated as the labels file
if csv_files:
    labels_csv_path = csv_files[0]
    print(f"\nüìä Loading CSV: {labels_csv_path}")
    df_labels = pd.read_csv(labels_csv_path)

    # Quick structural overview: size, sample rows, column names and dtypes
    print(f"Shape: {df_labels.shape}")
    print("\nFirst few rows:", df_labels.head(), sep="\n")
    print("\nColumns:", df_labels.columns.tolist())
    print("\nData types:", df_labels.dtypes, sep="\n")

Install Required Packages


In [None]:
print("\nüîß Installing required packages...")
print("This will take 5-7 minutes. Please wait...")

# Install packages with the interpreter that runs this kernel, one at a time,
# so a failure can be attributed to a specific package.
import subprocess
import sys

REQUIRED_PACKAGES = [
    'openai-whisper',
    'language-tool-python',
    'textstat',
    'transformers',
    'soundfile',
    'librosa',
]

failed_packages = []
for package in REQUIRED_PACKAGES:
    completed = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', package],
        capture_output=True,
        text=True,
    )
    # pip signals failure through a non-zero exit status
    if completed.returncode != 0:
        failed_packages.append(package)
        print(f"Failed to install {package}:")
        print(completed.stderr.strip())

# Only claim success when every installation actually succeeded
if failed_packages:
    print(f"Installation failed for: {', '.join(failed_packages)}")
else:
    print("‚úÖ All packages installed successfully!")


üîß Installing required packages...
This will take 5-7 minutes. Please wait...
‚úÖ All packages installed successfully!


Import Libraries

In [None]:
import torch
import whisper
import librosa
import soundfile as sf
from pathlib import Path
import language_tool_python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import textstat

ML libraries

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

print("‚úÖ All libraries imported!")

# Query CUDA once and derive the device the models will run on
cuda_ready = torch.cuda.is_available()
active_device = torch.device('cuda' if cuda_ready else 'cpu')
print(f"üñ•Ô∏è GPU Available: {cuda_ready}")
print(f"üñ•Ô∏è Device: {active_device}")

‚úÖ All libraries imported!
üñ•Ô∏è GPU Available: False
üñ•Ô∏è Device: cpu


Speech-to-Text Module

In [None]:
class SpeechTranscriber:
    """Thin wrapper around a Whisper model that turns audio files into text."""

    def __init__(self, model_size='base'):
        """Load the Whisper checkpoint identified by ``model_size``."""
        print(f"üé§ Loading Whisper '{model_size}' model...")
        self.model = whisper.load_model(model_size)
        print("‚úÖ Whisper model loaded!")

    def transcribe(self, audio_path):
        """Return the stripped English transcript of one audio file.

        Any exception raised while transcribing is reported on stdout and an
        empty string is returned instead, so a single bad file never aborts a
        batch.
        """
        try:
            # English is forced and half precision disabled for every call
            whisper_output = self.model.transcribe(audio_path, language='en', fp16=False)
            text = whisper_output['text'].strip()
        except Exception as exc:
            print(f"‚ùå Error transcribing {audio_path}: {exc}")
            text = ""
        return text

    def transcribe_batch(self, audio_paths, show_progress=True):
        """Transcribe each path in order and return the list of transcripts.

        When ``show_progress`` is true, a progress line is printed at every
        tenth file.
        """
        collected = []
        position = 0

        for audio_path in audio_paths:
            position += 1
            if show_progress and position % 10 == 0:
                print(f"Progress: {position}/{len(audio_paths)}")
            collected.append(self.transcribe(audio_path))

        return collected

transcriber = SpeechTranscriber(model_size='base')  # shared instance; the audio-processing cell calls transcriber.transcribe()

üé§ Loading Whisper 'base' model...
‚úÖ Whisper model loaded!


Grammar Feature Extraction Module

In [None]:
class GrammarFeatureExtractor:
    """Turns a transcript into a flat dictionary of grammar-related features.

    Three signals are combined: the number of LanguageTool rule matches, the
    number of word-level edits proposed by a T5 grammar-correction model
    (skipped when that model cannot be loaded), and simple length /
    readability statistics.
    """

    # Whitespace-delimited filler words and phrases removed by ``clean_text``
    # (matched case-insensitively).
    # NOTE(review): 'like', 'right' and 'actually' are also ordinary words;
    # removing them can itself create grammar errors ("I like tea" -> "I tea").
    FILLERS = (
        'uh', 'um', 'hmm', 'ah',
        'you know', 'i mean', 'like',
        'basically', 'actually', 'literally',
        'so yeah', 'right', 'okay',
    )

    # Number of leading characters of a transcript sent to the T5 model
    MAX_GEC_CHARS = 512

    def __init__(self):
        """Start LanguageTool and try to load the T5 grammar-correction model.

        ``self.gec_available`` records whether the T5 model loaded; when it
        did not, ``get_correction_edits`` always returns 0.
        """
        print("üìù Initializing grammar tools...")

        # Grammar checker
        print("  Loading LanguageTool...")
        self.grammar_tool = language_tool_python.LanguageTool('en-US')

        # Grammar correction model
        print("  Loading T5 Grammar Correction model...")
        try:
            self.gec_tokenizer = AutoTokenizer.from_pretrained("vennify/t5-base-grammar-correction")
            self.gec_model = AutoModelForSeq2SeqLM.from_pretrained("vennify/t5-base-grammar-correction")
            self.gec_available = True
        except Exception:
            # Best effort: fall back to rule-based features only
            print("  ‚ö†Ô∏è T5 model not available, will use basic features only")
            self.gec_available = False

        print("‚úÖ Grammar tools ready!")

    def clean_text(self, text):
        """Remove spoken fillers and collapse whitespace.

        The original capitalisation is preserved so that the grammar checker
        and the correction model are not penalising lower-casing that this
        method introduced itself. Removal is repeated until nothing changes,
        so runs of adjacent fillers ("um um") disappear completely.

        Args:
            text: Raw transcript.

        Returns:
            The cleaned transcript, or ``""`` for empty / whitespace-only input.
        """
        if not text or len(text.strip()) == 0:
            return ""

        filler_phrases = {tuple(filler.split()) for filler in self.FILLERS}
        longest_phrase = max(len(phrase) for phrase in filler_phrases)

        tokens = text.split()
        changed = True
        while changed:
            changed = False
            kept = []
            index = 0
            while index < len(tokens):
                # Try the longest phrase first so "so yeah" wins over "so"
                for size in range(longest_phrase, 0, -1):
                    window = tuple(tok.lower() for tok in tokens[index:index + size])
                    if len(window) == size and window in filler_phrases:
                        index += size
                        changed = True
                        break
                else:
                    kept.append(tokens[index])
                    index += 1
            tokens = kept

        return ' '.join(tokens)

    def count_grammar_errors(self, text):
        """Return the number of LanguageTool matches for ``text``.

        Returns 0 for empty input. If the checker itself fails, the failure
        is reported on stdout and 0 is returned.
        """
        if not text or len(text.strip()) == 0:
            return 0

        try:
            matches = self.grammar_tool.check(text)
            return len(matches)
        except Exception as exc:
            print(f"  LanguageTool check failed, counting 0 errors: {exc}")
            return 0

    @staticmethod
    def _word_edit_distance(source_words, target_words):
        """Return the Levenshtein distance between two word sequences."""
        previous_row = list(range(len(target_words) + 1))
        for row_index, source_word in enumerate(source_words, 1):
            current_row = [row_index]
            for col_index, target_word in enumerate(target_words, 1):
                substitution_cost = 0 if source_word == target_word else 1
                current_row.append(min(
                    previous_row[col_index] + 1,                      # deletion
                    current_row[col_index - 1] + 1,                   # insertion
                    previous_row[col_index - 1] + substitution_cost,  # substitution
                ))
            previous_row = current_row
        return previous_row[-1]

    def get_correction_edits(self, text):
        """Count the word-level edits the T5 model applies to ``text``.

        Only the first ``MAX_GEC_CHARS`` characters are corrected, and the
        correction is compared against that same truncated span, so long
        transcripts are not charged for words the model never saw. The count
        is a true edit distance: one inserted or deleted word costs 1 instead
        of shifting every following word out of alignment.

        Returns:
            The edit count, or 0 when the input is empty, the model is
            unavailable, or generation fails (the failure is printed).
        """
        if not text or len(text.strip()) == 0 or not self.gec_available:
            return 0

        try:
            # Prepare input
            source_text = text[:self.MAX_GEC_CHARS]  # Limit length
            input_ids = self.gec_tokenizer.encode(
                "grammar: " + source_text,
                return_tensors='pt',
                max_length=512,
                truncation=True
            )

            # Generate correction
            with torch.no_grad():
                outputs = self.gec_model.generate(
                    input_ids,
                    max_length=512,
                    num_beams=4,
                    early_stopping=True
                )

            corrected = self.gec_tokenizer.decode(outputs[0], skip_special_tokens=True)

            return self._word_edit_distance(source_text.split(), corrected.split())
        except Exception as exc:
            print(f"  Grammar correction failed, counting 0 edits: {exc}")
            return 0

    def extract_features(self, text):
        """Extract all grammar features from ``text``.

        Args:
            text: Raw transcript.

        Returns:
            A dict with the same keys as ``_empty_features``. All values are
            zero when the transcript is empty or contains only fillers.
        """
        if not text or len(text.strip()) == 0:
            return self._empty_features()

        # Clean text
        cleaned_text = self.clean_text(text)

        # Basic statistics
        words = cleaned_text.split()
        word_count = len(words)
        char_count = len(cleaned_text)

        if word_count == 0:
            return self._empty_features()

        # Sentence count: terminal punctuation marks, at least one sentence
        sentence_endings = sum(cleaned_text.count(mark) for mark in '.!?')
        sentence_count = max(1, sentence_endings)

        # Grammar errors
        error_count = self.count_grammar_errors(cleaned_text)

        # Correction edits
        edit_count = self.get_correction_edits(cleaned_text)

        # Readability metrics, with neutral defaults if textstat fails
        try:
            flesch_reading = textstat.flesch_reading_ease(cleaned_text)
            flesch_kincaid = textstat.flesch_kincaid_grade(cleaned_text)
        except Exception:
            flesch_reading = 50.0
            flesch_kincaid = 8.0

        # Advanced features
        # NOTE(review): char_count includes the spaces between words, so
        # avg_word_length is slightly higher than the mean token length.
        avg_word_length = char_count / word_count
        avg_sentence_length = word_count / sentence_count

        # Punctuation features
        comma_count = cleaned_text.count(',')
        semicolon_count = cleaned_text.count(';')

        # Vocabulary complexity (unique words ratio), compared case-insensitively
        unique_words = len({word.lower() for word in words})
        vocab_diversity = unique_words / word_count

        features = {
            'word_count': word_count,
            'char_count': char_count,
            'sentence_count': sentence_count,
            'error_count': error_count,
            'edit_count': edit_count,
            'errors_per_word': error_count / word_count,
            'edits_per_word': edit_count / word_count,
            'flesch_reading_ease': flesch_reading,
            'flesch_kincaid_grade': flesch_kincaid,
            'avg_word_length': avg_word_length,
            'avg_sentence_length': avg_sentence_length,
            'comma_count': comma_count,
            'semicolon_count': semicolon_count,
            'vocab_diversity': vocab_diversity,
            'punctuation_per_sentence': (comma_count + semicolon_count) / sentence_count
        }

        return features

    def _empty_features(self):
        """Return the feature dict used for empty transcripts (all zeros)."""
        return {
            'word_count': 0, 'char_count': 0, 'sentence_count': 0,
            'error_count': 0, 'edit_count': 0, 'errors_per_word': 0,
            'edits_per_word': 0, 'flesch_reading_ease': 0,
            'flesch_kincaid_grade': 0, 'avg_word_length': 0,
            'avg_sentence_length': 0, 'comma_count': 0,
            'semicolon_count': 0, 'vocab_diversity': 0,
            'punctuation_per_sentence': 0
        }

# Initialize feature extractor
# Built once and reused by the audio-processing cell: the constructor starts
# LanguageTool and attempts to load the T5 grammar-correction model.
feature_extractor = GrammarFeatureExtractor()


üìù Initializing grammar tools...
  Loading LanguageTool...
  Loading T5 Grammar Correction model...
‚úÖ Grammar tools ready!


Process All Audio Files

In [None]:
print("üéµ Processing audio files...")
print("="*60)

# One record (file metadata + transcript + features) per successfully processed file
results = []
total_files = len(audio_files)

for i, audio_path in enumerate(audio_files, start=1):
    filename = os.path.basename(audio_path)
    print(f"\n[{i}/{total_files}] Processing: {filename}")

    try:
        # Step 1: speech-to-text
        print("  üé§ Transcribing...")
        transcript = transcriber.transcribe(audio_path)

        if transcript:
            print(f"  üìù Transcript: {transcript[:100]}...")

            # Step 2: grammar features for the transcript
            print("  üîç Extracting features...")
            features = feature_extractor.extract_features(transcript)

            # Step 3: store metadata and features as one flat record
            results.append({
                'filename': filename,
                'audio_path': audio_path,
                'transcript': transcript,
                **features
            })
            print(f"  ‚úÖ Done! Errors: {features['error_count']}, Edits: {features['edit_count']}")
        else:
            # Nothing to score without a transcript
            print("  ‚ö†Ô∏è Empty transcript, skipping...")

    except Exception as e:
        # A failure on one file must not stop the remaining files
        print(f"  ‚ùå Error: {e}")

# Build the DataFrame in one go from the collected records
df_processed = pd.DataFrame(results)
print("\n" + "="*60)
print(f"‚úÖ Successfully processed {len(df_processed)} audio files!")

# Show a preview of what was extracted
print("\nüìä Processed Data Preview:")
print(df_processed.head())

# Make sure the output directory exists before writing to it
output_dir = '/kaggle/working'
os.makedirs(output_dir, exist_ok=True)

# Persist the intermediate results so transcription does not have to be repeated
transcripts_csv = os.path.join(output_dir, 'processed_transcripts.csv')
df_processed.to_csv(transcripts_csv, index=False)
print(f"\nüíæ Saved transcripts to: {transcripts_csv}")

üéµ Processing audio files...

‚úÖ Successfully processed 0 audio files!

üìä Processed Data Preview:
Empty DataFrame
Columns: []
Index: []

üíæ Saved transcripts to: /kaggle/working/processed_transcripts.csv


 Merge with Labels and Prepare Training Data

In [None]:
print("\nüîó Merging with labels...")

# Candidate column names, in priority order, for the file key and the score
possible_filename_cols = ['filename', 'file', 'audio_file', 'file_name', 'id']
possible_score_cols = ['score', 'grammar_score', 'label', 'rating', 'target']

# Resolved column names (stay None when no labels file was loaded)
filename_col = None
score_col = None


def _unlabeled_copy(processed):
    """Return a copy of the processed frame with an all-NaN ``grammar_score``."""
    unlabeled = processed.copy()
    unlabeled['grammar_score'] = np.nan
    return unlabeled


if csv_files and len(df_labels) > 0:
    # First candidate that actually exists in the labels file wins
    filename_col = next((col for col in possible_filename_cols if col in df_labels.columns), None)
    score_col = next((col for col in possible_score_cols if col in df_labels.columns), None)

    if not (filename_col and score_col):
        print("‚ö†Ô∏è Could not find matching columns. Using processed data only.")
        df_full = _unlabeled_copy(df_processed)

    elif 'filename' not in df_processed.columns:
        # No audio was processed, so there is no key column to merge on;
        # merging would raise KeyError: 'filename'.
        print("No processed audio files to merge with labels. Using processed data only.")
        df_full = _unlabeled_copy(df_processed)

    else:
        print(f"  Using filename column: '{filename_col}'")
        print(f"  Using score column: '{score_col}'")

        # Normalise the labels before merging:
        #  - cast the key to str so it can be compared with file names
        #  - keep one row per file so the left merge cannot duplicate samples
        #  - rename up front so no redundant key column is left behind
        label_lookup = (
            df_labels[[filename_col, score_col]]
            .astype({filename_col: str})
            .drop_duplicates(subset=filename_col, keep='first')
            .rename(columns={filename_col: 'filename', score_col: 'grammar_score'})
        )

        # Left merge keeps every processed file, labelled or not
        df_full = df_processed.merge(label_lookup, on='filename', how='left')

        print("\n‚úÖ Merged successfully!")
        print(f"  Total samples: {len(df_full)}")
        print(f"  Samples with labels: {df_full['grammar_score'].notna().sum()}")
else:
    print("‚ö†Ô∏è No labels available. Will use unsupervised approach.")
    df_full = _unlabeled_copy(df_processed)


üîó Merging with labels...
‚ö†Ô∏è No labels available. Will use unsupervised approach.


 Display statistics

In [None]:
print("\nüìà Score Distribution:")

labelled_scores = df_full['grammar_score']
if not labelled_scores.notna().any():
    print("No labels available for visualization.")
else:
    print(labelled_scores.describe())

    # Two panels side by side: histogram on the left, box plot on the right
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

    labelled_scores.hist(bins=20, edgecolor='black', ax=hist_ax)
    hist_ax.set_xlabel('Grammar Score')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Score Distribution')

    df_full.boxplot(column='grammar_score', ax=box_ax)
    box_ax.set_ylabel('Grammar Score')
    box_ax.set_title('Score Box Plot')

    fig.tight_layout()
    fig.savefig('/kaggle/working/score_distribution.png', dpi=150, bbox_inches='tight')
    plt.show()




üìà Score Distribution:
No labels available for visualization.


Create the Audio Upload Directory (re-run the notebook from the top after uploading files)

In [None]:
import os

# Directory the dataset-listing cell scans for audio and CSV files
base_dataset_path = '/content/audio_data'

# Report whether the upload directory was already there or had to be created
directory_was_present = os.path.exists(base_dataset_path)
if directory_was_present:
    print(f"Directory already exists: {base_dataset_path}")
else:
    os.makedirs(base_dataset_path)
    print(f"Created directory: {base_dataset_path}")

print("Please upload your audio files (.wav, .mp3, etc.) into this directory now.")

Created directory: /content/audio_data
Please upload your audio files (.wav, .mp3, etc.) into this directory now.


Model Training

In [None]:
print("\nü§ñ Training machine learning models...")
print("="*60)

# Number of rows carrying a ground-truth score; decides heuristic vs. supervised path
labeled_count = int(df_full['grammar_score'].notna().sum())

# Figures below are written here; make sure the directory exists
os.makedirs('/kaggle/working', exist_ok=True)

# Check if we have labels
if labeled_count < 10:
    print("‚ö†Ô∏è Not enough labeled samples for training.")
    print("Creating heuristic scoring model instead...")

    # Heuristic scoring function
    def heuristic_score(row):
        """Score one row on a 0-5 scale from its error rates and readability."""
        base_score = 5.0

        # Penalize errors
        error_penalty = row['errors_per_word'] * 10
        edit_penalty = row['edits_per_word'] * 5

        # Bonus for good readability
        readability_bonus = 0
        if 60 <= row['flesch_reading_ease'] <= 80:
            readability_bonus = 0.3

        # Calculate final score, clamped to the valid range
        score = base_score - error_penalty - edit_penalty + readability_bonus
        return max(0, min(5, score))

    if df_full.empty:
        # Nothing to score: create the column explicitly instead of relying on
        # what DataFrame.apply happens to return for an empty frame
        df_full['predicted_score'] = pd.Series(dtype=float)
    else:
        df_full['predicted_score'] = df_full.apply(heuristic_score, axis=1)
    print("‚úÖ Heuristic scoring applied!")

else:
    # We have labels - train ML models
    print(f"‚úÖ Found {labeled_count} labeled samples")

    # No earlier cell defines available_features, so derive it here from the
    # feature names the extractor produces (keeping only columns that exist).
    if 'available_features' not in globals():
        available_features = [
            name for name in feature_extractor._empty_features()
            if name in df_full.columns
        ]

    # Prepare data
    df_train = df_full[df_full['grammar_score'].notna()].copy()
    X = df_train[available_features].fillna(0)
    y = df_train['grammar_score']

    # Handle any infinite values
    X = X.replace([np.inf, -np.inf], 0)

    # Split data, stratified by score tercile when the sample allows it
    try:
        score_strata = pd.qcut(y, q=3, duplicates='drop')
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=score_strata
        )
    except ValueError as exc:
        # Too few samples per stratum (or too small a test set) for stratification
        print(f"  Stratified split not possible ({exc}); using a random split instead.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

    print(f"\nTraining set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")

    # Standardize features (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train multiple models
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5),
        'Ridge Regression': Ridge(alpha=1.0),
        'Lasso Regression': Lasso(alpha=0.1)
    }

    results_dict = {}

    for name, model in models.items():
        print(f"\nüîß Training {name}...")

        # Train
        model.fit(X_train_scaled, y_train)

        # Predict
        y_pred = model.predict(X_test_scaled)

        # Evaluate
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        results_dict[name] = {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': y_pred
        }

        print(f"  MAE:  {mae:.3f}")
        print(f"  RMSE: {rmse:.3f}")
        print(f"  R¬≤:   {r2:.3f}")

    # Select best model (lowest test MAE)
    best_model_name = min(results_dict.keys(), key=lambda k: results_dict[k]['mae'])
    best_model = results_dict[best_model_name]['model']

    print(f"\nüèÜ Best Model: {best_model_name}")
    print(f"   MAE: {results_dict[best_model_name]['mae']:.3f}")

    # Predict on all data, labelled or not
    X_all = df_full[available_features].fillna(0).replace([np.inf, -np.inf], 0)
    X_all_scaled = scaler.transform(X_all)
    df_full['predicted_score'] = best_model.predict(X_all_scaled)

    # Clip predictions to valid range
    df_full['predicted_score'] = df_full['predicted_score'].clip(0, 5)

    # Visualize results
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.scatter(y_test, results_dict[best_model_name]['predictions'], alpha=0.6)
    # Diagonal reference line: perfect predictions would fall on it
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Actual Score')
    plt.ylabel('Predicted Score')
    plt.title(f'{best_model_name} - Predictions vs Actual')
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    model_names = list(results_dict.keys())
    maes = [results_dict[m]['mae'] for m in model_names]
    plt.barh(model_names, maes)
    plt.xlabel('Mean Absolute Error')
    plt.title('Model Comparison')
    plt.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig('/kaggle/working/model_performance.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Feature importance (for tree-based models)
    if best_model_name in ['Random Forest', 'Gradient Boosting']:
        feature_importance = pd.DataFrame({
            'feature': available_features,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nüìä Top 10 Most Important Features:")
        print(feature_importance.head(10))

        plt.figure(figsize=(10, 6))
        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
        plt.title('Top 10 Feature Importances')
        plt.tight_layout()
        plt.savefig('/kaggle/working/feature_importance.png', dpi=150, bbox_inches='tight')
        plt.show()



ü§ñ Training machine learning models...
‚ö†Ô∏è Not enough labeled samples for training.
Creating heuristic scoring model instead...
‚úÖ Heuristic scoring applied!


Generate Submission File

In [None]:
print("\n‚ÜóÔ∏è Creating submission file...")

# Check if df_full is empty or has no processed data
if df_full.empty or 'predicted_score' not in df_full.columns:
    print("\n‚ö†Ô∏è No processed audio files found or predicted scores available.")
    print("Please ensure you have uploaded your audio files to '/content/audio_data' ")
    print("and re-run all cells from the 'Environment Setup & Data Exploration' section.")
else:
    # The target directory may not exist yet (e.g. outside Kaggle)
    os.makedirs('/kaggle/working', exist_ok=True)

    # Create submission DataFrame: one row per file, score under 'grammar_score'
    submission = df_full[['filename', 'predicted_score']].copy()
    submission = submission.rename(columns={'predicted_score': 'grammar_score'})

    # Round scores to reasonable precision
    submission['grammar_score'] = submission['grammar_score'].round(2)

    # Display sample
    print("\nüìã Submission Preview:")
    print(submission.head(10))

    print(f"\nüìä Submission Statistics:")
    print(submission['grammar_score'].describe())

    # Save submission
    submission.to_csv('/kaggle/working/submission.csv', index=False)
    print("\n‚úÖ Submission saved to: /kaggle/working/submission.csv")

    # Also save detailed results
    detailed_results = df_full[['filename', 'transcript', 'error_count', 'edit_count',
                                 'errors_per_word', 'flesch_reading_ease', 'predicted_score']].copy()
    detailed_results.to_csv('/kaggle/working/detailed_results.csv', index=False)
    print("‚úÖ Detailed results saved to: /kaggle/working/detailed_results.csv")

    # The final report lists full_dataset.csv among the outputs, so write it here
    df_full.to_csv('/kaggle/working/full_dataset.csv', index=False)
    print("‚úÖ Full dataset saved to: /kaggle/working/full_dataset.csv")


‚ÜóÔ∏è Creating submission file...

‚ö†Ô∏è No processed audio files found or predicted scores available.
Please ensure you have uploaded your audio files to '/content/audio_data' 
and re-run all cells from the 'Environment Setup & Data Exploration' section.


Final Visualizations and Analysis


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

print("\nüìä Creating final visualizations...")

# Plot only when df_full exists and actually holds rows
if 'df_full' in locals() and not df_full.empty:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Panel 1: histogram of the predicted scores
    score_ax = axes[0, 0]
    score_ax.hist(df_full['predicted_score'], bins=30, edgecolor='black', alpha=0.7)
    score_ax.set_xlabel('Predicted Grammar Score')
    score_ax.set_ylabel('Frequency')
    score_ax.set_title('Distribution of Predicted Scores')
    score_ax.grid(True, alpha=0.3)

    # Panels 2-4: one feature each, scattered against the predicted score
    scatter_panels = [
        (axes[0, 1], 'error_count', 'Grammar Error Count', 'Grammar Errors vs Score', {}),
        (axes[1, 0], 'flesch_reading_ease', 'Flesch Reading Ease', 'Readability vs Score', {'c': 'green'}),
        (axes[1, 1], 'word_count', 'Word Count', 'Word Count vs Score', {'c': 'orange'}),
    ]
    for panel_ax, feature_column, x_label, panel_title, scatter_style in scatter_panels:
        panel_ax.scatter(df_full[feature_column], df_full['predicted_score'], alpha=0.5, **scatter_style)
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel('Predicted Score')
        panel_ax.set_title(panel_title)
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('/kaggle/working/final_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

    print("‚úÖ Visualizations saved!")
else:
    print("\n‚ö†Ô∏è df_full DataFrame is not defined or is empty.")
    print("Please ensure audio files have been uploaded to '/content/audio_data' and all previous cells (especially 'Process All Audio Files' and 'Merge with Labels') have been successfully executed.")


üìä Creating final visualizations...

‚ö†Ô∏è df_full DataFrame is not defined or is empty.
Please ensure audio files have been uploaded to '/content/audio_data' and all previous cells (especially 'Process All Audio Files' and 'Merge with Labels') have been successfully executed.


Summary Report

In [None]:
def count_scores_by_range(scores):
    """Count how many scores fall into each unit-wide band from 0 to 5.

    The lowest band is closed on the left, so a score of exactly 0 is counted
    in '0-1' instead of being dropped.

    Args:
        scores: pandas Series (or array-like) of numeric scores.

    Returns:
        A Series indexed by band label ('0-1' ... '4-5') with the counts.
    """
    bands = pd.cut(
        scores,
        bins=[0, 1, 2, 3, 4, 5],
        labels=['0-1', '1-2', '2-3', '3-4', '4-5'],
        include_lowest=True,
    )
    return bands.value_counts().sort_index()


def list_existing_outputs(output_dir, expected_names):
    """Return the full paths of the expected files that exist in ``output_dir``."""
    candidate_paths = [os.path.join(output_dir, name) for name in expected_names]
    return [path for path in candidate_paths if os.path.isfile(path)]


print("\n" + "="*60)
print("üìã GRAMMAR SCORING ENGINE - FINAL REPORT")
print("="*60)

# Check if df_full exists and is not empty
if 'df_full' in locals() and not df_full.empty:
    print(f"\nüìä Dataset Summary:")
    print(f"  Total audio files processed: {len(df_full)}")
    print(f"  Average transcript length: {df_full['word_count'].mean():.1f} words")
    print(f"  Average grammar errors: {df_full['error_count'].mean():.2f}")
    print(f"  Average correction edits: {df_full['edit_count'].mean():.2f}")

    print(f"\n‚≠ê Score Summary:")
    print(f"  Mean score: {df_full['predicted_score'].mean():.2f}")
    print(f"  Median score: {df_full['predicted_score'].median():.2f}")
    print(f"  Std deviation: {df_full['predicted_score'].std():.2f}")
    print(f"  Min score: {df_full['predicted_score'].min():.2f}")
    print(f"  Max score: {df_full['predicted_score'].max():.2f}")

    print(f"\nüìà Score Distribution:")
    print(count_scores_by_range(df_full['predicted_score']))

    if df_full['grammar_score'].notna().any():
        # Ensure results_dict and best_model_name are defined if labels were used
        if 'results_dict' in locals() and 'best_model_name' in locals() and best_model_name in results_dict:
            print(f"\nüéØ Model Performance:")
            print(f"  Mean Absolute Error: {results_dict[best_model_name]['mae']:.3f}")
            print(f"  Root Mean Squared Error: {results_dict[best_model_name]['rmse']:.3f}")
            print(f"  R¬≤ Score: {results_dict[best_model_name]['r2']:.3f}")
        else:
            print("\n‚ö†Ô∏è Model performance metrics not available (no labeled data or model not trained).")
else:
    print("\n‚ö†Ô∏è No processed audio files or data available for reporting.")
    print("Please ensure you have uploaded your audio files to '/content/audio_data' ")
    print("and re-run all cells from the 'Environment Setup & Data Exploration' section.")

# Report only the files that are really on disk
report_dir = '/kaggle/working'
expected_outputs = [
    'submission.csv',
    'detailed_results.csv',
    'processed_transcripts.csv',
    'full_dataset.csv',
]
generated_outputs = list_existing_outputs(report_dir, expected_outputs)
if os.path.isdir(report_dir):
    # Visualizations are saved as PNG files in the same directory
    generated_outputs += sorted(
        os.path.join(report_dir, name)
        for name in os.listdir(report_dir)
        if name.lower().endswith('.png')
    )

print(f"\nüíæ Output Files Generated:")
if generated_outputs:
    for output_path in generated_outputs:
        print(f"  ‚úÖ {output_path}")
else:
    print(f"  (none - no output files were found in {report_dir})")

print("\n" + "="*60)
print("üéâ GRAMMAR SCORING ENGINE COMPLETE!")
print("="*60)


üìã GRAMMAR SCORING ENGINE - FINAL REPORT

‚ö†Ô∏è No processed audio files or data available for reporting.
Please ensure you have uploaded your audio files to '/content/audio_data' 
and re-run all cells from the 'Environment Setup & Data Exploration' section.

üíæ Output Files Generated:
  ‚úÖ /kaggle/working/submission.csv
  ‚úÖ /kaggle/working/detailed_results.csv
  ‚úÖ /kaggle/working/processed_transcripts.csv
  ‚úÖ /kaggle/working/full_dataset.csv
  ‚úÖ /kaggle/working/*.png (visualizations)

üéâ GRAMMAR SCORING ENGINE COMPLETE!
