In [None]:
import os
from pathlib import Path
# Root folder of the assessment data; train.csv and test.csv are read from here.
dataset_path = r"C:\Users\DELL\Downloads\shl-intern-hiring-assessment (1)\dataset"

# Quick sanity check: show what the folder actually contains.
print("Contents of dataset folder:")
dataset_entries = os.listdir(dataset_path)
print(dataset_entries)

In [None]:
import pandas as pd
# Load both CSV splits from the dataset folder.
dataset_dir = Path(dataset_path)
train_df = pd.read_csv(dataset_dir.joinpath("train.csv"))
test_df = pd.read_csv(dataset_dir.joinpath("test.csv"))

# Column overview, then the first three training rows as the cell output.
print("Train columns:", train_df.columns.to_list())
print("\nSample training data:")
train_df.head(3)

In [None]:
import os
import pandas as pd
from pathlib import Path
from pydub import AudioSegment
from speech_recognition import Recognizer, AudioFile
import language_tool_python
from tqdm import tqdm

# Configuration: where the training WAV files live and where results are written.
AUDIO_DIR = Path(r"C:\Users\DELL\Downloads\shl-intern-hiring-assessment (1)\dataset\audios_train")
# The results CSV is placed next to the audio folder (in its parent directory).
OUTPUT_CSV = AUDIO_DIR.parent.joinpath("grammar_scores_all.csv")

# Shared tool instances, created once and reused by the helper functions below.
recognizer = Recognizer()
tool = language_tool_python.LanguageTool('en-US')

def transcribe_audio(audio_path):
    """Convert one audio file to text via ``recognizer.recognize_google``.

    Recognition is attempted with the ``en-IN`` language first; if that
    attempt raises an ordinary exception, it is retried once with the
    recognizer's default language.

    Args:
        audio_path: Location of the audio file. It is converted with ``str()``
            before being handed to ``AudioFile``.

    Returns:
        The recognized text, or the marker string
        ``"[Error: <ExceptionClassName>]"`` when the file cannot be read or
        both recognition attempts fail.

    Raises:
        No ``Exception`` subclass escapes. ``KeyboardInterrupt`` and
        ``SystemExit`` are deliberately NOT caught, so a long batch run can
        be stopped without triggering the fallback request first.
    """
    try:
        with AudioFile(str(audio_path)) as source:
            audio = recognizer.record(source)
            try:
                # First try with Indian English
                return recognizer.recognize_google(audio, language='en-IN')
            except Exception:
                # Fallback to standard English. ``Exception`` (not a bare
                # ``except``) keeps Ctrl-C / interpreter shutdown from being
                # swallowed and turned into a second network call.
                return recognizer.recognize_google(audio)
    except Exception as e:
        # Best effort: report the failure as text so the batch keeps going.
        return f"[Error: {type(e).__name__}]"

def calculate_grammar_score(text):
    """Calculate a grammar score between 0 (worst) and 1 (best).

    The score is ``1 - errors_per_word``, where the error count is the number
    of matches returned by ``tool.check(text)`` and the word count comes from
    whitespace splitting. The error rate is clamped to 1.0 so the score never
    drops below 0, and the result is rounded to 4 decimal places.

    Args:
        text: Transcription to score.

    Returns:
        A float in ``[0.0, 1.0]``. ``0.0`` is returned when ``text`` is not a
        string, starts with ``"[Error"`` (the marker for a failed
        transcription), contains no words, or when the grammar check raises.
    """
    if not isinstance(text, str) or text.startswith("[Error"):
        return 0.0

    words = text.split()
    if not words:
        # A blank transcript yields zero matches, which would otherwise be
        # rewarded with a perfect 1.0; there is nothing to assess, so score 0.
        return 0.0

    try:
        matches = tool.check(text)
        error_rate = len(matches) / len(words)
        # min() clamps the rate at 1.0, i.e. the score is floored at 0.
        return round(1 - min(error_rate, 1.0), 4)
    except Exception:
        # Best effort: a checker failure counts as the worst score, but
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return 0.0

def _sample_id_from_stem(stem):
    """Return the integer after the first underscore in ``stem``, or None.

    Example: ``"audio_1261"`` -> ``1261``. ``None`` is returned when the stem
    has no underscore or the second piece is not an integer.
    """
    try:
        return int(stem.split('_')[1])
    except (IndexError, ValueError):
        return None


def process_all_audio_files():
    """Transcribe and score every WAV file in ``AUDIO_DIR``.

    Files are processed in ascending order of the numeric id embedded in the
    filename (``<prefix>_<id>.wav``). WAV files whose name does not carry such
    an id are skipped with a printed notice instead of aborting the run.

    Returns:
        A DataFrame with columns ``sample_id``, ``filename``,
        ``transcription`` and ``grammar_score``, sorted by ``sample_id``, or
        ``None`` when there is nothing to process.

    Side effects:
        Writes the DataFrame to ``OUTPUT_CSV`` (without the index) and prints
        progress information.
    """
    # Parse each id once, keeping it next to its path for sorting and reuse.
    indexed_files = []
    skipped_names = []
    for wav_path in AUDIO_DIR.glob("*.wav"):
        if not wav_path.is_file():
            continue
        sample_id = _sample_id_from_stem(wav_path.stem)
        if sample_id is None:
            skipped_names.append(wav_path.name)
        else:
            indexed_files.append((sample_id, wav_path))

    if skipped_names:
        print(f"Skipping {len(skipped_names)} WAV file(s) without a numeric id: {skipped_names}")

    if not indexed_files:
        print(f"No WAV files found in {AUDIO_DIR}")
        return None

    # Sort numerically (a plain name sort would put audio_10 before audio_2).
    indexed_files.sort(key=lambda pair: pair[0])

    print(f"Found {len(indexed_files)} audio files to process...")

    results = []
    for sample_id, audio_path in tqdm(indexed_files, desc="Processing Audio"):
        transcription = transcribe_audio(audio_path)
        score = calculate_grammar_score(transcription)

        results.append({
            "sample_id": sample_id,
            "filename": audio_path.name,
            "transcription": transcription,
            "grammar_score": score
        })

    # Create DataFrame and save
    df = pd.DataFrame(results).sort_values('sample_id')
    df.to_csv(OUTPUT_CSV, index=False)
    return df

# Entry point: run the full batch, then report where the CSV went and preview it.
if __name__ == "__main__":
    results_df = process_all_audio_files()

    # None means no audio files were found; in that case there is nothing to show.
    if results_df is not None:
        print(
            f"\nProcessing complete! Results saved to {OUTPUT_CSV}",
            "\nSample of results:",
            sep="\n",
        )
        # NOTE(review): DataFrame.to_markdown relies on the optional `tabulate`
        # package — confirm it is installed in this environment.
        print(results_df.head(5).to_markdown(index=False))