In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_49.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_67_1.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_90.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_77.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_20_1.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_66.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_54.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_106_1.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_42.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_81.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_72.wav
/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test/audio_107.wav
/kaggle/input/shl-in

In [4]:
import os
import warnings
from transformers import logging

# 1. Suppress Transformers specific warnings
logging.set_verbosity_error()

# 2. Suppress Python warnings
warnings.filterwarnings('ignore')



# 1. Executive Summary
This project implements an automated AI pipeline to assess the language proficiency of audio recordings. The solution moves beyond simple text transcription by integrating Acoustic Physics (how it was said) with a "Dual-Brain" Embedding System (what was said). This multi-modal approach captures fluency, confidence, and semantic depth, achieving a robust RMSE on the training set.

# 2. Methodology & Reasoning
## A. The Challenge
The core challenge was to predict a grammar score (1-5) from raw audio with a small dataset (~400 samples). A simple "Text-only" approach fails because it ignores stuttering and hesitation. A simple "Audio-only" approach fails because it ignores vocabulary and syntax.

## B. Feature Engineering Strategy
We extract a dense 40-dimensional feature vector for each audio file using a three-stage process:

### Acoustic Physics (Librosa):

We extract 8 fundamental signal metrics including Silence Ratio (fluency), Pitch Stability (confidence), and Speaking Rate. These features help distinguish between a confident speaker and one who is struggling, even if their words are similar.

### Linguistic Analysis (Whisper + T5):

Audio is transcribed using OpenAI Whisper (Base).

We use a T5 Grammar Correction model to generate a "perfect" version of the transcript. The Levenshtein distance between the user's speech and the T5 correction provides a quantitative "Error Density" score.

### The "Dual-Brain" Semantic System:

To capture meaning and context without overfitting, we use two distinct Transformer models:

MiniLM-L6 (Brain 1): Fast and efficient, capturing broad keywords and surface-level context.

MPNet-Base (Brain 2): Deep and precise, capturing subtle semantic nuances that smaller models miss.

## C. Architecture: The "Compress & Ensemble" Approach
Raw embeddings produce over 1,100 dimensions, which would cause massive overfitting on a dataset of 400 samples. Our solution:

Compression (PCA): We apply Principal Component Analysis to compress each "Brain" down to 16 high-variance components. This retains the linguistic signal while discarding noise.

Hybrid Ensemble: We combine XGBoost (for non-linear patterns) and Ridge Regression (for linear stability).

# 3. Evaluation Results
Training RMSE: 0.2793

Performance: The model uses a weighted average (60% XGBoost, 40% Ridge). This diversity ensures that the model doesn't memorize the training data but learns generalizable rules of language proficiency.

In [5]:
# ==========================================
# 0. ENVIRONMENT SETUP
# ==========================================

# 1. Install Python Libraries
# Why? These are not standard in Kaggle kernels but are required for our pipeline.
# - language-tool-python: Checks for grammar rule violations.
# - textstat: Calculates readability indices (Flesch-Kincaid).
# - sentence-transformers: Loads our "Dual-Brain" models (MiniLM, MPNet).
!pip install -q openai-whisper language-tool-python textstat librosa xgboost sentence-transformers transformers torch Levenshtein scikit-learn

# 2. Install & Configure Java
# Why? The 'language-tool-python' library relies on a Java backend server to run efficiently.
# Kaggle's default environment sometimes has path issues or missing Java runtimes.
# Explicitly installing OpenJDK 17 ensures the grammar checker works without crashing.
import os
print("Configuring Java Environment for Grammar Tool...")
os.system('apt-get update -qq')
os.system('apt-get install -y openjdk-17-jdk-headless -qq > /dev/null')
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"

print("Installation Complete.")

Configuring Java Environment for Grammar Tool...


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Installation Complete.


In [6]:
# ==========================================
# 1. SETUP & IMPORTS
# ==========================================
import numpy as np
import pandas as pd
import librosa
import whisper
import language_tool_python
import textstat
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import Levenshtein
import torch
import warnings
import os
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Suppress warnings for cleaner logs
warnings.filterwarnings('ignore')

# Detect Hardware (GPU is crucial for Whisper/Embeddings speed)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Hardware Detected: {DEVICE}")

# ==========================================
# 2. MODEL LOADING
# ==========================================
print("⏳ Loading AI Models...")

# A. Audio Brain (Whisper)
asr_model = whisper.load_model("base", device=DEVICE)

# B. Grammar Tools
grammar_tool = language_tool_python.LanguageTool('en-US')
corrector = pipeline("text2text-generation", model="vennify/t5-base-grammar-correction", device=0 if DEVICE=="cuda" else -1)

# C. The "Dual-Brain" Embedding System
# Brain 1: Fast & Broad Context
model_minilm = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)

# Brain 2: Deep Semantic Understanding
model_mpnet = SentenceTransformer('all-mpnet-base-v2', device=DEVICE)

print("All Models Loaded Successfully.")

2025-12-18 07:54:49.686488: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766044489.707289     990 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766044489.713622     990 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Hardware Detected: cuda
⏳ Loading AI Models...
All Models Loaded Successfully.


In [7]:
# ==========================================
# 3. FEATURE EXTRACTION LOGIC
# ==========================================
def extract_features(audio_path):
    """
    Extracts a multimodal feature vector from an audio file.
    Returns: [8 Scalars] + [MiniLM Embeddings] + [MPNet Embeddings]
    """
    try:
        # --- A. ACOUSTICS ---
        y, sr = librosa.load(audio_path, sr=16000)
        duration = librosa.get_duration(y=y, sr=sr)
        
        # Silence Ratio (Fluency metric)
        intervals = librosa.effects.split(y, top_db=20)
        speech_time = sum(end - start for start, end in intervals) / sr
        silence_ratio = (duration - speech_time) / duration if duration > 0 else 0
        
        # Pitch Stats (Intonation metric)
        f0 = librosa.yin(y, fmin=65, fmax=2093)
        f0 = f0[f0 > 0]
        pitch_std = np.std(f0) if len(f0) > 0 else 0
        
        # --- B. TRANSCRIPTION ---
        result = asr_model.transcribe(y)
        text = result['text'].strip()
        
        # Safety check: if audio is silent/empty, return zero-vector
        # Total dims = 8 scalars + 384 (MiniLM) + 768 (MPNet) = 1160
        if len(text) == 0: return [0] * 1160

        # --- C. GRAMMAR & COMPLEXITY ---
        # 1. "Distance to Perfection" (T5 Comparison)
        corrected_text = corrector("grammar: " + text, max_length=128)[0]['generated_text']
        grammar_similarity = Levenshtein.ratio(text, corrected_text)
        edits = Levenshtein.distance(text, corrected_text)
        edit_density = edits / max(1, len(text.split()))

        # 2. Rule-Based Errors
        matches = grammar_tool.check(text)
        error_density = len(matches) / max(1, len(text.split()))
        
        # 3. Readability Score
        complexity = textstat.flesch_kincaid_grade(text)
        wpm = len(text.split()) / (duration / 60) if duration > 0 else 0
        
        # --- D. DUAL EMBEDDINGS ---
        emb1 = model_minilm.encode(text).tolist()   # 384 dims
        emb2 = model_mpnet.encode(text).tolist()    # 768 dims
        
        # --- E. COMBINE ---
        features = [
            silence_ratio, pitch_std, 
            grammar_similarity, edit_density, 
            error_density, complexity, wpm, len(text.split())
        ]
        features.extend(emb1)
        features.extend(emb2)
        
        return features

    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return [0] * 1160

In [9]:
# ==========================================
# 4. EXECUTION: FEATURE EXTRACTION (TRAIN)
# ==========================================
print("\n--- Phase 1: Feature Extraction (Train) ---")
# Import tqdm for progress bars
from tqdm.auto import tqdm

# Adjust paths for Kaggle environment
TRAIN_CSV = '/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv'
AUDIO_DIR_TRAIN = '/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/train'

df_train = pd.read_csv(TRAIN_CSV)
X_train = []
y_train = []

# Wrap the loop with tqdm() to show a progress bar
print("Extracting features...")
for index, row in tqdm(df_train.iterrows(), total=len(df_train), desc="Processing Train"):
    filename = row['filename']
    label = row['label']
    
    if not filename.endswith('.wav'): 
        filename += '.wav'
    
    file_path = os.path.join(AUDIO_DIR_TRAIN, filename)
    
    X_train.append(extract_features(file_path))
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# ==========================================
# 5. PREPROCESSING & PCA
# ==========================================
print("\n--- Phase 2: Preprocessing & PCA ---")

X_df = pd.DataFrame(X_train)
y = np.array(y_train)

# Indices for splitting the massive vector
scalar_cols = list(range(8))
emb1_cols = list(range(8, 392))       # MiniLM
emb2_cols = list(range(392, 1160))    # MPNet

# 1. Scale Scalars
scaler = StandardScaler()
X_scalars = scaler.fit_transform(X_df.iloc[:, scalar_cols].values)

# 2. Compress Embeddings (PCA)
# We compress each "Brain" separately to 16 dimensions to prevent overfitting
print("Fitting PCA on Embeddings...")
pca1 = PCA(n_components=16, random_state=42)
X_emb1 = pca1.fit_transform(X_df.iloc[:, emb1_cols].values)

pca2 = PCA(n_components=16, random_state=42)
X_emb2 = pca2.fit_transform(X_df.iloc[:, emb2_cols].values)

# 3. Combine: 8 + 16 + 16 = 40 Features
X_final_train = np.hstack([X_scalars, X_emb1, X_emb2])
print(f"Final Feature Matrix Shape: {X_final_train.shape}")

# ==========================================
# 6. MODEL TRAINING & RMSE CALCULATION
# ==========================================
print("\n--- Phase 3: Training Ensemble ---")

# Model 1: XGBoost (Pattern Recognition)
model_xgb = xgb.XGBRegressor(
    n_estimators=600, learning_rate=0.015, max_depth=5,
    min_child_weight=1, subsample=0.7, colsample_bytree=0.7, 
    n_jobs=-1, random_state=42
)
model_xgb.fit(X_final_train, y)

# Model 2: Ridge (Linear Stability)
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_final_train, y)

print("Models Trained.")

# --- COMPULSORY: CALCULATE TRAINING RMSE ---
print("Calculating Training RMSE...")
p1_train = model_xgb.predict(X_final_train)
p2_train = model_ridge.predict(X_final_train)

# Ensemble Weights: 60% XGB, 40% Ridge
train_final_preds = (p1_train * 0.6) + (p2_train * 0.4)

rmse = np.sqrt(mean_squared_error(y, train_final_preds))
print(f"FINAL TRAINING RMSE SCORE: {rmse:.4f}")


--- Phase 1: Feature Extraction (Train) ---
Extracting features...


Processing Train:   0%|          | 0/409 [00:00<?, ?it/s]


--- Phase 2: Preprocessing & PCA ---
Fitting PCA on Embeddings...
Final Feature Matrix Shape: (409, 40)

--- Phase 3: Training Ensemble ---
Models Trained.
Calculating Training RMSE...
FINAL TRAINING RMSE SCORE: 0.2793


In [10]:
# ==========================================
# 7. TESTING & SUBMISSION
# ==========================================
print("\n--- Phase 4: Feature Extraction (Test) ---")
from tqdm.auto import tqdm

TEST_CSV = '/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/test.csv'
AUDIO_DIR_TEST = '/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test'

df_test = pd.read_csv(TEST_CSV)
X_test_raw = []

# --- EXTRACT FEATURES (With Progress Bar) ---
print("Extracting features from Test Data...")
for index, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Processing Test"):
    filename = row['filename']
    if not filename.endswith('.wav'): filename += '.wav'
    file_path = os.path.join(AUDIO_DIR_TEST, filename)
    
    X_test_raw.append(extract_features(file_path))

X_test_df = pd.DataFrame(np.array(X_test_raw))

# --- APPLY TRANSFORMS ---
print("Applying Transforms to Test Data...")
# Use the Scaler/PCAs fitted on Training Data
X_test_scalars = scaler.transform(X_test_df.iloc[:, scalar_cols].values)
X_test_emb1 = pca1.transform(X_test_df.iloc[:, emb1_cols].values)
X_test_emb2 = pca2.transform(X_test_df.iloc[:, emb2_cols].values)

X_final_test = np.hstack([X_test_scalars, X_test_emb1, X_test_emb2])

# --- FINAL PREDICTION ---
print("Generating Submission...")
p1 = model_xgb.predict(X_final_test)
p2 = model_ridge.predict(X_final_test)

# Ensemble Weights: 60% XGB, 40% Ridge
final_preds = (p1 * 0.6) + (p2 * 0.4)

# Clip to Valid Range (0-5)
final_preds = np.clip(final_preds, 0, 5)

submission = pd.DataFrame({
    'filename': df_test['filename'], 
    'label': final_preds
})

submission.to_csv('submission.csv', index=False)
print("submission.csv saved successfully!")
print(submission.head())


--- Phase 4: Feature Extraction (Test) ---
Extracting features from Test Data...


Processing Test:   0%|          | 0/197 [00:00<?, ?it/s]

Applying Transforms to Test Data...
Generating Submission...
submission.csv saved successfully!
    filename     label
0  audio_141  2.403084
1  audio_114  2.937991
2   audio_17  2.240748
3   audio_76  4.093138
4  audio_156  2.747084
