In [None]:
# 📦 Imports & Warnings
import os
import warnings

import catboost as cb
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torchaudio
import xgboost as xgb
from scipy.stats import pearsonr, rankdata
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm

# Ignore specific torchaudio warnings
warnings.filterwarnings("ignore", message=".*mel filterbank has all zero values.*")


In [None]:
📝 Brief Report

🔍 Objective:
Build a Grammar Scoring Engine that predicts a continuous grammar score (0–5) from spoken-English audio clips of roughly 45–60 seconds. Predictions are evaluated with the Pearson correlation coefficient.

🧱 Pipeline Overview:
    
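Feature Extraction:
- Wav2Vec2 embeddings, pooled over time (mean + std)
- Delta (frame-to-frame difference) features of the embeddings (mean + std)
- MFCC features (mean over time)
- Prosodic proxies: mean absolute amplitude (a rough stand-in for pitch) and signal energy
- Audio duration (seconds)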

Base Models:
XGBoost, LightGBM, CatBoost

Meta Model:
Gradient Boosting Regressor

Ensemble:
Stacking + Rank Averaging

Validation:
5-Fold CV + Hold-out 20% split for final Pearson score

In [None]:
# 🎧 Load Pre-trained Wav2Vec2 Model
# `bundle` is torchaudio's WAV2VEC2_BASE pipeline descriptor. Besides building
# the model, its `sample_rate` attribute is what `extract_features` resamples
# every clip to.
bundle = torchaudio.pipelines.WAV2VEC2_BASE
# Instantiate the network (torchaudio may download the pretrained weights the
# first time this runs).
wav2vec_model = bundle.get_model()
# Switch to evaluation mode: the model is only used as a frozen feature
# extractor. As the last expression of the cell, this also displays the model.
wav2vec_model.eval()


In [None]:
# 🎵 Feature Extraction Function
def extract_features(file_path):
    """Turn one audio file into a fixed-length 1-D feature vector.

    The clip is down-mixed to mono and resampled to ``bundle.sample_rate``
    before any feature is computed. The returned vector is the concatenation,
    in this order, of:

    1. mean of the Wav2Vec2 frame embeddings over time
    2. standard deviation of the frame embeddings over time
    3. mean of the frame-to-frame embedding differences (deltas)
    4. standard deviation of the deltas
    5. MFCCs averaged over time
    6. mean absolute amplitude of the waveform (one value)
    7. L2 norm ("energy") of the waveform (one value)
    8. clip duration in seconds (one value)

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        numpy.ndarray: 1-D feature vector.

    Note:
        Duration used to be computed as ``resampled_samples / original_rate``,
        which is wrong for any file whose rate differs from the model rate.
        If a feature cache was built with the old code and the data contains
        such files, delete the cache so train and test features stay
        consistent.
    """
    waveform, sample_rate = torchaudio.load(file_path)

    # Down-mix multi-channel audio to a single channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Everything below works at the sample rate the Wav2Vec2 bundle expects.
    target_rate = bundle.sample_rate
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
        waveform = resampler(waveform)

    with torch.inference_mode():
        # First element of the model output holds the frame embeddings;
        # squeeze(0) drops the leading batch dimension.
        features = wav2vec_model(waveform)[0].squeeze(0)
        mean_feat = features.mean(dim=0)
        std_feat = features.std(dim=0)

        # Delta features: difference between consecutive frames.
        delta_feat = features[1:] - features[:-1]
        delta_mean = delta_feat.mean(dim=0)
        delta_std = delta_feat.std(dim=0)

        # Duration in seconds. The waveform is now at `target_rate`, so that
        # is the rate to divide by (not the file's original rate).
        duration = waveform.shape[1] / target_rate

        # MFCCs averaged over time; the transform is told the real sample
        # rate instead of relying on its default.
        mfcc = torchaudio.transforms.MFCC(sample_rate=target_rate)(waveform).squeeze(0).mean(dim=1)

        # Simple amplitude statistics. The first one was historically called
        # "pitch" but is the mean absolute amplitude, not a pitch estimate.
        mean_abs_amplitude = waveform.abs().mean().unsqueeze(0)
        energy = torch.norm(waveform).unsqueeze(0)

        combined = torch.cat([
            mean_feat,
            std_feat,
            delta_mean,
            delta_std,
            mfcc,
            mean_abs_amplitude,
            energy,
            torch.tensor([duration]),
        ])
        return combined.numpy()


In [None]:
# 🧠 Load Training Data
train_df = pd.read_csv("/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv")

# 🔁 Load Cached or Extract Features
# The cache is only used when BOTH files exist; otherwise every clip is
# re-processed and the cache is (re)written.
cache_ready = all(os.path.exists(name) for name in ("X_feats.npy", "y_feats.npy"))
if not cache_ready:
    print("🎧 Extracting features from training set...")
    X, y = [], []
    n_rows = len(train_df)
    for _, record in tqdm(train_df.iterrows(), total=n_rows):
        try:
            wav_path = f"/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train/{record['filename']}"
            X.append(extract_features(wav_path))
            y.append(record['label'])
        except Exception as err:
            # Best effort: a clip that cannot be processed is reported and
            # left out of the training matrix.
            print(f"❌ Error with {record['filename']}: {err}")
    X, y = np.array(X), np.array(y)
    np.save("X_feats.npy", X)
    np.save("y_feats.npy", y)
else:
    print("🔁 Loading saved features...")
    X = np.load("X_feats.npy")
    y = np.load("y_feats.npy")


In [None]:
# ⚖️ Feature Scaling
# Learn per-feature mean/variance on the training matrix, then standardise it.
# NOTE: `X` is overwritten with its scaled version, so re-running this cell
# without first re-running the feature-loading cell would fit (and save) a
# scaler on already-scaled data.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
# Persist the fitted scaler so the same transform can be applied at inference.
joblib.dump(scaler, "scaler.pkl")


In [None]:
# 🚀 Define Base and Meta Models
# Three gradient-boosted tree regressors with matching budgets (250 rounds,
# learning rate 0.05, depth 6) serve as base learners.
xgb_reg = xgb.XGBRegressor(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    random_state=42,
)
# LightGBM ignores `subsample` unless bagging is switched on with a non-zero
# `subsample_freq`; without it the 0.8 row subsampling is a no-op.
lgb_reg = lgb.LGBMRegressor(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    subsample_freq=1,
    random_state=42,
)
cb_reg = cb.CatBoostRegressor(
    iterations=250,
    learning_rate=0.05,
    depth=6,
    verbose=0,
    random_state=42,
)
# The meta learner is trained on the base models' out-of-fold predictions.
meta_model = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
)


In [None]:
# 🔁 5-Fold Cross Validation + OOF Stacking
# Step 1: build out-of-fold (OOF) predictions. Each row of `oof_preds` is
#         predicted by base models that never saw that row during fitting.
# Step 2: fit the meta model on the OOF matrix.
# Step 3: refit every base model on ALL training rows. Without this step the
#         base models would keep whatever was fit in the last fold (only 80%
#         of the data), and those are the models used at inference time.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
base_models = (xgb_reg, lgb_reg, cb_reg)  # column order of the meta features
oof_preds = np.zeros((X.shape[0], len(base_models)))

for i, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"📂 Fold {i+1}/{kf.get_n_splits()}")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    for col, model in enumerate(base_models):
        model.fit(X_train, y_train)
        oof_preds[val_idx, col] = model.predict(X_val)

meta_model.fit(oof_preds, y)

# Final base models: trained on every available row. From here on, predicting
# rows of `X` with these models is in-sample; use `oof_preds` whenever an
# unbiased estimate on the training data is needed.
for model in base_models:
    model.fit(X, y)

joblib.dump(meta_model, "meta_model.pkl")


In [None]:
# 📈 Evaluate on Hold-out Validation Set
# A hold-out score is only meaningful if no model involved has seen the
# hold-out rows. The already-fitted `xgb_reg` / `lgb_reg` / `cb_reg` /
# `meta_model` were trained on data overlapping this split, so the whole
# stacking procedure is repeated here on fresh, unfitted clones using the
# 80% training part only. The fitted models used for inference are untouched.
# (`X` was standardised using all rows, including the hold-out ones; only
# feature scaling statistics are shared, never labels.)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

eval_bases = [clone(model) for model in (xgb_reg, lgb_reg, cb_reg)]
eval_meta = clone(meta_model)

# Out-of-fold base predictions on the training part -> meta-model training set.
eval_kf = KFold(n_splits=5, shuffle=True, random_state=42)
eval_oof = np.zeros((X_train.shape[0], len(eval_bases)))
for fit_idx, oof_idx in eval_kf.split(X_train):
    for col, base in enumerate(eval_bases):
        base.fit(X_train[fit_idx], y_train[fit_idx])
        eval_oof[oof_idx, col] = base.predict(X_train[oof_idx])
eval_meta.fit(eval_oof, y_train)

# Refit the bases on the full training part, then score the untouched 20%.
for base in eval_bases:
    base.fit(X_train, y_train)
val_base_preds = np.column_stack([base.predict(X_val) for base in eval_bases])
val_meta_preds = eval_meta.predict(val_base_preds)
val_meta_preds = np.clip(val_meta_preds, 0, 5)
corr, _ = pearsonr(y_val, val_meta_preds)
print("\n📈 Pearson Correlation on Validation Set:", corr)


In [None]:
# 🔮 Stacked + Rank Averaged Prediction Function
def stacked_predict(X):
    """Predict grammar scores by blending the stacked model with rank averaging.

    The three base models predict on ``X``; their predictions feed the meta
    model. Separately, the base predictions are converted to ranks within the
    batch, averaged, and linearly rescaled to the 0-5 range. The final score
    is the equal-weight mean of the meta prediction and the rescaled rank,
    clipped to [0, 5].

    Ranks only carry information relative to the other rows in the same call.
    When the batch has fewer than two rows, or every row receives the same
    average rank, the rank term is meaningless (it used to collapse to 0 and
    silently halve the prediction), so the clipped meta prediction is
    returned on its own.

    Args:
        X: 2-D array of already-scaled feature rows, one row per sample.

    Returns:
        numpy.ndarray: 1-D array of scores in [0, 5], one per row of ``X``.
    """
    xgb_pred = xgb_reg.predict(X)
    lgb_pred = lgb_reg.predict(X)
    cb_pred = cb_reg.predict(X)
    meta_input = np.vstack([xgb_pred, lgb_pred, cb_pred]).T
    meta_pred = meta_model.predict(meta_input)

    # Rank averaging for robustness
    ranks = (rankdata(xgb_pred) + rankdata(lgb_pred) + rankdata(cb_pred)) / 3.0

    # Degenerate batch: no spread in ranks, nothing to blend with.
    if ranks.size < 2 or ranks.max() == ranks.min():
        return np.clip(meta_pred, 0, 5)

    # Min-max rescale the averaged ranks onto the 0-5 score range.
    ranks = np.interp(ranks, [ranks.min(), ranks.max()], [0.0, 5.0])

    final = 0.5 * meta_pred + 0.5 * ranks
    return np.clip(final, 0, 5)


In [None]:
# 📤 Predict on Test Set
# Features are extracted for every clip first and the model is then called
# ONCE on the whole batch. `stacked_predict` ranks predictions relative to the
# other rows in the same call, so calling it one row at a time makes the rank
# term degenerate.
test_df = pd.read_csv("/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv")
print("\n🧪 Predicting test data...")

test_feats = []   # feature vectors of the clips that could be processed
ok_rows = []      # positions of those clips within test_df
for pos, (_, row) in enumerate(tqdm(test_df.iterrows(), total=len(test_df))):
    try:
        path = f"/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test/{row['filename']}"
        test_feats.append(extract_features(path))
        ok_rows.append(pos)
    except Exception as e:
        # Best effort: report the clip and keep the fallback score below.
        print(f"❌ Error with {row['filename']}: {e}")

# Clips that failed feature extraction keep a fallback score of 0.0.
test_preds = np.zeros(len(test_df), dtype=float)
if test_feats:
    test_matrix = scaler.transform(np.array(test_feats))
    test_preds[ok_rows] = stacked_predict(test_matrix)

# One [filename, label] entry per row of test_df, in the original order.
submission = [[name, pred] for name, pred in zip(test_df['filename'], test_preds)]

submission_df = pd.DataFrame(submission, columns=['filename', 'label'])
submission_df.to_csv("submission.csv", index=False)
print("\n✅ Submission saved as submission.csv")
