In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# SHL Intern Hiring Assessment 2025
## Grammar Score Prediction from Speech Audio (Offline Baseline)

This notebook predicts grammar scores from audio recordings using:
- MFCC + spectral audio features (fully offline)
- LightGBM regression

This solution is stable on Kaggle and serves as a strong baseline.


In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import librosa
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


## Load Dataset

The dataset structure is:
- CSVs: `dataset/csvs/train.csv`, `dataset/csvs/test.csv`
- Audio: `dataset/audios/train/`, `dataset/audios/test/`


In [2]:
BASE_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset"

TRAIN_AUDIO_DIR = f"{BASE_PATH}/audios/train"
TEST_AUDIO_DIR  = f"{BASE_PATH}/audios/test"

train = pd.read_csv(f"{BASE_PATH}/csvs/train.csv")
test  = pd.read_csv(f"{BASE_PATH}/csvs/test.csv")

print(train.shape, test.shape)
train.head(), test.head()


(409, 2) (197, 1)


(    filename  label
 0  audio_173    3.0
 1  audio_138    3.0
 2  audio_127    2.0
 3   audio_95    2.0
 4   audio_73    3.5,
     filename
 0  audio_141
 1  audio_114
 2   audio_17
 3   audio_76
 4  audio_156)

## Dataset Columns

- `filename` → audio filename (without `.wav`)
- `label` → grammar score (0–5)


In [3]:
def extract_features(path):
    y, sr = librosa.load(path, sr=16000)

    # Pad very short audio
    if len(y) < sr:
        y = np.pad(y, (0, sr - len(y)))

    # MFCCs
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfcc_mean = mfcc.mean(axis=1)
    mfcc_std = mfcc.std(axis=1)

    # Spectral features
    zcr = librosa.feature.zero_crossing_rate(y).mean()
    centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
    bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()
    rms = librosa.feature.rms(y=y).mean()

    features = np.hstack([
        mfcc_mean,
        mfcc_std,
        zcr,
        centroid,
        bandwidth,
        rolloff,
        rms
    ])

    # Replace NaN / inf safely
    return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)


## Extract Training Features


In [4]:
X = []
y = []

for _, row in tqdm(train.iterrows(), total=len(train)):
    audio_path = f"{TRAIN_AUDIO_DIR}/{row['filename']}.wav"
    X.append(extract_features(audio_path))
    y.append(row['label'])

X = np.vstack(X)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)


  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated

X shape: (409, 45)
y shape: (409,)





## Extract Test Features


In [6]:
X_test = []

for _, row in tqdm(test.iterrows(), total=len(test)):
    audio_path = f"{TEST_AUDIO_DIR}/{row['filename']}.wav"
    X_test.append(extract_features(audio_path))

X_test = np.vstack(X_test)
print("X_test shape:", X_test.shape)


  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  y, sr = librosa.load(path, sr=16000)
	Deprecated

X_test shape: (197, 45)





## Train–Validation Split


In [7]:
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Train LightGBM Model


In [8]:
model = lgb.LGBMRegressor(
    n_estimators=800,
    learning_rate=0.03,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

model.fit(X_tr, y_tr)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4950
[LightGBM] [Info] Number of data points in the train set: 327, number of used features: 45
[LightGBM] [Info] Start training from score 2.868502


## Validation Performance


In [10]:
val_preds = model.predict(X_val)
rmse = mean_squared_error(y_val, val_preds, squared=False)
print("Validation RMSE:", rmse)


Validation RMSE: 0.7503357742552481


## Generate Submission File


In [11]:
test_preds = model.predict(X_test)

submission = pd.DataFrame({
    "filename": test["filename"],
    "label": np.clip(test_preds, 0, 5)
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,filename,label
0,audio_141,2.424525
1,audio_114,2.651411
2,audio_17,2.828472
3,audio_76,3.536204
4,audio_156,3.614038
