In [None]:
# Upgrade Hugging Face `transformers` in the kernel environment (no version pin is given).
!pip install --upgrade transformers

In [None]:
# Clone the SFM-models repository into the current working directory.
# Presumably this is the checkout that provides the `sfm_extractor` package imported later — confirm.
!git clone https://github.com/CodeVault-girish/SFM-models.git


In [None]:
import sys

# Make the SFM-models checkout importable.
# The membership guard keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SFM_REPO_DIR = "/kaggle/working/SFM-models"
if SFM_REPO_DIR not in sys.path:
    sys.path.append(SFM_REPO_DIR)


In [47]:
# `model_list` prints the numbered menu of available extractors;
# `extract_from` is called in a later cell with one of those numbers passed as a string.
from sfm_extractor.extractor import model_list, extract_from
model_list()


Available models:
1. Trillsson
2. YAMNet
3. Facebook MMS-1B
4. SpeechBrain x-vector
5. Facebook HuBERT-base-ls960
6. Microsoft WavLM-base
7. Facebook Wav2Vec2-XLS-R-1B
8. Facebook Wav2Vec2-base
9. OpenAI Whisper-base
10. Microsoft UniSpeech-SAT-base-100h-Libri-ft
11. speechbrain/spkrec-ecapa-voxceleb


In [None]:
# Create (or update the timestamp of) the two Whisper embedding CSVs in the current directory.
# NOTE(review): `touch` leaves 0-byte files when they do not exist yet; presumably
# extract_from() overwrites them — confirm, since reading a still-empty CSV with pandas fails.
!touch audio_whisper_train_embeddings.csv
!touch audio_whisper_test_embeddings.csv


In [None]:
# Run extractor "9" (listed as "OpenAI Whisper-base" in the model menu) over the
# test audio directory and write the result to the test embeddings CSV, on the GPU.
extract_from(
    "9",
    "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test",
    output_file="/kaggle/working/audio_whisper_test_embeddings.csv",
    device="cuda",
)

In [None]:
import pandas as pd
import numpy as np

# Read the WavLM embedding table and the training labels. In both frames the
# first column is renamed to "filename" so it can serve as the join key.
audio_df = pd.read_csv("/kaggle/working/audio_wavlm_train_embeddings.csv")
audio_df = audio_df.rename(columns={audio_df.columns[0]: "filename"})

label_df = pd.read_csv("/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv")
label_df = label_df.rename(columns={label_df.columns[0]: "filename"})

# Remove the literal ".wav" from the embedding keys before joining.
audio_df["filename"] = audio_df["filename"].str.replace(".wav", "", regex=False)

# Inner join: only rows whose key appears in both frames survive.
df = audio_df.merge(label_df, on="filename", how="inner")

print("Total aligned samples:", len(df))


In [None]:
# Feature matrix: every column except the first ("filename") and the last one.
X = df.iloc[:, 1:-1].to_numpy().astype(np.float32)

# Regression target: the "label" column, as float32.
y = df["label"].to_numpy().astype(np.float32)

for tag, arr in (("X shape:", X), ("y shape:", y)):
    print(tag, arr.shape)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise features: statistics are learned from the training split only and
# then applied unchanged to the validation split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)


In [None]:
import tensorflow as tf

# Fully connected regressor over 768-dimensional input vectors, built layer by layer.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(768,)))
model.add(tf.keras.layers.Dense(256, activation="relu"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(128, activation="relu"))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(1))  # regression output

# Mean-squared-error objective; MAE is tracked as an additional metric.
adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=adam, loss="mse", metrics=["mae"])

model.summary()


In [None]:


# r2_score is used below; import it here so the cell works on a fresh kernel.
from sklearn.metrics import r2_score

# Train on the scaled training split and validate on the hold-out split created
# by the split/scale cell (X_val / y_val). The previous version referenced
# X_test / y_test, which this pipeline never defines.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    verbose=1
)


# evaluate() returns [loss, mae] because the model was compiled with loss="mse", metrics=["mae"].
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = model.evaluate(X_val, y_val, verbose=0)

print("\n=== LOSS & MAE ===")
print(f"Train Loss (MSE): {train_loss:.4f}")
print(f"Train MAE       : {train_mae:.4f}")
print(f"Test Loss (MSE) : {test_loss:.4f}")
print(f"Test MAE        : {test_mae:.4f}")


# squeeze() drops the trailing unit axis of the (N, 1) prediction array.
y_train_pred = model.predict(X_train).squeeze()
y_test_pred = model.predict(X_val).squeeze()

# Clamp predictions to the [0, 5] range before scoring.
y_train_pred = np.clip(y_train_pred, 0.0, 5.0)
y_test_pred = np.clip(y_test_pred, 0.0, 5.0)

def tolerance_accuracy(y_true, y_pred, tol=0.5):
    """Return the fraction of predictions whose absolute error is at most `tol`."""
    return np.mean(np.abs(y_true - y_pred) <= tol)

train_acc = tolerance_accuracy(y_train, y_train_pred)
test_acc = tolerance_accuracy(y_val, y_test_pred)

print("\n=== REGRESSION ACCURACY (±0.5) ===")
print(f"Train Accuracy: {train_acc * 100:.2f}%")
print(f"Test Accuracy : {test_acc * 100:.2f}%")


train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_val, y_test_pred)

print("\n=== R² SCORE ===")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R² : {test_r2:.4f}")

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score


# Read the Whisper embedding table and the training labels; the first column of
# each frame becomes the "filename" join key.
audio_df = pd.read_csv("/kaggle/working/audio_whisper_train_embeddings.csv")
audio_df = audio_df.rename(columns={audio_df.columns[0]: "filename"})

label_df = pd.read_csv("/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv")
label_df = label_df.rename(columns={label_df.columns[0]: "filename"})

# Remove the literal ".wav" from the embedding keys before joining.
audio_df["filename"] = audio_df["filename"].str.replace(".wav", "", regex=False)

df = audio_df.merge(label_df, on="filename", how="inner")
print("Total aligned samples:", len(df))

# Features are all columns between the key and the last column; the last column is the target.
X = df.iloc[:, 1:-1].to_numpy().astype(np.float32)
y = df.iloc[:, -1].to_numpy().astype(np.float32)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MLP regressor over 512-dimensional inputs: two tanh blocks with batch-norm and
# dropout, followed by a narrow tail and a single linear output unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(512,)))

model.add(tf.keras.layers.Dense(256, activation="tanh"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.25))

model.add(tf.keras.layers.Dense(128, activation="tanh"))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(0.15))

model.add(tf.keras.layers.Dense(64, activation="tanh"))
model.add(tf.keras.layers.Dense(8, activation="relu"))

model.add(tf.keras.layers.Dense(1))

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="mse",
    metrics=["mae"],
)

# The held-out split doubles as validation data during training.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=500,
    batch_size=24,
    verbose=1,
)

# evaluate() yields [mse, mae] given the compile() settings above.
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)

# RMSE is the square root of the reported MSE loss.
train_rmse, test_rmse = np.sqrt([train_loss, test_loss])

print("\n=== LOSS / MAE / RMSE ===")
print(f"Train MSE  : {train_loss:.4f}")
print(f"Train RMSE : {train_rmse:.4f}")
print(f"Train MAE  : {train_mae:.4f}")
print()
print(f"Test MSE   : {test_loss:.4f}")
print(f"Test RMSE  : {test_rmse:.4f}")
print(f"Test MAE   : {test_mae:.4f}")

# Predict, drop the trailing unit axis, and clamp to the [0, 5] range.
y_train_pred = np.clip(model.predict(X_train).squeeze(), 0.0, 5.0)
y_test_pred = np.clip(model.predict(X_test).squeeze(), 0.0, 5.0)

def tolerance_accuracy(y_true, y_pred, tol=0.5):
    """Return the share of predictions lying within `tol` of their target.

    A prediction counts as a hit when |y_true - y_pred| <= tol (boundary included).
    """
    abs_error = np.abs(y_true - y_pred)
    within_tol = abs_error <= tol
    return np.mean(within_tol)

# Fraction of (clipped) predictions within ±0.5 of the target, per split.
train_acc = tolerance_accuracy(y_train, y_train_pred)
test_acc = tolerance_accuracy(y_test, y_test_pred)

print("\n=== REGRESSION ACCURACY (±0.5) ===")
print(f"Train Accuracy: {train_acc * 100:.2f}%")
print(f"Test Accuracy : {test_acc * 100:.2f}%")

# Coefficient of determination on the same clipped predictions.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\n=== R² SCORE ===")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R² : {test_r2:.4f}")

# Persist the trained network in the native Keras format.
MODEL_PATH = "/kaggle/working/whisper_regression_model.keras"
model.save(MODEL_PATH)
print("Model saved at:", MODEL_PATH)


In [None]:
# Print the GPU status report for this session.
!nvidia-smi

In [26]:
import pandas as pd
import numpy as np
import joblib


# --- Paths -----------------------------------------------------------------
TEST_META_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/test.csv"
DEBERTA_TEST_EMB_PATH = "/kaggle/working/deberta_large_embeddings_test.csv"

XGB_MODEL_PATH = "/kaggle/working/xgb_deberta_grammar_model.pkl"
SUBMISSION_PATH = "/kaggle/working/submission.csv"

# --- Load test metadata and embeddings, keyed by a string "filename" column --
test_meta_df = pd.read_csv(TEST_META_PATH)
test_meta_df = test_meta_df.rename(columns={test_meta_df.columns[0]: "filename"})
test_meta_df["filename"] = test_meta_df["filename"].astype(str)

test_emb_df = pd.read_csv(DEBERTA_TEST_EMB_PATH)
test_emb_df = test_emb_df.rename(columns={test_emb_df.columns[0]: "filename"})
test_emb_df["filename"] = test_emb_df["filename"].astype(str)

# The metadata frame is the left side of the inner join.
test_df = test_meta_df.merge(test_emb_df, on="filename", how="inner")

print("Total aligned test samples:", len(test_df))
print(test_df.head())

# Everything after the key column is fed to the model as features.
X_test = test_df.iloc[:, 1:].to_numpy().astype(np.float32)
print("Test feature shape:", X_test.shape)

# --- Predict ---------------------------------------------------------------
xgb_model = joblib.load(XGB_MODEL_PATH)
print("Loaded XGBoost model from:", XGB_MODEL_PATH)

# Clamp to [0, 5], then snap each prediction to the nearest multiple of 0.5.
y_test_pred = np.clip(xgb_model.predict(X_test), 0.0, 5.0)
y_test_pred = np.round(y_test_pred * 2) / 2

# --- Write submission --------------------------------------------------------
submission_df = pd.DataFrame({"filename": test_df["filename"], "label": y_test_pred})
submission_df.to_csv(SUBMISSION_PATH, index=False)

print("Submission saved at:", SUBMISSION_PATH)
submission_df.head()


Total aligned test samples: 197
    filename        e0        e1        e2        e3        e4        e5  \
0  audio_141  0.099782 -0.205726 -0.112622 -0.078250  0.039959 -0.126085   
1  audio_114  0.103830 -0.134755  0.084457 -0.375298  0.066531  0.172808   
2   audio_17  0.350971 -0.368010  0.342656 -0.326491 -0.043754 -0.135610   
3   audio_76  0.009247 -0.307182  0.152034 -0.256583 -0.034348  0.111848   
4  audio_156  0.344951 -0.338885  0.458355 -0.212554  0.345985 -0.039210   

         e6        e7        e8  ...     e1014     e1015     e1016     e1017  \
0 -0.042891 -0.070192  0.095617  ...  0.000471 -0.426584  0.532851  0.185651   
1 -0.017575  0.014542 -0.073765  ...  0.076440 -0.126993  0.037520  0.398466   
2  0.119590 -0.076224 -0.032101  ... -0.350288 -0.345638  0.217359  0.514474   
3 -0.296002  0.021410 -0.087357  ... -0.173768 -0.363479  0.276646  0.292233   
4  0.080353 -0.010777 -0.035325  ... -0.307831 -0.081907  0.108643  0.190455   

      e1018     e1019     e102

Unnamed: 0,filename,label
0,audio_141,2.5
1,audio_114,4.0
2,audio_17,2.5
3,audio_76,4.5
4,audio_156,2.5


In [30]:
import pandas as pd
import numpy as np
import tensorflow as tf


# --- Paths -----------------------------------------------------------------
TEST_META_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/test.csv"
DEBERTA_TEST_EMB_PATH = "/kaggle/working/deberta_large_embeddings_test.csv"

BEST_MODEL_PATH = "/kaggle/working/best_doberta_transcript_regression_model.keras"
SUBMISSION_PATH = "/kaggle/working/submission.csv"

# --- Load test metadata and embeddings, keyed by a string "filename" column --
test_meta_df = pd.read_csv(TEST_META_PATH)
test_meta_df = test_meta_df.rename(columns={test_meta_df.columns[0]: "filename"})
test_meta_df["filename"] = test_meta_df["filename"].astype(str)

test_emb_df = pd.read_csv(DEBERTA_TEST_EMB_PATH)
test_emb_df = test_emb_df.rename(columns={test_emb_df.columns[0]: "filename"})
test_emb_df["filename"] = test_emb_df["filename"].astype(str)

# The metadata frame is the left side of the inner join.
test_df = test_meta_df.merge(test_emb_df, on="filename", how="inner")

print("Total aligned test samples:", len(test_df))
print(test_df.head())

# Everything after the key column is fed to the model as features.
X_test_final = test_df.iloc[:, 1:].to_numpy().astype(np.float32)
print("Test feature shape:", X_test_final.shape)

# --- Predict with the checkpointed Keras model --------------------------------
model = tf.keras.models.load_model(BEST_MODEL_PATH)
print("Loaded best model from:", BEST_MODEL_PATH)

# Drop the trailing unit axis, then clamp predictions to [0, 5].
y_test_pred = np.clip(model.predict(X_test_final).squeeze(), 0.0, 5.0)

# --- Write submission --------------------------------------------------------
submission_df = pd.DataFrame({"filename": test_df["filename"], "label": y_test_pred})
submission_df.to_csv(SUBMISSION_PATH, index=False)

print("Submission saved at:", SUBMISSION_PATH)
submission_df.head()


Total aligned test samples: 197
    filename        e0        e1        e2        e3        e4        e5  \
0  audio_141  0.099782 -0.205726 -0.112622 -0.078250  0.039959 -0.126085   
1  audio_114  0.103830 -0.134755  0.084457 -0.375298  0.066531  0.172808   
2   audio_17  0.350971 -0.368010  0.342656 -0.326491 -0.043754 -0.135610   
3   audio_76  0.009247 -0.307182  0.152034 -0.256583 -0.034348  0.111848   
4  audio_156  0.344951 -0.338885  0.458355 -0.212554  0.345985 -0.039210   

         e6        e7        e8  ...     e1014     e1015     e1016     e1017  \
0 -0.042891 -0.070192  0.095617  ...  0.000471 -0.426584  0.532851  0.185651   
1 -0.017575  0.014542 -0.073765  ...  0.076440 -0.126993  0.037520  0.398466   
2  0.119590 -0.076224 -0.032101  ... -0.350288 -0.345638  0.217359  0.514474   
3 -0.296002  0.021410 -0.087357  ... -0.173768 -0.363479  0.276646  0.292233   
4  0.080353 -0.010777 -0.035325  ... -0.307831 -0.081907  0.108643  0.190455   

      e1018     e1019     e102

Unnamed: 0,filename,label
0,audio_141,1.934675
1,audio_114,3.8856
2,audio_17,2.486921
3,audio_76,5.0
4,audio_156,3.454105


In [None]:
import whisper
import os
import pandas as pd
from tqdm import tqdm


# Directory with the clips to transcribe and the CSV the transcripts are written to.
AUDIO_DIR = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/audios/test"
OUTPUT_CSV = "/kaggle/working/audio_test_transcripts.csv"

WHISPER_MODEL_SIZE = "base"   # tiny | base | small | medium | large
LANGUAGE = "en"


print("Loading Whisper model...")
model = whisper.load_model(WHISPER_MODEL_SIZE)
print("Whisper model loaded")


rows = []

# Case-insensitive match on the extension, processed in sorted order.
audio_files = sorted(
    f for f in os.listdir(AUDIO_DIR)
    if f.lower().endswith(".wav")
)

print(f"Found {len(audio_files)} audio files")

for fname in tqdm(audio_files):
    audio_path = os.path.join(AUDIO_DIR, fname)

    try:
        result = model.transcribe(
            audio_path,
            language=LANGUAGE,
            fp16=False
        )

        transcript = result["text"].strip()

    except Exception as e:
        # Best effort: report the failure and keep an empty transcript so the
        # clip still gets a row in the output.
        print(f"Error processing {fname}: {e}")
        transcript = ""

    rows.append({
        # Strip the extension only (audio_1.wav -> audio_1). splitext also handles
        # upper-case ".WAV", which the filter above accepts, and never touches a
        # ".wav" occurring in the middle of a name.
        "filename": os.path.splitext(fname)[0],
        "transcript": transcript
    })

# Fixing the columns keeps the CSV header intact even when no audio was found.
df = pd.DataFrame(rows, columns=["filename", "transcript"])
df.to_csv(OUTPUT_CSV, index=False)

print("\nSaved transcripts to:", OUTPUT_CSV)
df.head()


In [None]:
# Install/upgrade the openai-whisper package.
# NOTE(review): this cell appears after the cell that does `import whisper`;
# on a fresh kernel it has to be executed before that one.
!pip install -U openai-whisper


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


# --- Configuration -----------------------------------------------------------
INPUT_CSV = "/kaggle/working/audio_train_transcripts.csv"
OUTPUT_CSV = "/kaggle/working/bert_embeddings.csv"

MODEL_NAME = "bert-base-uncased"
MAX_LEN = 128
BATCH_SIZE = 16

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# --- Transcripts: missing values become empty strings ------------------------
df = pd.read_csv(INPUT_CSV)
df = df.assign(transcript=df["transcript"].fillna(""))

print("Total transcripts:", len(df))

# --- Tokenizer and encoder, switched to inference mode -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()


def get_bert_embeddings(texts):
    """Embed a batch of texts with the module-level tokenizer and model.

    The texts are padded to a common length and truncated to MAX_LEN tokens.
    The vector at sequence position 0 of the last hidden state (the CLS token)
    is returned for each text as a NumPy array on the CPU.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: no gradients are tracked.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    first_token = outputs.last_hidden_state[:, 0, :]
    return first_token.cpu().numpy()


# Embed the transcripts in consecutive slices of BATCH_SIZE rows.
all_embeddings = [
    get_bert_embeddings(df["transcript"].iloc[start:start + BATCH_SIZE].tolist())
    for start in tqdm(range(0, len(df), BATCH_SIZE))
]

# Stack the per-batch arrays into one (N, hidden) matrix.
X = np.vstack(all_embeddings)

print("Embedding shape:", X.shape)  # (N, 768)

# One column per embedding dimension: e0, e1, ...
embedding_columns = [f"e{i}" for i in range(X.shape[1])]
embedding_df = pd.DataFrame(X, columns=embedding_columns)

# Put the filename key in front; the index is reset so rows line up positionally.
filename_col = df[["filename"]].reset_index(drop=True)
final_df = pd.concat([filename_col, embedding_df], axis=1)

final_df.to_csv(OUTPUT_CSV, index=False)

print("Saved BERT embeddings to:", OUTPUT_CSV)
final_df.head()


In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


# --- Configuration -----------------------------------------------------------
INPUT_CSV = "/kaggle/working/audio_test_transcripts.csv"
OUTPUT_CSV = "/kaggle/working/deberta_large_embeddings_test.csv"

MODEL_NAME = "microsoft/deberta-v3-large"
MAX_LEN = 256
BATCH_SIZE = 16

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# --- Transcripts: missing values become empty strings, everything cast to str --
df = pd.read_csv(INPUT_CSV)
df = df.assign(transcript=df["transcript"].fillna("").astype(str))

print("Total transcripts:", len(df))

# --- Tokenizer and encoder, switched to inference mode -----------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()


def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors over the sequence axis, ignoring masked positions.

    last_hidden_state: (B, T, H)
    attention_mask:   (B, T)

    Returns a (B, H) tensor. The divisor is clamped to at least 1e-9, so a row
    whose mask is entirely zero yields zeros instead of a division by zero.
    """
    # (B, T) -> (B, T, 1); broadcasting applies the weight to every hidden unit.
    weights = attention_mask[..., None].float()
    weighted_sum = (last_hidden_state * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts


def get_deberta_embeddings(texts):
    """Embed a batch of texts with the module-level tokenizer and model.

    The texts are padded to a common length and truncated to MAX_LEN tokens.
    The last hidden state is mean-pooled over unmasked tokens via mean_pooling()
    and returned as a NumPy array on the CPU, one row per text.
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only: no gradients are tracked for the forward pass.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    pooled = mean_pooling(outputs.last_hidden_state, attention_mask)
    return pooled.cpu().numpy()


# Collect one embedding array per batch of transcripts.
all_embeddings = []

# Walk the transcripts in consecutive slices of BATCH_SIZE rows.
for i in tqdm(range(0, len(df), BATCH_SIZE)):
    batch_texts = df["transcript"].iloc[i:i + BATCH_SIZE].tolist()
    emb = get_deberta_embeddings(batch_texts)
    all_embeddings.append(emb)

# Stack the per-batch arrays row-wise into a single matrix.
X = np.vstack(all_embeddings)

print("Embedding shape:", X.shape)  # (N, hidden_size); the recorded run printed (197, 1024)


# One column per embedding dimension: e0, e1, ...
embedding_df = pd.DataFrame(
    X,
    columns=[f"e{i}" for i in range(X.shape[1])]
)

# Prepend the filename key; the index is reset so rows line up positionally.
final_df = pd.concat(
    [df[["filename"]].reset_index(drop=True), embedding_df],
    axis=1
)

final_df.to_csv(OUTPUT_CSV, index=False)

print("Saved DeBERTa embeddings to:", OUTPUT_CSV)
final_df.head()


Total transcripts: 197


100%|██████████| 13/13 [00:04<00:00,  2.85it/s]


Embedding shape: (197, 1024)
Saved DeBERTa embeddings to: /kaggle/working/deberta_large_embeddings_test.csv


Unnamed: 0,filename,e0,e1,e2,e3,e4,e5,e6,e7,e8,...,e1014,e1015,e1016,e1017,e1018,e1019,e1020,e1021,e1022,e1023
0,audio_1,0.224947,-0.405848,0.57334,-0.407065,0.052722,-0.11358,-0.011057,0.003411,0.022139,...,-0.308698,-0.184177,-0.005291,0.221239,0.046818,-0.1471,-0.068861,0.028528,-0.366116,0.035095
1,audio_10,0.098477,-0.245417,0.524971,-0.221284,0.003748,0.163754,-0.007403,-0.059899,-0.02132,...,-0.375762,-0.400642,0.059986,0.240107,-0.457293,-0.104484,-0.158275,-0.001369,-0.580557,-0.011623
2,audio_100,0.447721,0.059997,0.664966,-0.545253,0.117874,-0.034652,0.106867,0.08844,-0.086432,...,-0.334274,-0.098004,-0.278957,0.342436,-0.229205,-0.052819,-0.075702,0.044247,-0.004279,0.133923
3,audio_101,0.369042,-0.323718,0.553655,-0.178902,0.304813,-0.024624,0.211365,-0.026503,0.096263,...,-0.178587,-0.203587,-0.109451,0.45222,-0.259855,-0.051301,-0.25991,0.026615,-0.409907,-0.110399
4,audio_102,0.14288,-0.199646,0.271711,-0.457679,-0.004549,0.331301,0.008655,0.123272,0.190292,...,-0.08954,-0.228065,-0.062575,0.403322,-0.185059,-0.184725,-0.050202,-0.076124,-0.008634,0.359956


In [34]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import ModelCheckpoint


# Read the DeBERTa embedding table and the training labels; the first column of
# each frame becomes the "filename" join key.
bert_df = pd.read_csv("/kaggle/working/deberta_large_embeddings.csv")
bert_df = bert_df.rename(columns={bert_df.columns[0]: "filename"})

label_df = pd.read_csv(
    "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"
)
label_df = label_df.rename(columns={label_df.columns[0]: "filename"})

# Cast the embedding keys to str before joining.
bert_df["filename"] = bert_df["filename"].astype(str)

df = bert_df.merge(label_df, on="filename", how="inner")
print("Total aligned samples:", len(df))

# Features are all columns between the key and the last column; the last column is the target.
X = df.iloc[:, 1:-1].to_numpy().astype(np.float32)
y = df.iloc[:, -1].to_numpy().astype(np.float32)

print("X shape:", X.shape)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


import tensorflow as tf

# 1-D convolutional regressor over a 1024-dimensional embedding.
# The vector is reshaped to (1024, 1) so Conv1D slides along the embedding axis.
# Variants the author had disabled (a dense-only stack, an extra 256-filter
# Conv1D block in front, and wider dense heads) are not part of this model.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Input(shape=(1024,)))
model.add(tf.keras.layers.Reshape((1024, 1)))

# Convolution block: 512 filters, width 5, swish activation, then BN and 2x max-pooling.
model.add(
    tf.keras.layers.Conv1D(filters=512, kernel_size=5, activation="swish", padding="same")
)
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))

# Dense head ending in a single linear output unit.
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation="relu"))
model.add(tf.keras.layers.Dense(8, activation="relu"))
model.add(tf.keras.layers.Dense(1))

model.summary()



BEST_MODEL_PATH = "/kaggle/working/best_doberta_transcript_regression_model.keras"

# Write the full model to BEST_MODEL_PATH whenever validation loss reaches a new minimum.
checkpoint_cb = ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
    loss="mse",
    metrics=["mae"],
)

# The held-out split doubles as validation data (and drives the checkpoint).
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=8,
    callbacks=[checkpoint_cb],
    verbose=1,
)

# Score the checkpointed model rather than the final-epoch state of `model`.
best_model = tf.keras.models.load_model(BEST_MODEL_PATH)
print("Loaded best model from:", BEST_MODEL_PATH)

# evaluate() yields [mse, mae] given the compile() settings above.
train_loss, train_mae = best_model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = best_model.evaluate(X_test, y_test, verbose=0)

# RMSE is the square root of the reported MSE loss.
train_rmse, test_rmse = np.sqrt([train_loss, test_loss])

print("\n=== LOSS / MAE / RMSE ===")
print(f"Train MSE  : {train_loss:.4f}")
print(f"Train RMSE : {train_rmse:.4f}")
print(f"Train MAE  : {train_mae:.4f}")
print()
print(f"Test MSE   : {test_loss:.4f}")
print(f"Test RMSE  : {test_rmse:.4f}")
print(f"Test MAE   : {test_mae:.4f}")

# Predict, drop the trailing unit axis, and clamp to the [0, 5] range.
y_train_pred = np.clip(best_model.predict(X_train).squeeze(), 0.0, 5.0)
y_test_pred = np.clip(best_model.predict(X_test).squeeze(), 0.0, 5.0)

def tolerance_accuracy(y_true, y_pred, tol=0.5):
    """Fraction of predictions whose absolute error does not exceed `tol`."""
    is_hit = np.abs(y_true - y_pred) <= tol
    return np.mean(is_hit)

# Fraction of (clipped) predictions within ±0.5 of the target, per split.
train_acc = tolerance_accuracy(y_train, y_train_pred)
test_acc = tolerance_accuracy(y_test, y_test_pred)

print("\n=== REGRESSION ACCURACY (±0.5) ===")
print(f"Train Accuracy: {train_acc * 100:.2f}%")
print(f"Test Accuracy : {test_acc * 100:.2f}%")

# Coefficient of determination on the same clipped predictions.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\n=== R² SCORE ===")
print(f"Train R²: {train_r2:.4f}")
print(f"Test R² : {test_r2:.4f}")

# NOTE(review): this saves `model` (its state after the last epoch), not the
# checkpointed `best_model` the metrics above were computed with — confirm intent.
MODEL_PATH = "/kaggle/working/bert_transcript_regression_model.keras"
model.save(MODEL_PATH)

print("Model saved at:", MODEL_PATH)


Total aligned samples: 409
X shape: (409, 1024)


Epoch 1/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 1726.9971 - mae: 18.3049
Epoch 1: val_loss improved from inf to 21487.07812, saving model to /kaggle/working/best_doberta_transcript_regression_model.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 169ms/step - loss: 1699.1874 - mae: 18.0573 - val_loss: 21487.0781 - val_mae: 146.5798
Epoch 2/100
[1m38/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 13ms/step - loss: 33.3819 - mae: 4.2469
Epoch 2: val_loss improved from 21487.07812 to 12094.14941, saving model to /kaggle/working/best_doberta_transcript_regression_model.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 140ms/step - loss: 31.5903 - mae: 4.1491 - val_loss: 12094.1494 - val_mae: 109.9452
Epoch 3/100
[1m38/41[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 12ms/step - loss: 8.9636 - mae: 2.8877
Epoch 3: val_loss improved from 12094.14941 to 9118.83398, saving model to /kagg

In [25]:
import pandas as pd
import numpy as np
import joblib

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# --- Paths and seed ----------------------------------------------------------
EMB_PATH = "/kaggle/working/deberta_large_embeddings.csv"
LABEL_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"

MODEL_PATH = "/kaggle/working/xgb_deberta_grammar_model.pkl"

RANDOM_STATE = 42

# --- Load embeddings and labels; the first column of each is the join key ------
emb_df = pd.read_csv(EMB_PATH)
emb_df = emb_df.rename(columns={emb_df.columns[0]: "filename"})

label_df = pd.read_csv(LABEL_PATH)
label_df = label_df.rename(columns={label_df.columns[0]: "filename"})

# Cast the embedding keys to str before joining.
emb_df["filename"] = emb_df["filename"].astype(str)

df = emb_df.merge(label_df, on="filename", how="inner")
print("Total aligned samples:", len(df))

# DeBERTa embedding columns sit between the key and the last column;
# the last column holds the grammar score (0–5).
X = df.iloc[:, 1:-1].to_numpy().astype(np.float32)
y = df.iloc[:, -1].to_numpy().astype(np.float32)

print("X shape:", X.shape)

# 80/20 train/validation split, seeded with RANDOM_STATE.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


def tolerance_accuracy(y_true, y_pred, tol=0.5):
    """Proportion of samples with |y_true - y_pred| <= tol (boundary counts as a hit)."""
    deviation = np.abs(y_true - y_pred)
    return np.mean(deviation <= tol)


# Gradient-boosted regressor on the embedding features.
# early_stopping_rounds is given to the constructor: passing it to fit() is
# deprecated in the scikit-learn wrapper and rejected by newer XGBoost releases.
# The seed reuses RANDOM_STATE so the split and the model share one setting.
xgb_model = XGBRegressor(
    n_estimators=1500,
    max_depth=8,
    learning_rate=0.01,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=5,
    gamma=0.2,
    reg_alpha=0.1,
    objective="reg:squarederror",
    early_stopping_rounds=40,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# The validation split is the evaluation set that early stopping monitors.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)


# Predict on both splits and clamp to the [0, 5] range.
y_train_pred = np.clip(xgb_model.predict(X_train), 0.0, 5.0)
y_val_pred = np.clip(xgb_model.predict(X_val), 0.0, 5.0)

# RMSE = sqrt(MSE)
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
train_rmse = np.sqrt(train_mse)
val_rmse = np.sqrt(val_mse)

# Coefficient of determination
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)

# Share of predictions within ±0.5 of the target
train_acc = tolerance_accuracy(y_train, y_train_pred)
val_acc = tolerance_accuracy(y_val, y_val_pred)

print("\n=== XGBOOST PERFORMANCE ===")
print(f"Train RMSE           : {train_rmse:.4f}")
print(f"Validation RMSE      : {val_rmse:.4f}")
print(f"Train R²             : {train_r2:.4f}")
print(f"Validation R²        : {val_r2:.4f}")
print(f"Train Accuracy (±0.5): {train_acc * 100:.2f}%")
print(f"Val Accuracy (±0.5)  : {val_acc * 100:.2f}%")

# Serialise the fitted estimator with joblib.
joblib.dump(xgb_model, MODEL_PATH)
print("\nXGBoost model saved at:", MODEL_PATH)


Total aligned samples: 409
X shape: (409, 1024)




[0]	validation_0-rmse:0.74968
[1]	validation_0-rmse:0.74585
[2]	validation_0-rmse:0.74254
[3]	validation_0-rmse:0.73916
[4]	validation_0-rmse:0.73576
[5]	validation_0-rmse:0.73203
[6]	validation_0-rmse:0.72815
[7]	validation_0-rmse:0.72495
[8]	validation_0-rmse:0.72090
[9]	validation_0-rmse:0.71834
[10]	validation_0-rmse:0.71419
[11]	validation_0-rmse:0.71078
[12]	validation_0-rmse:0.70740
[13]	validation_0-rmse:0.70404
[14]	validation_0-rmse:0.70052
[15]	validation_0-rmse:0.69676
[16]	validation_0-rmse:0.69441
[17]	validation_0-rmse:0.69079
[18]	validation_0-rmse:0.68804
[19]	validation_0-rmse:0.68510
[20]	validation_0-rmse:0.68140
[21]	validation_0-rmse:0.67810
[22]	validation_0-rmse:0.67528
[23]	validation_0-rmse:0.67210
[24]	validation_0-rmse:0.66890
[25]	validation_0-rmse:0.66520
[26]	validation_0-rmse:0.66220
[27]	validation_0-rmse:0.66012
[28]	validation_0-rmse:0.65761
[29]	validation_0-rmse:0.65534
[30]	validation_0-rmse:0.65261
[31]	validation_0-rmse:0.65039
[32]	validation_0-

In [39]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


# --- Configuration -----------------------------------------------------------
INPUT_CSV = "/kaggle/working/audio_train_transcripts.csv"

OUT_MPNet = "/kaggle/working/audio_sbert_mpnet_embeddings.csv"
OUT_MiniLM = "/kaggle/working/audio_sbert_minilm_embeddings.csv"
OUT_FUSED = "/kaggle/working/audio_sbert_fused_embeddings.csv"

BATCH_SIZE = 32


def _with_filenames(vectors, columns):
    """Return (vector frame, same frame with a leading "filename" column)."""
    vec_df = pd.DataFrame(vectors, columns=columns)
    keyed_df = pd.concat([pd.Series(filenames, name="filename"), vec_df], axis=1)
    return vec_df, keyed_df


# --- Transcripts: missing values become empty strings ------------------------
df = pd.read_csv(INPUT_CSV)
df["transcript"] = df["transcript"].fillna("")

sentences = df["transcript"].tolist()
filenames = df["filename"].tolist()

print("Total transcripts:", len(sentences))

# --- Load both sentence encoders ----------------------------------------------
print("Loading MPNet model...")
mpnet_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

print("Loading MiniLM model...")
minilm_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# --- Encode: NumPy output, L2-normalised, with a progress bar ----------------
print("Extracting MPNet embeddings...")
mpnet_embeddings = mpnet_model.encode(
    sentences,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

print("Extracting MiniLM embeddings...")
minilm_embeddings = minilm_model.encode(
    sentences,
    batch_size=BATCH_SIZE,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

print("MPNet shape :", mpnet_embeddings.shape)   # (N, 768)
print("MiniLM shape:", minilm_embeddings.shape)  # (N, 384)

# --- Save each embedding table with the filename key in front ------------------
mpnet_df, mpnet_out = _with_filenames(
    mpnet_embeddings,
    [f"mpnet_{i}" for i in range(mpnet_embeddings.shape[1])],
)
mpnet_out.to_csv(OUT_MPNet, index=False)
print("Saved MPNet embeddings →", OUT_MPNet)

minilm_df, minilm_out = _with_filenames(
    minilm_embeddings,
    [f"minilm_{i}" for i in range(minilm_embeddings.shape[1])],
)
minilm_out.to_csv(OUT_MiniLM, index=False)
print("Saved MiniLM embeddings →", OUT_MiniLM)

# --- Fused table: MPNet columns followed by MiniLM columns ---------------------
fused_embeddings = np.hstack([mpnet_embeddings, minilm_embeddings])

fused_df, fused_out = _with_filenames(
    fused_embeddings,
    [f"fused_{i}" for i in range(fused_embeddings.shape[1])],
)
fused_out.to_csv(OUT_FUSED, index=False)
print("Saved FUSED embeddings →", OUT_FUSED)

print("\nFinal fused embedding dimension:", fused_embeddings.shape[1])


Total transcripts: 409
Loading MPNet model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading MiniLM model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Extracting MPNet embeddings...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

Extracting MiniLM embeddings...


Batches:   0%|          | 0/13 [00:00<?, ?it/s]

MPNet shape : (409, 768)
MiniLM shape: (409, 384)
Saved MPNet embeddings → /kaggle/working/audio_sbert_mpnet_embeddings.csv
Saved MiniLM embeddings → /kaggle/working/audio_sbert_minilm_embeddings.csv
Saved FUSED embeddings → /kaggle/working/audio_sbert_fused_embeddings.csv

Final fused embedding dimension: 1152


In [43]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


def load_and_align_data(
    whisper_path,
    mpnet_path,
    label_path
):
    """Load two embedding CSVs and a label CSV and inner-join them on "filename".

    In each file the first column is renamed to "filename". The literal ".wav"
    is removed from the keys of the first (whisper) table only. The result has
    the whisper columns first, then the columns of the second table, then the
    label columns; only keys present in all three files are kept.
    """
    def _read_keyed(csv_path):
        # Read one CSV and expose its first column under the shared key name.
        frame = pd.read_csv(csv_path)
        return frame.rename(columns={frame.columns[0]: "filename"})

    whisper_df = _read_keyed(whisper_path)
    mpnet_df = _read_keyed(mpnet_path)
    label_df = _read_keyed(label_path)

    whisper_df["filename"] = whisper_df["filename"].str.replace(".wav", "", regex=False)

    embeddings = whisper_df.merge(mpnet_df, on="filename", how="inner")
    return embeddings.merge(label_df, on="filename", how="inner")


def build_whisper_branch(input_dim):
    """Create the audio-side sub-network of the fusion model.

    Args:
        input_dim: Number of features in one audio embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``: the Keras input named
        ``"whisper_input"`` and the output of the branch's final 128-unit layer.
    """
    layers = tf.keras.layers
    whisper_in = layers.Input(shape=(input_dim,), name="whisper_input")

    # Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128)
    hidden = layers.Dense(256, activation="relu")(whisper_in)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    features = layers.Dense(128, activation="relu")(hidden)

    return whisper_in, features


def build_mpnet_branch(input_dim):
    """Create the text-side sub-network of the fusion model.

    Args:
        input_dim: Number of features in one text embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``: the Keras input named
        ``"mpnet_input"`` and the output of the branch's final 128-unit layer.
    """
    layers = tf.keras.layers
    text_in = layers.Input(shape=(input_dim,), name="mpnet_input")

    # Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128)
    hidden = layers.Dense(256, activation="relu")(text_in)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    features = layers.Dense(128, activation="relu")(hidden)

    return text_in, features


def build_fusion_model(whisper_dim, mpnet_dim):
    """Assemble and compile the two-input late-fusion regressor.

    The outputs of the audio and text branches are concatenated and passed
    through Dense(128) -> BatchNorm -> Dropout(0.3) -> Dense(64) -> Dense(1).

    Args:
        whisper_dim: Width of the audio embedding input.
        mpnet_dim: Width of the text embedding input.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-3, MSE loss, MAE metric)
        taking ``[whisper_input, mpnet_input]`` and emitting one value per row.
    """
    layers = tf.keras.layers

    audio_in, audio_feat = build_whisper_branch(whisper_dim)
    text_in, text_feat = build_mpnet_branch(mpnet_dim)

    head = layers.Concatenate()([audio_feat, text_feat])
    head = layers.Dense(128, activation="relu")(head)
    head = layers.BatchNormalization()(head)
    head = layers.Dropout(0.3)(head)
    head = layers.Dense(64, activation="relu")(head)

    # Linear output unit: the target is a continuous score.
    prediction = layers.Dense(1)(head)

    fusion = tf.keras.Model(inputs=[audio_in, text_in], outputs=prediction)
    fusion.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="mse",
        metrics=["mae"],
    )
    return fusion


# Input artefacts for this training run.
WHISPER_EMB = "/kaggle/working/audio_whisper_train_embeddings.csv"
# NOTE: despite the MPNET_* name, this path points at the DeBERTa-large embedding CSV.
MPNET_EMB   = "/kaggle/working/deberta_large_embeddings.csv"
LABELS     = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"

# Inner-join audio embeddings, text embeddings and labels on filename.
df = load_and_align_data(WHISPER_EMB, MPNET_EMB, LABELS)
print("Total aligned samples:", len(df))

# The merged frame is sliced by position: column 0 is the filename, columns 1..512
# are taken as the audio block, the last column as the target, and everything in
# between as the text-embedding block.
# assumes the label CSV contributes exactly one column besides filename — TODO confirm
X_whisper = df.iloc[:, 1:513].values.astype(np.float32)   # 512 audio columns
X_mpnet   = df.iloc[:, 513:-1].values.astype(np.float32) # text columns; width comes from the CSV (DeBERTa-large here, presumably 1024-d, not 768-d)
y         = df.iloc[:, -1].values.astype(np.float32)

# Fixed-seed 80/20 hold-out split; both feature blocks and the target are split together
# so rows stay aligned.
Xw_tr, Xw_te, Xm_tr, Xm_te, y_tr, y_te = train_test_split(
    X_whisper, X_mpnet, y,
    test_size=0.2,
    random_state=42
)

# Branch input widths are read from the data rather than hard-coded.
model = build_fusion_model(
    whisper_dim=Xw_tr.shape[1],
    mpnet_dim=Xm_tr.shape[1]
)

model.summary()


# The hold-out split doubles as the Keras validation set. There are no callbacks,
# so training always runs the full 300 epochs and `model` ends with last-epoch weights.
history = model.fit(
    {"whisper_input": Xw_tr, "mpnet_input": Xm_tr},
    y_tr,
    validation_data=(
        {"whisper_input": Xw_te, "mpnet_input": Xm_te},
        y_te
    ),
    epochs=300,
    batch_size=24,
    verbose=1
)


def evaluate_regression(model, Xw, Xm, y_true, split=""):
    """Predict with the fusion model and print RMSE, MAE and R² for one split.

    Predictions are squeezed and clipped to ``[0.0, 5.0]`` before any metric
    is computed.

    Args:
        model: Object whose ``predict`` accepts a dict keyed by
            ``"whisper_input"`` and ``"mpnet_input"``.
        Xw: Audio features for the split.
        Xm: Text features for the split.
        y_true: Target values for the split.
        split: Label printed in the metrics header.

    Returns:
        The clipped predictions.
    """
    model_inputs = {"whisper_input": Xw, "mpnet_input": Xm}
    raw_pred = model.predict(model_inputs)
    y_pred = np.clip(raw_pred.squeeze(), 0.0, 5.0)

    # One residual vector feeds both error metrics.
    residual = y_true - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    mae = np.mean(np.abs(residual))
    r2 = r2_score(y_true, y_pred)

    print(f"\n=== {split} METRICS ===")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R²  : {r2:.4f}")

    return y_pred


# Report metrics on both splits; the returned predictions are discarded.
_ = evaluate_regression(model, Xw_tr, Xm_tr, y_tr, "TRAIN")
_ = evaluate_regression(model, Xw_te, Xm_te, y_te, "TEST")


# Persist the model as it stands after the final epoch (no checkpointing was used).
MODEL_PATH = "/kaggle/working/whisper_mpnet_fusion_model.keras"
model.save(MODEL_PATH)

print("\nModel saved at:", MODEL_PATH)


Total aligned samples: 409


Epoch 1/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 231ms/step - loss: 4.9239 - mae: 1.9272 - val_loss: 8.2273 - val_mae: 2.7658
Epoch 2/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1.7016 - mae: 1.0627 - val_loss: 6.6659 - val_mae: 2.4670
Epoch 3/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1.0863 - mae: 0.7878 - val_loss: 4.9313 - val_mae: 2.0738
Epoch 4/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.8968 - mae: 0.7304 - val_loss: 5.5142 - val_mae: 2.2153
Epoch 5/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.7518 - mae: 0.6791 - val_loss: 4.4876 - val_mae: 1.9706
Epoch 6/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5926 - mae: 0.6067 - val_loss: 3.6441 - val_mae: 1.7473
Epoch 7/300
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss:

# Main code: Whisper + DeBERTa fusion model (training, then test-set submission)

In [38]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
def load_and_align_data(whisper_path, mpnet_path, label_path):
    """Load two embedding tables and a label table and inner-join them on filename.

    The first column of every CSV is used as the join key, whatever its header
    is; it is renamed to ``"filename"``. Keys are cast to ``str`` and a single
    trailing ``.wav`` extension is removed in all three tables, so the tables
    line up whether or not a given CSV kept the extension.

    Args:
        whisper_path: Path to the CSV holding the audio (Whisper) embeddings.
        mpnet_path: Path to the CSV holding the text embeddings.
        label_path: Path to the CSV holding the labels.

    Returns:
        A DataFrame whose columns are ``filename``, then the remaining columns
        of the Whisper table, the text-embedding table and the label table, in
        that order. Only keys present in all three tables are kept.
    """
    def _read_keyed(path):
        # Read one table and normalise its key column.
        frame = pd.read_csv(path)
        frame = frame.rename(columns={frame.columns[0]: "filename"})
        # astype(str) keeps the .str accessor usable if the ids parsed as numbers.
        # The anchored pattern strips only a trailing extension; a plain
        # substring replace would also eat ".wav" in the middle of a name.
        frame["filename"] = (
            frame["filename"].astype(str).str.replace(r"\.wav$", "", regex=True)
        )
        return frame

    whisper_df = _read_keyed(whisper_path)
    mpnet_df = _read_keyed(mpnet_path)
    label_df = _read_keyed(label_path)

    df = (
        whisper_df
        .merge(mpnet_df, on="filename", how="inner")
        .merge(label_df, on="filename", how="inner")
    )

    return df


def build_whisper_branch_no_cnn(input_dim):
    """Dense-only audio branch: Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128).

    Args:
        input_dim: Number of features in one audio embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``; the input is named ``"whisper_input"``.
    """
    layers = tf.keras.layers
    whisper_in = layers.Input(shape=(input_dim,), name="whisper_input")

    hidden = layers.Dense(256, activation="relu")(whisper_in)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    features = layers.Dense(128, activation="relu")(hidden)

    return whisper_in, features


def build_mpnet_branch_no_cnn(input_dim):
    """Dense-only text branch: Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128).

    Args:
        input_dim: Number of features in one text embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``; the input is named ``"mpnet_input"``.
    """
    layers = tf.keras.layers
    text_in = layers.Input(shape=(input_dim,), name="mpnet_input")

    hidden = layers.Dense(256, activation="relu")(text_in)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    features = layers.Dense(128, activation="relu")(hidden)

    return text_in, features

def build_whisper_branch_cnn(input_dim):
    """Convolutional audio branch operating on the embedding as a 1-channel sequence.

    Layout: Reshape to ``(input_dim, 1)`` -> Conv1D(64, kernel 3, same padding)
    -> BatchNorm -> MaxPool(2) -> Flatten -> Dense(128).

    Args:
        input_dim: Number of features in one audio embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``; the input is named ``"whisper_input"``.
    """
    layers = tf.keras.layers

    whisper_in = layers.Input(shape=(input_dim,), name="whisper_input")
    # Add a trailing channel axis so Conv1D can slide along the embedding dimensions.
    seq = layers.Reshape((input_dim, 1))(whisper_in)

    # Single conv block (only one block is active in this branch).
    conv = layers.Conv1D(
        filters=64, kernel_size=3, activation="relu", padding="same"
    )
    seq = conv(seq)
    seq = layers.BatchNormalization()(seq)
    seq = layers.MaxPooling1D(pool_size=2)(seq)

    flat = layers.Flatten()(seq)
    features = layers.Dense(128, activation="relu")(flat)

    return whisper_in, features


def build_mpnet_branch_cnn(input_dim):
    """Convolutional text branch operating on the embedding as a 1-channel sequence.

    Layout: Reshape to ``(input_dim, 1)`` -> Conv1D(64, kernel 3, same padding)
    -> BatchNorm -> MaxPool(2) -> Flatten -> Dense(64) -> Dense(8).

    Args:
        input_dim: Number of features in one text embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``; the input is named ``"mpnet_input"``.
    """
    layers = tf.keras.layers

    text_in = layers.Input(shape=(input_dim,), name="mpnet_input")
    # Add a trailing channel axis so Conv1D can slide along the embedding dimensions.
    seq = layers.Reshape((input_dim, 1))(text_in)

    conv = layers.Conv1D(
        filters=64, kernel_size=3, activation="relu", padding="same"
    )
    seq = conv(seq)
    seq = layers.BatchNormalization()(seq)
    seq = layers.MaxPooling1D(pool_size=2)(seq)

    flat = layers.Flatten()(seq)

    # Two-step squeeze down to an 8-unit text representation.
    narrowed = layers.Dense(64, activation="relu")(flat)
    features = layers.Dense(8, activation="relu")(narrowed)

    return text_in, features



def build_fusion_model(whisper_dim, mpnet_dim):
    """Assemble and compile the CNN-branch late-fusion regressor.

    The two convolutional branch outputs are concatenated and passed through
    Dense(64) -> Dense(8) -> Dense(1).

    Args:
        whisper_dim: Width of the audio embedding input.
        mpnet_dim: Width of the text embedding input.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-3, MSE loss, MAE metric)
        taking ``[whisper_input, mpnet_input]`` and emitting one value per row.
    """
    layers = tf.keras.layers

    audio_in, audio_feat = build_whisper_branch_cnn(whisper_dim)
    text_in, text_feat = build_mpnet_branch_cnn(mpnet_dim)

    head = layers.Concatenate()([audio_feat, text_feat])
    head = layers.Dense(64, activation="relu")(head)
    head = layers.Dense(8, activation="relu")(head)

    # Linear output unit: the target is a continuous score.
    prediction = layers.Dense(1)(head)

    fusion = tf.keras.Model(inputs=[audio_in, text_in], outputs=prediction)
    fusion.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="mse",
        metrics=["mae"],
    )
    return fusion


def tolerance_accuracy(y_true, y_pred, tol=0.5):
    """Return the fraction of predictions whose absolute error is at most ``tol``.

    Args:
        y_true: Array of target values.
        y_pred: Array of predicted values, broadcastable against ``y_true``.
        tol: Largest absolute error still counted as a hit (inclusive).

    Returns:
        The mean of the boolean hit mask, i.e. a value in ``[0, 1]``.
    """
    abs_error = np.abs(y_true - y_pred)
    within_tol = abs_error <= tol
    return np.mean(within_tol)


def evaluate_regression(model, Xw, Xm, y_true, split=""):
    """Predict silently and print RMSE, MAE, R² and ±0.5 accuracy for one split.

    Predictions are squeezed and clipped to ``[0.0, 5.0]`` before any metric
    is computed.

    Args:
        model: Object whose ``predict`` accepts a dict keyed by
            ``"whisper_input"`` and ``"mpnet_input"``.
        Xw: Audio features for the split.
        Xm: Text features for the split.
        y_true: Target values for the split.
        split: Label printed in the metrics header.

    Returns:
        The clipped predictions.
    """
    model_inputs = {"whisper_input": Xw, "mpnet_input": Xm}
    raw_pred = model.predict(model_inputs, verbose=0)
    y_pred = np.clip(raw_pred.squeeze(), 0.0, 5.0)

    # One residual vector feeds both error metrics.
    residual = y_true - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    mae = np.mean(np.abs(residual))
    r2 = r2_score(y_true, y_pred)
    # Uses tolerance_accuracy's default tolerance, matching the "±0.5" label below.
    acc = tolerance_accuracy(y_true, y_pred)

    print(f"\n=== {split} METRICS ===")
    print(f"RMSE                : {rmse:.4f}")
    print(f"MAE                 : {mae:.4f}")
    print(f"R²                  : {r2:.4f}")
    print(f"Accuracy (±0.5)     : {acc * 100:.2f}%")

    return y_pred


# Input artefacts for this training run.
WHISPER_EMB = "/kaggle/working/audio_whisper_train_embeddings.csv"
# NOTE: despite the MPNET_* name, this path points at the DeBERTa-large embedding CSV.
MPNET_EMB   = "/kaggle/working/deberta_large_embeddings.csv"
LABELS      = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"

# Inner-join audio embeddings, text embeddings and labels on filename.
df = load_and_align_data(WHISPER_EMB, MPNET_EMB, LABELS)
print("Total aligned samples:", len(df))

# The merged frame is sliced by position: column 0 is the filename, columns 1..512
# are taken as the audio block, the last column as the target, and everything in
# between as the text-embedding block.
# assumes the label CSV contributes exactly one column besides filename — TODO confirm
X_whisper = df.iloc[:, 1:513].values.astype(np.float32)    # 512 audio columns
X_mpnet   = df.iloc[:, 513:-1].values.astype(np.float32)  # text columns (presumably 1024-d; width comes from the CSV)
y         = df.iloc[:, -1].values.astype(np.float32)

# Fixed-seed 80/20 hold-out split; both feature blocks and the target are split together
# so rows stay aligned.
Xw_tr, Xw_te, Xm_tr, Xm_te, y_tr, y_te = train_test_split(
    X_whisper,
    X_mpnet,
    y,
    test_size=0.2,
    random_state=42
)

# Branch input widths are read from the data rather than hard-coded.
model = build_fusion_model(
    whisper_dim=Xw_tr.shape[1],
    mpnet_dim=Xm_tr.shape[1]
)

model.summary()

BEST_MODEL_PATH = "/kaggle/working/best_whisper_doberta_fusion_model.keras"

# Write the model to disk every time val_loss reaches a new minimum.
checkpoint_cb = ModelCheckpoint(
    filepath=BEST_MODEL_PATH,
    monitor="val_loss",
    save_best_only=True,
    mode="min",
    verbose=1
)

# Stop after 50 epochs without a val_loss improvement and roll the in-memory
# weights back to the best epoch.
early_stop_cb = EarlyStopping(
    monitor="val_loss",
    patience=50,
    restore_best_weights=True,
    verbose=1
)


# NOTE(review): the hold-out split is used here as validation data, so it drives
# both checkpoint selection and early stopping.
history = model.fit(
    {"whisper_input": Xw_tr, "mpnet_input": Xm_tr},
    y_tr,
    validation_data=(
        {"whisper_input": Xw_te, "mpnet_input": Xm_te},
        y_te
    ),
    epochs=200,
    batch_size=32,
    callbacks=[checkpoint_cb, early_stop_cb],
    verbose=1
)


# Reload the checkpoint with the lowest val_loss from disk.
best_model = tf.keras.models.load_model(BEST_MODEL_PATH)
print("\nLoaded best model from:", BEST_MODEL_PATH)


# NOTE(review): the "TEST" numbers below come from the same split that selected the
# checkpoint, so they are not measured on untouched data.
_ = evaluate_regression(best_model, Xw_tr, Xm_tr, y_tr, "TRAIN")
_ = evaluate_regression(best_model, Xw_te, Xm_te, y_te, "TEST")


# Save a second copy under the path the inference cell loads from.
FINAL_MODEL_PATH = "/kaggle/working/final_best_whisper_mpnet_fusion_model.keras"
best_model.save(FINAL_MODEL_PATH)

print("\nFinal best model saved at:", FINAL_MODEL_PATH)


Total aligned samples: 409


Epoch 1/200
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step - loss: 73.5838 - mae: 5.4813
Epoch 1: val_loss improved from inf to 8.04823, saving model to /kaggle/working/best_whisper_doberta_fusion_model.keras
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 585ms/step - loss: 71.8685 - mae: 5.4023 - val_loss: 8.0482 - val_mae: 2.7339
Epoch 2/200
[1m5/6[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 13ms/step - loss: 1.1392 - mae: 0.8176
Epoch 2: val_loss improved from 8.04823 to 7.64305, saving model to /kaggle/working/best_whisper_doberta_fusion_model.keras
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - loss: 1.0972 - mae: 0.8073 - val_loss: 7.6431 - val_mae: 2.6604
Epoch 3/200
[1m5/6[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 13ms/step - loss: 0.7556 - mae: 0.6822
Epoch 3: val_loss improved from 7.64305 to 7.47304, saving model to /kaggle/working/best_whisper_doberta_fusion_model.keras
[1m6/6[0m 

In [40]:
import pandas as pd
import numpy as np
import tensorflow as tf


# Inputs: the test file list plus the two test-set embedding tables.
TEST_META_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/test.csv"

WHISPER_TEST_EMB = "/kaggle/working/audio_whisper_test_embeddings.csv"
DEBERTA_TEST_EMB = "/kaggle/working/deberta_large_embeddings_test.csv"

# Trained fusion model to load, and where the submission is written.
FUSION_MODEL_PATH = "/kaggle/working/final_best_whisper_mpnet_fusion_model.keras"
SUBMISSION_PATH = "/kaggle/working/submission.csv"


test_meta_df = pd.read_csv(TEST_META_PATH)
whisper_df = pd.read_csv(WHISPER_TEST_EMB)
deberta_df = pd.read_csv(DEBERTA_TEST_EMB)

# The first column of each table is the join key, whatever its header is.
test_meta_df.rename(columns={test_meta_df.columns[0]: "filename"}, inplace=True)
whisper_df.rename(columns={whisper_df.columns[0]: "filename"}, inplace=True)
deberta_df.rename(columns={deberta_df.columns[0]: "filename"}, inplace=True)

# Force string keys so the three tables merge on the same dtype.
test_meta_df["filename"] = test_meta_df["filename"].astype(str)
whisper_df["filename"] = whisper_df["filename"].astype(str)
deberta_df["filename"] = deberta_df["filename"].astype(str)

# Only the Whisper table has ".wav" removed from its keys.
whisper_df["filename"] = whisper_df["filename"].str.replace(
    ".wav", "", regex=False
)


# NOTE(review): inner joins silently drop any test file that is missing from either
# embedding table, so the submission can end up shorter than test.csv — verify the count.
test_df = (
    test_meta_df
    .merge(whisper_df, on="filename", how="inner")
    .merge(deberta_df, on="filename", how="inner")
)

print("Total aligned test samples:", len(test_df))
print(test_df.head())

# Positional slicing: columns 1..512 are taken as the audio block and everything
# after them as the text block.
# assumes test.csv contributes no columns other than filename — TODO confirm
X_whisper_test = test_df.iloc[:, 1:513].values.astype(np.float32)

X_deberta_test = test_df.iloc[:, 513:].values.astype(np.float32)

print("Whisper test shape :", X_whisper_test.shape)
print("DeBERTa test shape :", X_deberta_test.shape)


fusion_model = tf.keras.models.load_model(FUSION_MODEL_PATH)
print("Loaded fusion model from:", FUSION_MODEL_PATH)

# The text features are fed under the key "mpnet_input" even though they are
# DeBERTa embeddings.
y_test_pred = fusion_model.predict(
    {
        "whisper_input": X_whisper_test,
        "mpnet_input": X_deberta_test
    },
    verbose=1
).squeeze()

# Keep predictions inside [0, 5].
y_test_pred = np.clip(y_test_pred, 0.0, 5.0)

# Snap each prediction to the nearest multiple of 0.5.
y_test_pred = np.round(y_test_pred * 2) / 2

submission_df = pd.DataFrame({
    "filename": test_df["filename"],
    "label": y_test_pred
})

submission_df.to_csv(SUBMISSION_PATH, index=False)

print("Submission saved at:", SUBMISSION_PATH)
submission_df.head()


Total aligned test samples: 197
    filename         0         1         2         3         4         5  \
0  audio_141 -0.090601 -0.494616 -0.142851 -0.164760  0.259085  0.244506   
1  audio_114  0.006982 -0.009547  0.069612 -0.146992 -0.206328  0.392823   
2   audio_17 -0.057944  0.019045 -0.017720  0.042755 -0.006069  0.009745   
3   audio_76  0.174044 -0.028993 -0.065615  0.139336 -0.011715 -0.143123   
4  audio_156 -0.051893 -0.351710 -0.011548 -0.003263  0.081606  0.067437   

          6         7         8  ...     e1014     e1015     e1016     e1017  \
0 -0.364304  0.028873  0.068138  ...  0.000471 -0.426584  0.532851  0.185651   
1 -0.121439 -0.258225  0.137687  ...  0.076440 -0.126993  0.037520  0.398466   
2 -0.149774 -0.358314  0.498129  ... -0.350288 -0.345638  0.217359  0.514474   
3 -0.110890 -0.053824  0.393943  ... -0.173768 -0.363479  0.276646  0.292233   
4 -0.270160 -0.129895  0.272256  ... -0.307831 -0.081907  0.108643  0.190455   

      e1018     e1019     e102

Unnamed: 0,filename,label
0,audio_141,2.5
1,audio_114,5.0
2,audio_17,3.5
3,audio_76,5.0
4,audio_156,3.0


In [46]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


# Reset Keras' global state, then fix the NumPy and TensorFlow seeds for this cell.
tf.keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)


def load_and_align_data(whisper_path, mpnet_path, label_path):
    """Load two embedding tables and a label table and inner-join them on filename.

    The first column of every CSV is used as the join key, whatever its header
    is; it is renamed to ``"filename"``. Keys are cast to ``str`` and a single
    trailing ``.wav`` extension is removed in all three tables, so the tables
    line up whether or not a given CSV kept the extension.

    Args:
        whisper_path: Path to the CSV holding the audio (Whisper) embeddings.
        mpnet_path: Path to the CSV holding the text embeddings.
        label_path: Path to the CSV holding the labels.

    Returns:
        A DataFrame whose columns are ``filename``, then the remaining columns
        of the Whisper table, the text-embedding table and the label table, in
        that order. Only keys present in all three tables are kept.
    """
    def _read_keyed(path):
        # Read one table and normalise its key column.
        frame = pd.read_csv(path)
        frame = frame.rename(columns={frame.columns[0]: "filename"})
        # astype(str) keeps the .str accessor usable if the ids parsed as numbers.
        # The anchored pattern strips only a trailing extension; a plain
        # substring replace would also eat ".wav" in the middle of a name.
        frame["filename"] = (
            frame["filename"].astype(str).str.replace(r"\.wav$", "", regex=True)
        )
        return frame

    whisper_df = _read_keyed(whisper_path)
    mpnet_df = _read_keyed(mpnet_path)
    label_df = _read_keyed(label_path)

    df = (
        whisper_df
        .merge(mpnet_df, on="filename", how="inner")
        .merge(label_df, on="filename", how="inner")
    )

    return df


def build_whisper_branch(input_dim):
    """Create the audio-side sub-network used as the attention query source.

    Layout: Dense(512) -> Dense(256) -> Dropout(0.2) -> Dense(64) -> Dense(8).

    Args:
        input_dim: Number of features in one audio embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``: the Keras input named
        ``"whisper_input"`` and the 8-unit output of the branch.
    """
    layers = tf.keras.layers
    whisper_in = layers.Input(shape=(input_dim,), name="whisper_input")

    hidden = layers.Dense(512, activation="relu")(whisper_in)
    hidden = layers.Dense(256, activation="relu")(hidden)
    hidden = layers.Dropout(0.2)(hidden)
    hidden = layers.Dense(64, activation="relu")(hidden)
    features = layers.Dense(8, activation="relu")(hidden)

    return whisper_in, features


def build_mpnet_branch(input_dim):
    """Create the text-side sub-network used as the attention key/value source.

    Layout: Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128).

    Args:
        input_dim: Number of features in one text embedding vector.

    Returns:
        ``(input_tensor, feature_tensor)``: the Keras input named
        ``"mpnet_input"`` and the 128-unit output of the branch.
    """
    layers = tf.keras.layers
    text_in = layers.Input(shape=(input_dim,), name="mpnet_input")

    hidden = layers.Dense(256, activation="relu")(text_in)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Dropout(0.3)(hidden)
    features = layers.Dense(128, activation="relu")(hidden)

    return text_in, features


def build_fusion_model(whisper_dim, mpnet_dim):
    """Assemble and compile the cross-attention fusion regressor.

    The audio branch output is used as the attention query and the text branch
    output as key and value. The attended result is added back to the query
    (residual), layer-normalised, flattened, concatenated with the raw text
    features, and passed through
    Dense(128) -> BatchNorm -> Dropout(0.1) -> Dense(64) -> Dense(8) -> Dense(1).

    Args:
        whisper_dim: Width of the audio embedding input.
        mpnet_dim: Width of the text embedding input.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-3, MSE loss, MAE metric)
        taking ``[whisper_input, mpnet_input]`` and emitting one value per row.
    """
    w_inp, w_feat = build_whisper_branch(whisper_dim)
    m_inp, m_feat = build_mpnet_branch(mpnet_dim)

    # Read the branch widths from the tensors instead of hard-coding 8 / 128,
    # so a change to either branch builder cannot break the reshapes.
    w_width = int(w_feat.shape[-1])
    m_width = int(m_feat.shape[-1])

    # Give each feature vector a sequence axis of length 1 for the attention layer.
    w_seq = tf.keras.layers.Reshape((1, w_width))(w_feat)  # Query
    m_seq = tf.keras.layers.Reshape((1, m_width))(m_feat)  # Key / Value

    # NOTE(review): the key/value sequence has length 1, so the softmax over keys
    # has a single element and the layer reduces to learned projections of the
    # text features; confirm that this is the intended use of attention.
    attn = tf.keras.layers.MultiHeadAttention(
        num_heads=12,
        key_dim=32,
        name="audio_to_text_attention"
    )(query=w_seq, value=m_seq, key=m_seq)

    # Residual connection around the attention, then layer normalisation.
    attn = tf.keras.layers.Add()([w_seq, attn])
    attn = tf.keras.layers.LayerNormalization()(attn)

    attn = tf.keras.layers.Flatten()(attn)

    # The raw text features are concatenated alongside the attended audio features.
    fused = tf.keras.layers.Concatenate()([attn, m_feat])

    fused = tf.keras.layers.Dense(128, activation="relu")(fused)
    fused = tf.keras.layers.BatchNormalization()(fused)
    fused = tf.keras.layers.Dropout(0.1)(fused)

    fused = tf.keras.layers.Dense(64, activation="relu")(fused)
    fused = tf.keras.layers.Dense(8, activation="relu")(fused)

    # Linear output unit: the target is a continuous score.
    output = tf.keras.layers.Dense(1)(fused)

    model = tf.keras.Model(
        inputs=[w_inp, m_inp],
        outputs=output
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss="mse",
        metrics=["mae"]
    )

    return model

# Input artefacts for this training run (Whisper audio + SBERT MPNet text embeddings).
WHISPER_EMB = "/kaggle/working/audio_whisper_train_embeddings.csv"
MPNET_EMB   = "/kaggle/working/audio_sbert_mpnet_embeddings.csv"
LABELS     = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"

# Inner-join audio embeddings, text embeddings and labels on filename.
df = load_and_align_data(WHISPER_EMB, MPNET_EMB, LABELS)
print("Total aligned samples:", len(df))

# The merged frame is sliced by position: column 0 is the filename, columns 1..512
# are taken as the audio block, the last column as the target, and everything in
# between as the text-embedding block.
# assumes the label CSV contributes exactly one column besides filename — TODO confirm
X_whisper = df.iloc[:, 1:513].values.astype(np.float32)   # 512 audio columns
X_mpnet   = df.iloc[:, 513:-1].values.astype(np.float32) # text columns (768-d per the MPNet shape printed earlier)
y         = df.iloc[:, -1].values.astype(np.float32)

# Fixed-seed 80/20 hold-out split; both feature blocks and the target are split together
# so rows stay aligned.
Xw_tr, Xw_te, Xm_tr, Xm_te, y_tr, y_te = train_test_split(
    X_whisper, X_mpnet, y,
    test_size=0.2,
    random_state=42
)


# Branch input widths are read from the data rather than hard-coded.
model = build_fusion_model(
    whisper_dim=Xw_tr.shape[1],
    mpnet_dim=Xm_tr.shape[1]
)

model.summary()

# The hold-out split doubles as the Keras validation set. There are no callbacks,
# so training always runs the full 100 epochs and `model` ends with last-epoch weights.
history = model.fit(
    {"whisper_input": Xw_tr, "mpnet_input": Xm_tr},
    y_tr,
    validation_data=(
        {"whisper_input": Xw_te, "mpnet_input": Xm_te},
        y_te
    ),
    epochs=100,
    batch_size=32,
    verbose=1
)


def evaluate_regression(model, Xw, Xm, y_true, split):
    """Predict with the fusion model and print RMSE, MAE and R² for one split.

    Predictions are squeezed and clipped to ``[0.0, 5.0]`` before any metric
    is computed.

    Args:
        model: Object whose ``predict`` accepts a dict keyed by
            ``"whisper_input"`` and ``"mpnet_input"``.
        Xw: Audio features for the split.
        Xm: Text features for the split.
        y_true: Target values for the split.
        split: Label printed in the metrics header.

    Returns:
        The clipped predictions.
    """
    model_inputs = {"whisper_input": Xw, "mpnet_input": Xm}
    raw_pred = model.predict(model_inputs)
    y_pred = np.clip(raw_pred.squeeze(), 0.0, 5.0)

    # One residual vector feeds both error metrics.
    residual = y_true - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    mae = np.mean(np.abs(residual))
    r2 = r2_score(y_true, y_pred)

    print(f"\n=== {split} METRICS ===")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE : {mae:.4f}")
    print(f"R²  : {r2:.4f}")

    return y_pred


# Report metrics on both splits; the returned predictions are discarded.
_ = evaluate_regression(model, Xw_tr, Xm_tr, y_tr, "TRAIN")
_ = evaluate_regression(model, Xw_te, Xm_te, y_te, "TEST")


# Persist the model as it stands after the final epoch (no checkpointing was used).
MODEL_PATH = "/kaggle/working/whisper_mpnet_attention_fusion_model.keras"
model.save(MODEL_PATH)

print("\nModel saved at:", MODEL_PATH)


Total aligned samples: 409


Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 406ms/step - loss: 9.7843 - mae: 2.9577 - val_loss: 8.3249 - val_mae: 2.7854
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 4.5202 - mae: 1.8928 - val_loss: 6.9017 - val_mae: 2.5163
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 2.0090 - mae: 1.1631 - val_loss: 5.7182 - val_mae: 2.2685
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.9327 - mae: 0.7438 - val_loss: 5.0622 - val_mae: 2.1192
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.6193 - mae: 0.6182 - val_loss: 4.7429 - val_mae: 2.0417
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.4115 - mae: 0.5023 - val_loss: 4.3938 - val_mae: 1.9539
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step 