In [None]:
#Import necessary libraries
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import os
import numpy as np
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

import warnings
warnings.filterwarnings('ignore')

In [None]:
def get_wav2vec2_embeddings(audio_file_path, processor, model, device):
    """
    Generates a single Wav2Vec2 embedding vector for a given audio file.

    The clip is loaded, down-mixed to mono, resampled to the sampling rate
    the processor is configured for, run through the model, and the last
    hidden state is averaged over the time axis.

    Args:
        audio_file_path (str): Path to the audio file to embed.
        processor (Wav2Vec2Processor): Wav2Vec2 processor.
        model (Wav2Vec2Model): Wav2Vec2 model.
        device (torch.device): The device to run the model on (CPU or GPU).

    Returns:
        numpy.ndarray or None: The time-averaged last hidden state, or None
        if the file could not be processed (the error is printed).
    """
    try:
        waveform, sample_rate = torchaudio.load(audio_file_path)

        # Down-mix any channel layout (mono, stereo, 5.1, ...) to a 1-D signal
        # by averaging over the channel axis.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)

        if waveform.numel() == 0:
            raise ValueError("audio file contains no samples")

        # The processor is configured for one fixed sampling rate; bring the
        # clip to that rate instead of handing over a mismatched one.
        # If the processor does not expose a rate, the clip is left as-is.
        feature_extractor = getattr(processor, "feature_extractor", processor)
        target_rate = getattr(feature_extractor, "sampling_rate", sample_rate)
        if sample_rate != target_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)
            sample_rate = target_rate

        inputs = processor(waveform.numpy(), sampling_rate=sample_rate, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = model(**inputs)

        # Average over the time axis, then drop the batch axis of size one.
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
        return embeddings
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole directory run.
        print(f"Error processing {audio_file_path}: {e}")
        return None

In [None]:
def process_audio_files(audio_dir, processor, model, device):
    """
    Processes all WAV files in a directory and returns their embeddings.

    Files are visited in sorted name order so the returned dictionary has the
    same key order on every machine; os.listdir alone gives no ordering
    guarantee. The ".wav" extension is matched case-insensitively.

    Args:
        audio_dir (str): Path to the directory containing audio files.
        processor (Wav2Vec2Processor): Wav2Vec2 processor.
        model (Wav2Vec2Model): Wav2Vec2 model.
        device (torch.device): The device to run the model on (CPU or GPU).

    Returns:
        dict: A dictionary mapping audio file names to their embedding vectors.
        Files whose embedding could not be computed are left out.
    """
    # Filter first so the progress bar counts audio files only.
    wav_files = sorted(
        name for name in os.listdir(audio_dir) if name.lower().endswith(".wav")
    )

    embeddings = {}
    for filename in tqdm(wav_files):
        audio_file_path = os.path.join(audio_dir, filename)
        embedding = get_wav2vec2_embeddings(audio_file_path, processor, model, device)
        # A None result means the helper already reported the failure.
        if embedding is not None:
            embeddings[filename] = embedding
    return embeddings

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained processor/encoder pair and move the encoder to the device.
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)
model.to(device)

# Embed every training clip found in the dataset directory.
train_audio_dir = "/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train"
train_embeddings = process_audio_files(train_audio_dir, processor, model, device)

# Sanity check: report the embedding shape of the first processed file.
if len(train_embeddings) > 0:
    first_train_file = next(iter(train_embeddings))
    print(f"Shape of embedding for {first_train_file}: {train_embeddings[first_train_file].shape}")

In [None]:
# Read the label table and index it by file name (a later duplicate row wins).
train_csv_path = "/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv"
train_df = pd.read_csv(train_csv_path)
train_labels = dict(zip(train_df['filename'], train_df['label']))

# Keep only the embedded files that also have a label, in embedding order,
# so that row i of `features` corresponds to entry i of `targets`.
labelled_files = [name for name in train_embeddings if name in train_labels]
features = np.array([train_embeddings[name] for name in labelled_files])
targets = np.array([train_labels[name] for name in labelled_files])

In [None]:
# Hold out 20% of the labelled clips for validation; the fixed seed keeps the
# split repeatable for a given ordering of `features`.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

In [None]:
def objective(trial):
    """
    Objective function for Optuna.

    Samples one XGBoost hyperparameter configuration from the trial, fits a
    regressor on the training split with early stopping monitored on the
    validation split, and scores it by RMSE on that validation split.

    Args:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters.

    Returns:
        float: Validation RMSE of the predictions clipped to [0, 5]
        (lower is better).
    """
    param = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }

    # early_stopping_rounds is an estimator setting: XGBoost 2.0 no longer
    # accepts it as a fit() argument, so it is passed to the constructor
    # (supported there since XGBoost 1.6).
    xgb_model = xgb.XGBRegressor(**param, early_stopping_rounds=10)
    xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Predictions are clipped to [0, 5] before scoring, matching the clipping
    # applied to the final predictions elsewhere in the notebook.
    y_pred = xgb_model.predict(X_val)
    y_pred_transformed = np.clip(y_pred, 0, 5)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred_transformed))
    return float(rmse)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs, in line with the fixed random_state used elsewhere in this notebook.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Adjust n_trials as needed

# Report the best trial: its objective value and the sampled parameters.
trial = study.best_trial
print("Best trial:")
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Work on a copy so the parameters stored on the best trial are not modified.
best_params = dict(trial.params)
best_params['objective'] = 'reg:squarederror'
best_params['random_state'] = 42

# Train the final model the same way each trial was scored: with early stopping
# monitored on the validation split. Without it, all `n_estimators` trees are
# always built, which is not the configuration the study's best value measured.
# (early_stopping_rounds is a constructor argument since XGBoost 1.6.)
final_xgb_model = xgb.XGBRegressor(**best_params, early_stopping_rounds=10)
final_xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Predict on the validation split and clamp the predictions to [0, 5].
y_pred = final_xgb_model.predict(X_val)
y_pred_clipped = np.clip(y_pred, a_min=0, a_max=5)

In [None]:
# Validation RMSE of the clipped predictions.
mse = mean_squared_error(y_val, y_pred_clipped)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (Clipped): {rmse}")

In [None]:
# Embed the test clips with the same processor/model used for training.
test_audio_dir = "/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test"
test_embeddings = process_audio_files(test_audio_dir, processor, model, device)

# Dicts preserve insertion order, so keys() and values() stay aligned.
test_filenames = list(test_embeddings.keys())
test_features = list(test_embeddings.values())

# Predict and clamp to [0, 5], as done for the validation predictions.
test_predictions = final_xgb_model.predict(np.array(test_features))
test_predictions_clipped = np.clip(test_predictions, a_min=0, a_max=5)  # Clip test predictions

In [None]:
# Assemble the submission table (one row per processed test file) and write it
# to the working directory without the index column.
submission_columns = {
    'filename': test_filenames,
    'label': test_predictions_clipped,
}
output_df = pd.DataFrame(submission_columns)
output_df.to_csv("submission.csv", index=False)