# In [None]:
# Step 1: Import necessary libraries
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import joblib

# Step 2: Read the train/test metadata tables shipped with the competition dataset
train_csv_path = '/kaggle/input/shl-intern-hiring-assessment/Dataset/train.csv'  # training metadata table
test_csv_path = '/kaggle/input/shl-intern-hiring-assessment/Dataset/test.csv'    # test metadata table

# Both tables are parsed before anything is printed
train_df, test_df = map(pd.read_csv, (train_csv_path, test_csv_path))

# Show the first rows of each table as a quick sanity check
print(f"Train data sample: \n{train_df.head()}")
print(f"Test data sample: \n{test_df.head()}")

# Step 3: Define function to extract summary audio features
# (MFCC, Chroma, Spectral Contrast, Zero-Crossing Rate, RMS energy)
def extract_features(file_path, n_mfcc=13):
    """Summarise one audio file as a fixed-length 1-D feature vector.

    The file is loaded at its native sampling rate (``sr=None``), several
    frame-level descriptors are computed with librosa, and every descriptor
    is averaged over time so that recordings of different duration yield
    vectors of the same length.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 13.

    Returns:
        A 1-D ``numpy.ndarray`` laid out as ``[MFCC means, chroma means,
        spectral-contrast means, mean zero-crossing rate, mean RMS energy]``.
        The chroma and contrast lengths are whatever librosa's defaults
        produce (presumably 12 and 7 -- confirm against the installed
        librosa version).

    Raises:
        Any exception raised by ``librosa.load`` or the feature functions
        propagates unchanged; nothing is caught here.
    """
    audio, sr = librosa.load(file_path, sr=None)

    # MFCC (Mel-Frequency Cepstral Coefficients): one mean per coefficient
    mfcc_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    mfcc_features = np.mean(mfcc_features, axis=1)

    # Chroma features: one mean per chroma bin
    chroma_features = librosa.feature.chroma_stft(y=audio, sr=sr)
    chroma_features = np.mean(chroma_features, axis=1)

    # Spectral contrast: one mean per band
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    spectral_contrast = np.mean(spectral_contrast, axis=1)

    # Zero-crossing rate: collapsed to a single scalar
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=audio)
    zero_crossing_rate = np.mean(zero_crossing_rate)

    # Root-mean-square (RMS) energy of the signal, collapsed to a single
    # scalar. This is a signal-level feature, not a prediction error.
    rms_energy = librosa.feature.rms(y=audio)
    rms_energy = np.mean(rms_energy)

    # Concatenate in a fixed order so every file maps to the same column layout
    features = np.hstack([
        mfcc_features,
        chroma_features,
        spectral_contrast,
        zero_crossing_rate,
        rms_energy,
    ])
    return features

# Step 4: Build the training feature matrix and the label vector
train_audio_dir = '/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/train'

# One feature vector per row of the training table, in the table's row order
X_train = np.array([
    extract_features(os.path.join(train_audio_dir, clip_name))
    for clip_name in train_df['filename']
])

# Labels come from the same rows, so they stay aligned with X_train
y_train = np.array(train_df['label'].tolist())

# Step 5: Standardise the features; the scaling statistics are learned from the training matrix
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Step 6: Fit a 100-tree Random Forest regressor (seed fixed so runs are repeatable)
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)

# Step 7: Measure the fit on the same rows the model was trained on
# (this is an in-sample error, so it does not estimate performance on unseen audio)
y_train_pred = model.predict(X_train_scaled)

# RMSE = square root of the mean squared error between labels and predictions
train_mse = mean_squared_error(y_train, y_train_pred)
rmse = np.sqrt(train_mse)
print(f"RMSE on training data: {rmse}")

# Step 8: Build the test feature matrix and predict
test_audio_dir = '/kaggle/input/shl-intern-hiring-assessment/Dataset/audios/test'

# Keep the file names in table order so each prediction can be matched back to its file
filenames = test_df['filename'].tolist()

# One feature vector per test file, in the same order as `filenames`
X_test = np.array([
    extract_features(os.path.join(test_audio_dir, clip_name))
    for clip_name in filenames
])

# Reuse the scaler fitted on the training features (no re-fitting on test data)
X_test_scaled = scaler.transform(X_test)

# Predict the grammar scores on the test set
y_test_pred = model.predict(X_test_scaled)

# Step 9: Assemble the submission table (one row per test file) and write it as CSV
submission_file_path = '/kaggle/working/submission.csv'

submission_columns = {
    'filename': filenames,
    'predicted_mos_score': y_test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv(submission_file_path, index=False)  # index=False: no row-number column in the file

# Step 10: Persist the fitted model so it can be reloaded later without retraining (optional)
joblib.dump(model, '/kaggle/working/grammar_scoring_model.pkl')

print(f"Submission file created: {submission_file_path}")

# Optional: Cross-validation to estimate out-of-sample error
#
# The scaler is placed inside a pipeline and cross-validation runs on the raw
# feature matrix, so each fold's scaler is fitted on that fold's training split
# only. Cross-validating on features that were scaled with statistics from the
# whole training set would let information from the held-out rows leak into
# preprocessing.
from sklearn.pipeline import make_pipeline

kf = KFold(n_splits=5, random_state=42, shuffle=True)

# cross_val_score clones the estimator for every fold, so the already-fitted
# `model` is only used as a template for its hyper-parameters and is not refitted.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')

# Scores are negated MSE values (one per fold): negate the mean, then take the square root
print(f"Cross-validated RMSE: {np.sqrt(-cv_scores.mean())}")