In [1]:
# SHL Grammar Scoring Engine
**Author:** Jaswanth Chilakalapudi  
**Description:** This notebook develops a Grammar Scoring Engine that predicts a grammar proficiency score (0–5) for 45–60-second speech samples from transcript-derived and acoustic features.  
**Evaluation Metric:** Pearson correlation coefficient


SyntaxError: invalid decimal literal (2870833485.py, line 3)

In [None]:
import pandas as pd
import os

# Load the transcript tables, the acoustic-feature tables and the training labels.
train_transcripts = pd.read_csv("train_transcripts.csv")
test_transcripts = pd.read_csv("test_transcripts.csv")
train_audio = pd.read_csv("train_audio_features.csv")
test_audio = pd.read_csv("test_audio_features.csv")
# Rename the target column while loading instead of mutating the frame with
# `inplace=True`, so the cell gives the same result every time it is re-run.
labels_df = pd.read_csv("dataset/train.csv").rename(columns={"label": "grammar"})

# Merge data
# All joins are pandas' default inner join on "filename", so a filename missing
# from either side is dropped. `validate="one_to_one"` makes pandas raise a
# MergeError when a filename is duplicated on either side, instead of silently
# multiplying rows in the training/test tables.
train_df = pd.merge(train_transcripts, train_audio, on="filename", validate="one_to_one")
test_df = pd.merge(test_transcripts, test_audio, on="filename", validate="one_to_one")
train_df = pd.merge(
    train_df,
    labels_df[["filename", "grammar"]],
    on="filename",
    validate="one_to_one",
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Every column except the identifier, the raw transcript text and the target
# is used as a model feature.
NON_FEATURE_COLUMNS = ["filename", "transcript", "grammar"]
y = train_df["grammar"]
X = train_df.drop(columns=NON_FEATURE_COLUMNS)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics on the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

# Fit a 100-tree random forest on the standardised training features.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict the held-out validation split.
y_pred = model.predict(X_val_scaled)

# pearsonr returns (correlation, p-value); only the correlation is reported.
pearson_corr = pearsonr(y_val, y_pred)[0]
# RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_pred)
rmse = val_mse ** 0.5

print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"RMSE: {rmse:.4f}")


In [None]:
### 📊 Evaluation
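- **Pearson Correlation** measures the strength of the linear relationship between the predicted and the true grammar scores (1.0 means the predictions follow the ground truth perfectly); it does not penalise a constant offset or a different scale in the predictions.
- **RMSE** complements it by tracking the typical size of the prediction error in score units.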


In [None]:
import joblib

# Save model and scaler
joblib.dump(model, "grammar_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Prepare test predictions
# Select the test features by the training column names (same set, same order)
# rather than by dropping the non-feature columns: the scaler and the model
# were fitted on exactly X_train's columns, so an extra or reordered column in
# test_df must not reach them. A missing feature column raises KeyError here.
X_test = test_df[X_train.columns]
X_test_scaled = scaler.transform(X_test)
test_preds = model.predict(X_test_scaled)

# One row per test file; predictions are rounded to one decimal place.
submission_df = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_preds.round(1)
})

submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv saved.")


In [None]:
## ✅ Summary
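- Features used: the acoustic and transcript-derived feature columns (the raw `transcript` text and the `filename` identifier are dropped before modelling).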
- Model: Random Forest Regressor
- Evaluation Metric: Pearson Correlation (used in leaderboard).
- Output: submission.csv with predicted grammar scores (0–5), rounded to one decimal place.


In [None]:
import matplotlib.pyplot as plt

# Scatter the held-out true scores against the model's predictions on a
# square 6x6-inch figure, using the explicit Axes interface.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_val, y_pred, alpha=0.6)
ax.set_xlabel("True Grammar Score")
ax.set_ylabel("Predicted Grammar Score")
ax.set_title("Validation Set: True vs Predicted")
ax.grid(True)
plt.show()
