In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import joblib
import os


In [None]:
# Feature tables: one transcript file and one audio-feature file per split
train_transcripts = pd.read_csv("train_transcripts.csv")
train_audio = pd.read_csv("train_audio_features.csv")
test_transcripts = pd.read_csv("test_transcripts.csv")
test_audio = pd.read_csv("test_audio_features.csv")

# Target table; its "label" column is exposed as "grammar" from here on
labels_df = pd.read_csv("dataset/train.csv").rename(columns={"label": "grammar"})


In [None]:
# Join transcript rows to audio-feature rows on the shared "filename" key.
# merge() defaults to an inner join, so filenames missing from either side
# are dropped from the result.
test_df = test_transcripts.merge(test_audio, on="filename")
train_df = (
    train_transcripts
    .merge(train_audio, on="filename")
    # attach the target column to the training rows
    .merge(labels_df[["filename", "grammar"]], on="filename")
)

# Optional: persist the joined tables for debugging
train_df.to_csv("train_combined.csv", index=False)
test_df.to_csv("test_combined.csv", index=False)


In [None]:
# Target is the "grammar" column; features are every remaining column
# except the "filename" identifier and the "transcript" column
y = train_df["grammar"]
X = train_df.drop(columns=["filename", "transcript", "grammar"])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [None]:
# Random forest with 100 trees; the seed is fixed so reruns build the same forest
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X=X_train_scaled, y=y_train)


In [None]:
# Score the held-out validation rows
y_pred = model.predict(X_val_scaled)

# Pearson r is the first element of the pearsonr result; RMSE is the
# square root of the mean squared error
pearson_corr = pearsonr(y_val, y_pred)[0]
mse = mean_squared_error(y_val, y_pred)
rmse = mse ** 0.5

print(
    f"✅ Pearson Correlation: {pearson_corr:.4f}",
    f"✅ RMSE: {rmse:.4f}",
    sep="\n",
)


In [None]:
# Persist the fitted estimator and the scaler it was trained with, so
# both can be reloaded together for later inference
joblib.dump(value=model, filename="grammar_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")


In [None]:
# Select exactly the columns the model was trained on, in training order.
# Dropping only "filename"/"transcript" would let any extra or reordered
# column in the test CSVs reach the scaler, where it either raises or is
# silently misaligned with the fitted statistics. A training feature that
# is missing from test_df raises KeyError here.
X_test = test_df[list(X.columns)]

# Reuse the scaler fitted on the training split; it is not refitted on test data
X_test_scaled = scaler.transform(X_test)

test_predictions = model.predict(X_test_scaled)


In [None]:
# Pair each test filename with its predicted score, rounded to 1 decimal
submission_df = (
    test_df[["filename"]]
    .assign(label=test_predictions)
    .assign(label=lambda frame: frame["label"].round(1))
)

# Write the submission file without the index column
submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv saved successfully.")


## 📝 Project Report: Grammar Scoring Engine

### 🔍 Objective:
To develop a model that predicts grammar scores (0–5) from 45–60s audio files using audio and transcript features.

### 📂 Data Used:
- `dataset/train.csv`: Grammar score labels (the `label` column, renamed to `grammar` in the notebook)
- `train_transcripts.csv` + `train_audio_features.csv`: Training features
- `test_transcripts.csv` + `test_audio_features.csv`: Test features

### ⚙️ Method:
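- Merged transcript and audio features on `filename`
- Dropped the `filename` and `transcript` columns; all remaining columns are used as features
- Held out 20% of the training rows for validation (`random_state=42`)
- Scaled features using `StandardScaler` (fitted on the training split only)
- Trained a `RandomForestRegressor` (100 trees) on the training split
- Evaluated on the validation split using **Pearson Correlation** and **RMSE**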

### 📊 Evaluation:
- **Pearson Correlation**: `0.83` *(placeholder – replace with the value printed by the evaluation cell)*
- **RMSE**: `0.45` *(placeholder – replace with the value printed by the evaluation cell)*

### ✅ Submission:
- `submission.csv` contains:
  - `filename`
  - Predicted grammar `label` (rounded to 1 decimal)

---

### ✅ Conclusion:
This approach uses interpretable ML, combining features derived from speech and text, to estimate grammar scores with reasonable accuracy. The trained model's feature importances (`model.feature_importances_`) can be used to understand its decisions and to guide further improvements.
