
#  Grammar Scoring Model Report

##  Approach

We built a **Grammar Scoring Engine** that evaluates grammar usage in spoken English audio samples using deep learning techniques. The model scores inputs based on a predefined **rubric (1–5)** that evaluates grammar control, sentence structure, and complexity of language used.

##  Preprocessing Steps

1. **Audio Loading**: Audio files are loaded at 16kHz sampling rate.
2. **Feature Extraction**: Using `Wav2Vec2` from HuggingFace Transformers to convert speech into embeddings.
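3. **Padding**: The processor is called with `padding=True`; because each clip is processed on its own, a consistent input size across samples is obtained by averaging the Wav2Vec2 embeddings over time.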
4. **Label Mapping**: Targets were labeled according to the 1–5 grammar score rubric.

##  Model Architecture

- **Feature Extractor**: `Wav2Vec2` (`facebook/wav2vec2-base-960h`), pre-trained on large speech corpora.
- **Classifier**: A simple fully connected feed-forward network, used as a regression head, on top of time-averaged Wav2Vec2 embeddings.

Architecture summary:
- Input: [Batch, Time] audio waveforms
- Feature extraction: `Wav2Vec2Model`
- Averaging over time dimension
- Fully connected layers (768 → 256 → 64 → 1, with ReLU activations and dropout) → Output score (1–5)

##  Evaluation

- **Loss**: Mean Squared Error (regression-style)
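- **Metric**: Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE), as reported by the training code; accuracy can additionally be computed by rounding predictions to the nearest integer score.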
- The model was trained on 444 samples and evaluated on 195 test samples.
- Final predictions are compared with ground truth labels.

##  Scoring Rubric Recap

| Score | Description |
|-------|-------------|
| 1 | Frequent basic grammar errors, incomplete sentences |
| 2 | Basic grammar and syntax errors throughout |
| 3 | Decent grammar but syntax or structure errors |
| 4 | Strong grammar with occasional minor mistakes |
| 5 | Excellent grammar and control over complex structures |



In [4]:

import os
import torch
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch import nn
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [2]:

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [None]:
# --- Step 1: Load Processor and Pretrained Wav2Vec2 Model ---
# The processor and the encoder are both loaded from the same checkpoint.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Move the encoder to the selected device and switch it to eval mode; it is
# only used for inference (feature extraction) in this notebook.
wav2vec_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
wav2vec_model = wav2vec_model.to(device)
wav2vec_model = wav2vec_model.eval()


In [None]:
# Folder that holds the training audio files referenced by the CSV.
audio_dir = r"C:\Users\SIDDHARTH JAIN\OneDrive\Desktop\SHL Project\train_audio"

# CSV file describing the training samples.
csv_path = r"C:\Users\SIDDHARTH JAIN\OneDrive\Desktop\SHL Project\train.csv"

# Read the table and print its first rows as a quick sanity check.
df = pd.read_csv(csv_path)
print(df.head())


In [None]:
# for feature extraction from raw audio
def save_wav2vec_features(df, audio_dir, feat_path="features.npy", label_path="labels.npy"):
    """Embed every audio file listed in ``df`` and cache the results on disk.

    For each row, ``audio_dir/<row['filename']>`` is loaded at 16 kHz and run
    through the module-level ``processor`` and ``wav2vec_model``. The last
    hidden state is averaged over dim 1 and squeezed, giving one vector per
    file.

    Args:
        df: DataFrame with ``filename`` and ``score`` columns.
        audio_dir: Directory containing the audio files named in ``df``.
        feat_path: Output ``.npy`` path for the pooled embeddings.
        label_path: Output ``.npy`` path for the scores.
    """
    pooled_vectors = []
    scores = []
    for _, record in tqdm(df.iterrows(), total=len(df)):
        wav_path = os.path.join(audio_dir, record['filename'])
        score = record['score']
        waveform, _rate = librosa.load(wav_path, sr=16000)
        batch = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
        # No gradients are needed: the encoder is only used to produce features.
        with torch.no_grad():
            hidden = wav2vec_model(**batch.to(device)).last_hidden_state
            pooled = hidden.mean(dim=1).squeeze()
            vector = pooled.cpu().numpy()
        pooled_vectors.append(vector)
        scores.append(score)
    np.save(feat_path, pooled_vectors)
    np.save(label_path, scores)


In [None]:
# One-off step: extract the training embeddings and cache them as .npy files.
save_wav2vec_features(df, audio_dir, feat_path="features.npy", label_path="labels.npy")

In [None]:
# --- Step 2: Dataset ---
class CachedAudioDataset(Dataset):
    """Dataset over feature vectors (and optional labels) cached in ``.npy`` files.

    Args:
        feature_path: Path to the ``.npy`` file holding the features.
        label_path: Optional path to the ``.npy`` file holding the labels.
            When omitted, every item gets a dummy label of ``0.0``.
        file_list_path: Optional text file; its stripped lines are stored in
            ``self.file_list``.
    """

    def __init__(self, feature_path, label_path=None, file_list_path=None):
        self.features = np.load(feature_path)

        self.labels = None
        if label_path:
            self.labels = np.load(label_path)

        self.file_list = None
        if file_list_path:
            with open(file_list_path, 'r') as handle:
                self.file_list = [entry.strip() for entry in handle]

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, label)`` as float32 tensors for item ``idx``."""
        sample = torch.tensor(self.features[idx], dtype=torch.float32)
        if self.labels is None:
            # Unlabelled data: keep the (x, y) shape of an item with a dummy target.
            return sample, torch.tensor(0.0)
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample, target


In [None]:
# --- Step 3: Regression Head ---
class RegressionModel(nn.Module):
    """Small MLP that maps a feature vector to a single scalar output.

    Layer sizes are ``input_dim -> 256 -> 64 -> 1`` with ReLU activations and
    dropout (p=0.3) after the first hidden layer.

    Args:
        input_dim: Size of the input feature vector (default 768).
    """

    def __init__(self, input_dim=768):
        super().__init__()
        first_hidden, second_hidden = 256, 64
        # Layer order is kept fixed so the ``net.<index>`` parameter names stay
        # the same as in previously saved checkpoints.
        layers = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``; the last dimension of the result has size 1."""
        scores = self.net(x)
        return scores


In [None]:
#--- Step 4: Training & Evaluation ---
def train_model(model, train_loader, val_loader, epochs=20, lr=1e-4):
    """Train ``model`` with MSE loss, early stopping and best-checkpoint saving.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    mean validation loss improves, the weights are written to
    ``best_model.pth``. Training stops early once the validation loss has not
    improved for 3 consecutive epochs. The MAE/RMSE printed at the end are
    those of the best (saved) epoch, so they describe the checkpoint on disk.

    Uses the module-level ``device``. Mixed precision is enabled only when
    that device is CUDA.

    Args:
        model: Network whose output has a trailing dimension of size 1.
        train_loader: Iterable of ``(features, targets)`` training batches.
        val_loader: Iterable of ``(features, targets)`` validation batches.
        epochs: Maximum number of epochs.
        lr: Adam learning rate.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Gradient scaling / autocast are CUDA features; on CPU they are disabled
    # explicitly so the scaler becomes a transparent pass-through.
    use_amp = device.type == "cuda"
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val_loss = float("inf")
    best_preds, best_targets = [], []
    patience = 3
    wait = 0

    for epoch in range(epochs):
        model.train()
        train_losses = []

        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                # squeeze(-1) drops only the output dimension, so a batch of
                # size 1 keeps its batch axis and still matches y_batch.
                output = model(x_batch).squeeze(-1)
                loss = criterion(output, y_batch)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_losses.append(loss.item())

        # Validation
        model.eval()
        val_losses, preds, targets = [], [], []
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                pred = model(x_val).squeeze(-1)
                loss = criterion(pred, y_val)
                val_losses.append(loss.item())
                preds.extend(pred.cpu().tolist())
                targets.extend(y_val.cpu().tolist())

        val_loss = np.mean(val_losses)
        print(f"Epoch {epoch+1}, Train Loss: {np.mean(train_losses):.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Keep the predictions that belong to the checkpoint being saved.
            best_preds, best_targets = preds, targets
            wait = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            wait += 1
            if wait >= patience:
                print("Early stopping")
                break

    if not best_targets:
        # Happens with epochs=0 or an empty validation loader.
        print("No validation predictions were produced; skipping final metrics.")
        return

    print("Final MAE:", mean_absolute_error(best_targets, best_preds))
    # np.sqrt(MSE) instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    print("Final RMSE:", np.sqrt(mean_squared_error(best_targets, best_preds)))


In [None]:
import torch
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
import numpy as np

# Select the compute device: CUDA when present, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Cached features and labels written by the feature-extraction step.
full_dataset = CachedAudioDataset("features.npy", "labels.npy")

# Hold out 20% of the sample indices for validation; the seed fixes the split.
all_indices = np.arange(len(full_dataset))
train_idx, val_idx = train_test_split(all_indices, test_size=0.2, random_state=42)
train_set, val_set = Subset(full_dataset, train_idx), Subset(full_dataset, val_idx)

# Both loaders share batch size 32 and num_workers=0 (kept at 0 for stability,
# especially on Windows); only the training loader shuffles.
loader_kwargs = dict(batch_size=32, num_workers=0)
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_set, shuffle=False, **loader_kwargs)

# Build the regression head on the selected device.
model = RegressionModel().to(device)

# Short 5-epoch run first, to check training speed.
train_model(model, train_loader, val_loader, epochs=5)


In [None]:
# Directory with the test audio; only names ending in ".wav" are used, sorted.
test_audio_dir = r"C:\Users\SIDDHARTH JAIN\OneDrive\Desktop\SHL Project\test_audio"
test_files = sorted(name for name in os.listdir(test_audio_dir) if name.endswith(".wav"))

# One pooled embedding per test file, in the same order as `test_files`.
test_features = []

for fname in tqdm(test_files):
    wav_path = os.path.join(test_audio_dir, fname)
    waveform, _rate = librosa.load(wav_path, sr=16000)
    batch = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
    # Inference only: no gradients are tracked while running the encoder.
    with torch.no_grad():
        hidden = wav2vec_model(**batch.to(device)).last_hidden_state
        pooled = hidden.mean(dim=1).squeeze()
        vector = pooled.cpu().numpy()
    test_features.append(vector)

np.save("features_test.npy", test_features)
print(" Saved features_test.npy")


In [None]:
# Load the cached test features written by the extraction step.
test_features = np.load("features_test.npy")
test_tensor = torch.tensor(test_features, dtype=torch.float32)

# Batched, order-preserving loader over the test features.
test_dataset = torch.utils.data.TensorDataset(test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Inference runs on the CPU; map_location lets a checkpoint that was saved
# from a GPU run load here. "best_model.pth" must already exist (it is written
# during training).
device = torch.device("cpu")
model = RegressionModel()
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.to(device)
model.eval()

# Predict in batches; shuffle=False keeps predictions aligned with the
# order of the feature rows.
predictions = []
with torch.no_grad():
    for (x_batch,) in test_loader:
        batch_scores = model(x_batch.to(device)).squeeze(-1)
        predictions.extend(batch_scores.tolist())

# Pair each prediction with its own file name. (Previously the whole
# `test_files` list was appended for every sample, so every CSV row
# contained the full list instead of a single name.)
if test_files:
    if len(test_files) != len(predictions):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(test_files)} test files; "
            "re-run the test feature extraction so both are in sync."
        )
    filenames = list(test_files)
else:
    filenames = [f"sample_{i}.wav" for i in range(len(predictions))]

# Save as CSV
results_df = pd.DataFrame({
    "filename": filenames,
    "predicted_score": predictions
})
results_df.to_csv("predicted_scores.csv", index=False)
print(" Saved predicted_scores.csv")

