In [None]:
pip install -U openai-whisper

In [None]:
pip install language_tool_python

In [None]:
# Grammar Scoring Engine for Voice Samples
# Import Necessary Libraries
import os
import pandas as pd
import numpy as np
import whisper
from transformers import BertTokenizer, BertModel
import torch
import language_tool_python
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from google.colab import drive
from google.colab import files

In [None]:
# Mount Google Drive at /content/drive so the paths below resolve
drive.mount('/content/drive')

# Paths
# Every dataset artefact lives under one Drive folder; change DATA_ROOT to
# point the whole notebook at a different copy of the data.
DATA_ROOT = "/content/drive/My Drive/Grammar Scoring"
TRAIN_AUDIO_DIR = os.path.join(DATA_ROOT, "audios_train")
TEST_AUDIO_DIR = os.path.join(DATA_ROOT, "audios_test")
TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
TEST_CSV = os.path.join(DATA_ROOT, "test.csv")

# Step 1: Speech-to-text model (Whisper "base" checkpoint)
whisper_model = whisper.load_model("base")

# Step 2: Grammar checker (US English) and BERT tokenizer/encoder.
# The tokenizer and the encoder are loaded from the same checkpoint name.
grammar_tool = language_tool_python.LanguageTool("en-US")
BERT_CHECKPOINT = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to convert audio to text
def audio_to_text(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    Parameters
    ----------
    audio_path : str
        Path to the audio file handed to ``whisper_model.transcribe``.

    Returns
    -------
    The ``"text"`` entry of the transcription result.
    """
    # FP16 is explicitly disabled for the transcription call.
    return whisper_model.transcribe(audio_path, fp16=False)["text"]

# Function to extract grammar features using LanguageTool
def get_grammar_features(text):
    """Return the number of matches LanguageTool reports for ``text``.

    Parameters
    ----------
    text : str
        Transcript to check with the module-level ``grammar_tool``.

    Returns
    -------
    int
        Length of the match list returned by ``grammar_tool.check``.
    """
    return len(grammar_tool.check(text))

# Function to extract BERT embeddings
def get_bert_embeddings(text):
    """Encode ``text`` with BERT and return the first-token ([CLS]) vector.

    Parameters
    ----------
    text : str
        Transcript to embed; tokenization truncates at 128 tokens.

    Returns
    -------
    numpy.ndarray
        The last-layer hidden state at token position 0, squeezed and
        converted to a NumPy array.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0, :]  # position 0 is the CLS token
    return cls_vector.squeeze().numpy()

# Step 3: Process training data
def extract_features(filenames, audio_dir, heading):
    """Transcribe each audio file and build one feature vector per file.

    The same routine is used for the training and the test split so both
    are guaranteed to go through identical feature extraction.

    Parameters
    ----------
    filenames : iterable of str
        File names relative to ``audio_dir``.
    audio_dir : str
        Directory that contains the audio files.
    heading : str
        Line printed once before the per-file transcripts.

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per file: the grammar match count from
        ``get_grammar_features`` followed by the vector from
        ``get_bert_embeddings``.
    """
    print(heading)
    feature_rows = []
    for idx, filename in enumerate(filenames, 1):
        text = audio_to_text(os.path.join(audio_dir, filename))

        # Print immediately after transcription so progress is visible
        print(f"{idx}. {filename}: '{text}'")

        # Grammar count first, then the BERT embedding
        feature_rows.append(
            np.concatenate([[get_grammar_features(text)], get_bert_embeddings(text)])
        )
    return feature_rows


train_df = pd.read_csv(TRAIN_CSV)
train_features = extract_features(
    train_df["filename"],
    TRAIN_AUDIO_DIR,
    "Transcribed texts from training audio files:",
)

# Prepare training data
X_train = np.array(train_features)
y_train = train_df["label"].values

# Scale features; the scaler is fitted on the training split only and
# reused unchanged for the test split below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Step 4: Train XGBoost model
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

# Step 5: Process test data and predict
test_df = pd.read_csv(TEST_CSV)
test_features = extract_features(
    test_df["filename"],
    TEST_AUDIO_DIR,
    "\nTranscribed texts from test audio files:",
)

# Scale test features with the statistics learned from the training split
X_test_scaled = scaler.transform(np.array(test_features))

# Predict
y_test_pred = xgb_model.predict(X_test_scaled)

# Round predictions to nearest 0.5 and ensure within 1.0 to 5.0
y_test_pred_rounded = np.clip(np.round(y_test_pred * 2) / 2, 1.0, 5.0)  # Multiply by 2, round, divide by 2 for 0.5 steps

In [None]:
# Step 6: Write the predictions to a new CSV file and offer it for download
submission_path = "submission.csv"
submission_df = pd.DataFrame(
    {
        "filename": test_df["filename"],
        "label": y_test_pred_rounded,
    }
)
submission_df.to_csv(submission_path, index=False)
print("\nSubmission saved to submission.csv")
files.download(submission_path)

In [None]:
# Step 4.1: Evaluation of model performance on training data
# NOTE: these are in-sample metrics (the model is scored on the data it was
# fitted on), so they do not measure generalization.
SCORE_CLASSES = [1.0, 2.0, 3.0, 4.0, 5.0]

y_train_pred = xgb_model.predict(X_train_scaled)

# Bin to whole scores with round-half-up. np.round rounds halves to the
# nearest even integer (2.5 -> 2 but 3.5 -> 4), which would bin half-step
# scores inconsistently.
y_train_pred_rounded = np.clip(np.floor(y_train_pred + 0.5), 1.0, 5.0)
y_train_rounded = np.clip(np.floor(y_train + 0.5), 1.0, 5.0)  # Bin actual labels for classification metrics

print("\nModel Performance on Training Data")
print("Classification Report:")
# Pass labels explicitly: with only target_names, classification_report raises
# ValueError whenever fewer than five distinct scores occur in the data.
print(classification_report(
    y_train_rounded,
    y_train_pred_rounded,
    labels=SCORE_CLASSES,
    target_names=[f"Score {i}" for i in range(1, 6)],
    zero_division=0,  # report 0 instead of warning for scores that never occur
))
print("Confusion Matrix:")
# Fixed label order keeps the matrix 5x5 regardless of which scores occur
print(confusion_matrix(y_train_rounded, y_train_pred_rounded, labels=SCORE_CLASSES))
mse = mean_squared_error(y_train_rounded, y_train_pred_rounded)
print(f"\nMean Squared Error: {mse:.4f}")