<a href="https://colab.research.google.com/github/PrajitaB/Availability_Survey/blob/master/Grammar_Scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U openai-whisper



In [None]:
!pip install language_tool_python



In [None]:
import os
import pandas as pd
import numpy as np
import whisper
from transformers import BertTokenizer, BertModel
import torch
import language_tool_python
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from google.colab import drive
from google.colab import files

# --- Environment: expose the Drive-hosted dataset to this runtime ---
drive.mount('/content/drive')

# --- Dataset locations on the mounted Drive ---
TRAIN_AUDIO_DIR = "/content/drive/My Drive/Grammar Scoring/audios_train"
TEST_AUDIO_DIR = "/content/drive/My Drive/Grammar Scoring/audios_test"
TRAIN_CSV = "/content/drive/My Drive/Grammar Scoring/train.csv"
TEST_CSV = "/content/drive/My Drive/Grammar Scoring/test.csv"

# --- Step 1: speech-to-text model (Whisper "base" checkpoint) ---
whisper_model = whisper.load_model("base")

# --- Step 2: grammar checker plus BERT tokenizer/encoder ---
# Tokenizer and encoder are loaded from the same checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
grammar_tool = language_tool_python.LanguageTool("en-US")
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to convert audio to text
def audio_to_text(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``whisper_model.transcribe``.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    # fp16=False explicitly turns off half-precision decoding.
    return whisper_model.transcribe(audio_path, fp16=False)["text"]

# Function to extract grammar features using LanguageTool
def get_grammar_features(text):
    """Count the issues LanguageTool reports for ``text``.

    Args:
        text: The text handed to ``grammar_tool.check``.

    Returns:
        The number of matches returned by the checker (a raw count, not
        normalised by text length).
    """
    return len(grammar_tool.check(text))

# Function to extract BERT embeddings
def get_bert_embeddings(text):
    """Encode ``text`` with BERT and return the first-token (CLS) vector.

    Args:
        text: Input passed to ``bert_tokenizer``; it is truncated to at most
            128 tokens.

    Returns:
        A NumPy array taken from position 0 of the model's last hidden state,
        with size-1 dimensions removed by ``squeeze()``.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    cls_vector = hidden_states[:, 0, :]  # CLS token
    return cls_vector.squeeze().numpy()

# Steps 3-5: build features for both splits with one shared routine, fit the
# scaler and the regressor on the training split, then prepare the test split.


def _extract_features(df, audio_dir, heading):
    """Transcribe every file listed in ``df`` and build one feature vector each.

    The same routine serves the training and the test split so both are
    featurised identically.

    Args:
        df: DataFrame with a ``"filename"`` column naming audio files.
        audio_dir: Directory that the file names are joined onto.
        heading: Line printed once before the per-file transcriptions.

    Returns:
        A list with one 1-D NumPy array per row of ``df``: the grammar error
        count followed by the BERT CLS embedding of the transcription.
    """
    print(heading)
    feature_rows = []
    for idx, filename in enumerate(df["filename"], 1):
        text = audio_to_text(os.path.join(audio_dir, filename))

        # Print immediately after transcription so progress is visible
        print(f"{idx}. {filename}: '{text}'")

        grammar_score = get_grammar_features(text)
        bert_emb = get_bert_embeddings(text)

        # Feature layout: [grammar_error_count, *bert_embedding]
        feature_rows.append(np.concatenate([[grammar_score], bert_emb]))
    return feature_rows


# Step 3: Process training data
train_df = pd.read_csv(TRAIN_CSV)
train_features = _extract_features(
    train_df, TRAIN_AUDIO_DIR, "Transcribed texts from training audio files:"
)

# Prepare training data
X_train = np.array(train_features)
y_train = train_df["label"].values

# Scale features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Step 4: Train XGBoost model
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42
)
xgb_model.fit(X_train_scaled, y_train)

# Step 5: Process test data
test_df = pd.read_csv(TEST_CSV)
test_features = _extract_features(
    test_df, TEST_AUDIO_DIR, "\nTranscribed texts from test audio files:"
)

# Scale test features with the scaler fitted on the training split
X_test_scaled = scaler.transform(np.array(test_features))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Transcribed texts from training audio files:
1. audio_1261.wav: ' 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% 1.5% My favourite hobby is cultivation of plants such as gardening, offers you a rewending the experience with a gardening. I can gain immense of immense of plants to plant and the flowers and herbs to unwaistables.'
2. audio_942.wav: ' The playground looks like very clear and neat as there are a lot of colorful things like basketball, court, for playing or we can do zin zing on swings. Group of friends together play very well and they laugh each other.'
3. audio_1110.wav: ' My Girl is to become an Elecrical Employee and also I am studying the Electronics Engineering to become an Electrical Engineer. hənl Words alleged that you need iceberg visitor, nd we have prepared the topics thoroughly to reach the goals.'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Predict scores for the test split
y_pred = xgb_model.predict(X_test_scaled)

# Snap each prediction onto the 0.5-point grid (double, round, halve), then
# keep the result inside the 1.0 to 5.0 range
half_steps = np.round(y_pred * 2) / 2
y_pred_rounded = np.clip(half_steps, 1.0, 5.0)

# Step 6: write the submission file and trigger the Colab download
submission_path = "submission.csv"
submission_df = pd.DataFrame({"filename": test_df["filename"], "label": y_pred_rounded})
submission_df.to_csv(submission_path, index=False)
print("\nSubmission saved to submission.csv")
files.download(submission_path)


Submission saved to submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Imports for evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
import numpy as np

# Step 4.1: Evaluate on Training Data
# NOTE: these are in-sample metrics (the model has already seen this data), so
# they measure fit, not generalisation.

# Predict on training data and clip to the [1.0, 5.0] score range. The result is
# kept under its own name so the test-set predictions in `y_pred` are not
# overwritten.
y_train_pred = np.clip(xgb_model.predict(X_train_scaled), 1.0, 5.0)

# Discretize on the 0.5-point grid, using the same rule as the submission
# (round(2 * score)). Classes are encoded as the integer 2 * score, so codes
# 2..10 stand for scores 1.0..5.0.
# Why not round to whole scores: np.round rounds ties to the nearest even
# integer (2.5 -> 2, 3.5 -> 4, 4.5 -> 4), so half-point labels sit exactly on
# the bin edges and predictions a hair away from the label fall into a
# different integer class even when the regression error is tiny.
# NOTE(review): assumes labels lie on a 0.5-point grid - confirm against train.csv.
SCORE_CODES = list(range(2, 11))
SCORE_NAMES = [f"Score {code / 2:.1f}" for code in SCORE_CODES]

y_train_discrete = np.clip(np.round(y_train * 2), 2, 10).astype(int)
y_train_pred_discrete = np.clip(np.round(y_train_pred * 2), 2, 10).astype(int)

print("Model Training Performance")

# Classification Report
# `labels` pins the full class list so `target_names` always lines up, even when
# a class is absent from the data; zero_division=0 reports such classes as 0
# instead of emitting warnings.
print("\nClassification Report:")
print(
    classification_report(
        y_train_discrete,
        y_train_pred_discrete,
        labels=SCORE_CODES,
        target_names=SCORE_NAMES,
        zero_division=0,
    )
)

# Confusion Matrix (rows = true class, columns = predicted class, in SCORE_CODES order)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_train_discrete, y_train_pred_discrete, labels=SCORE_CODES)
print(cm)

# Regression Metric (MSE) for reference, computed on the continuous predictions
mse = mean_squared_error(y_train, y_train_pred)
print(f"\nMean Squared Error: {mse:.4f}")

Model Training Performance

Classification Report:
              precision    recall  f1-score   support

     Score 1       1.00      1.00      1.00         1
     Score 2       1.00      0.80      0.89       113
     Score 3       0.69      1.00      0.81        87
     Score 4       1.00      0.65      0.79       133
     Score 5       0.79      1.00      0.88       110

    accuracy                           0.84       444
   macro avg       0.90      0.89      0.87       444
weighted avg       0.89      0.84      0.84       444


Confusion Matrix:
[[  1   0   0   0   0]
 [  0  90  23   0   0]
 [  0   0  87   0   0]
 [  0   0  17  87  29]
 [  0   0   0   0 110]]

Mean Squared Error: 0.0045
