In [None]:
pip install -U openai-whisper

In [None]:
pip install language_tool_python

In [None]:
####################################
### The Base Model (GPU Version) ###
####################################

# Import Necessary Libraries
import os
import numpy as np
import pandas as pd
import cupy as cp
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error
import language_tool_python
import torch
import whisper
from transformers import BertTokenizer, BertModel
import xgboost as xgb
from google.colab import drive
from google.colab import files

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
# Later cells read `device` to place Whisper/BERT and to configure XGBoost.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Mount Google Drive so the dataset stored there is readable from the Colab VM
drive.mount('/content/drive')

# Paths
# Every dataset artifact lives under one Drive folder; keeping that folder in a
# single constant means relocating the data is a one-line change.
DATA_ROOT = "/content/drive/My Drive/Grammar Scoring"
TRAIN_AUDIO_DIR = f"{DATA_ROOT}/audios_train"  # directory of training audio clips
TEST_AUDIO_DIR = f"{DATA_ROOT}/audios_test"    # directory of test audio clips
TRAIN_CSV = f"{DATA_ROOT}/train.csv"           # read later for "filename" and "label" columns
TEST_CSV = f"{DATA_ROOT}/test.csv"             # read later for the "filename" column

In [None]:
# Step 1: Load the Whisper "base" checkpoint for speech-to-text and place it
# on the selected device. A load failure is reported and then re-raised,
# because nothing downstream can run without the model.
try:
    whisper_model = whisper.load_model("base")
    whisper_model = whisper_model.to(device)
except Exception as load_error:
    print(f"Error loading Whisper model: {load_error}")
    raise

In [None]:
# Step 2: Set up the text-analysis tools.
#   - grammar_tool:   LanguageTool checker for US English
#   - bert_tokenizer: tokenizer matching the bert-base-uncased checkpoint
#   - bert_model:     BERT encoder, moved onto the selected device
grammar_tool = language_tool_python.LanguageTool("en-US")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model = bert_model.to(device)

In [None]:
# Function to convert audio to text (GPU-accelerated Whisper)
def audio_to_text(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcript with surrounding whitespace removed, or "" when
        transcription raised or produced only whitespace. Callers test the
        result with ``if not text`` to skip failed clips, so a blank
        transcript must come back as the empty string.
    """
    try:
        # Half precision is only requested when running on CUDA.
        result = whisper_model.transcribe(audio_path, fp16=(device.type == "cuda"))
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return ""

    # A missing/None "text" entry and a whitespace-only transcript both
    # collapse to "" so the caller's emptiness check treats them as failures.
    transcript = result.get("text") if isinstance(result, dict) else None
    return (transcript or "").strip()

In [None]:
# Grammar feature extraction (runs through the LanguageTool checker)
def get_grammar_features(text):
    """Count the issues LanguageTool reports for ``text``.

    Args:
        text: The transcript to check.

    Returns:
        The number of matches returned by ``grammar_tool.check``; 0 when the
        check raises (the error is printed, not propagated).
    """
    try:
        issue_count = len(grammar_tool.check(text))
    except Exception as err:
        print(f"Error checking grammar: {err}")
        issue_count = 0
    return issue_count

In [None]:
# BERT sentence embedding (runs on the selected device)
def get_bert_embeddings(text):
    """Embed ``text`` as the BERT [CLS] vector of the last hidden layer.

    Args:
        text: The transcript to embed; it is truncated to 128 tokens.

    Returns:
        A NumPy array holding the position-0 hidden state with singleton
        dimensions squeezed away. If anything raises, the error is printed
        and a zero vector of length 768 is returned instead.
    """
    try:
        encoded = bert_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )
        encoded = encoded.to(device)
        with torch.no_grad():  # inference only, no autograd bookkeeping
            model_out = bert_model(**encoded)
        cls_vector = model_out.last_hidden_state[:, 0, :]
        return cls_vector.squeeze().cpu().numpy()
    except Exception as err:
        print(f"Error generating BERT embeddings: {err}")
        return np.zeros(768)  # same length as a bert-base-uncased hidden state

In [None]:
# Step 3: Build the training features, one row per transcribable clip
train_df = pd.read_csv(TRAIN_CSV)
train_features = []  # feature rows, in the order the clips were processed
valid_indices = []   # positional row indices in train_df that produced a feature row

print("Transcribed texts from training audio files:")
for idx, filename in enumerate(train_df["filename"]):
    text = audio_to_text(os.path.join(TRAIN_AUDIO_DIR, filename))

    if text:
        print(f"{idx + 1}. {filename}: '{text}'")

        # Feature row = [grammar issue count] followed by the BERT embedding
        features = np.concatenate([[get_grammar_features(text)], get_bert_embeddings(text)])
        train_features.append(features)
        valid_indices.append(idx)  # remembered so labels can be aligned later
    else:
        # An empty transcript means the clip could not be used; leave it out
        print(f"{idx + 1}. {filename}: 'Transcription failed'")
    

In [None]:
# Stack the per-clip feature rows into a matrix and pull the matching labels.
# Labels are selected by the recorded positions so that skipped clips do not
# shift the alignment between features and targets.
X_train = np.array(train_features)
y_train = train_df.iloc[valid_indices]["label"].values

if X_train.shape[0] == 0:
    raise ValueError("No valid training samples were processed. Check audio files or transcription process.")

In [None]:
# Standardize the features with scikit-learn (NumPy input). The fitted
# `scaler` is kept because the test features are transformed with it later.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
# Step 4: Fit an XGBoost regressor on the scaled training features
_on_gpu = device.type == "cuda"  # mirrors the device chosen for Whisper/BERT
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.1,
    "random_state": 42,
    "tree_method": "hist" if _on_gpu else "auto",
    "device": "cuda" if _on_gpu else "cpu",
}
xgb_model = xgb.XGBRegressor(**xgb_params)
try:
    xgb_model.fit(X_train_scaled, y_train)
except Exception as fit_error:
    # Report the failure, then re-raise: later cells need a fitted model
    print(f"Error training XGBoost: {fit_error}")
    raise

In [None]:
# Step 4.1: Evaluation of model performance on training data
# Predictions and labels are rounded to whole scores and clipped to [1, 5] so
# the regression output can be summarized with classification metrics.
y_train_pred = xgb_model.predict(X_train_scaled)
y_train_pred_rounded = np.clip(np.round(y_train_pred), 1.0, 5.0)  # Round to nearest integer and clip
y_train_rounded = np.clip(np.round(y_train), 1.0, 5.0)  # Round actual labels for classification metrics

# Fix the class list explicitly. Without `labels`, classification_report
# infers the classes from the data and raises when fewer than five distinct
# scores occur, because five target_names are supplied.
score_classes = [float(score) for score in range(1, 6)]

print("\nModel Performance on Training Data")
print("Classification Report:")
print(classification_report(
    y_train_rounded,
    y_train_pred_rounded,
    labels=score_classes,
    target_names=[f"Score {i}" for i in range(1, 6)],
    zero_division=0,  # classes with no samples report 0 instead of warning
))
print("Confusion Matrix:")
# Same explicit class list, so the matrix is always 5x5 in score order
print(confusion_matrix(y_train_rounded, y_train_pred_rounded, labels=score_classes))
mse = mean_squared_error(y_train_rounded, y_train_pred_rounded)
print(f"\nMean Squared Error: {mse:.4f}")

In [None]:
# Step 5: Build the test features, one row per transcribable clip
test_df = pd.read_csv(TEST_CSV)
test_features = []         # feature rows, in the order the clips were processed
valid_test_filenames = []  # filenames that produced a feature row

print("\nTranscribed texts from test audio files:")
for idx, filename in enumerate(test_df["filename"]):
    text = audio_to_text(os.path.join(TEST_AUDIO_DIR, filename))

    if text:
        print(f"{idx + 1}. {filename}: '{text}'")

        # Feature row = [grammar issue count] followed by the BERT embedding
        features = np.concatenate([[get_grammar_features(text)], get_bert_embeddings(text)])
        test_features.append(features)
        valid_test_filenames.append(filename)  # keeps predictions tied to their file
    else:
        # An empty transcript means the clip could not be used; leave it out
        print(f"{idx + 1}. {filename}: 'Transcription failed'")
    

In [None]:
# Stack the test feature rows and apply the scaler fitted on the training set
X_test = np.array(test_features)
if X_test.shape[0] == 0:
    raise ValueError("No valid test samples were processed. Check audio files or transcription process.")
X_test_scaled = scaler.transform(X_test)

In [None]:
# Predict
try:
    y_test_pred = xgb_model.predict(X_test_scaled)
    # Round predictions to the nearest 0.5 and clip to [1.0, 5.0].
    # NumPy is used on purpose: CuPy arrays need a CUDA device, which would
    # break this cell on the CPU fallback the rest of the notebook supports,
    # and this is a single elementwise pass over one small array.
    y_test_pred_rounded = np.clip(np.round(np.asarray(y_test_pred) * 2) / 2, 1.0, 5.0)
    print("\nPredicted scores:", y_test_pred_rounded)
except Exception as e:
    print(f"Error predicting with XGBoost: {e}")
    raise

In [None]:
# Step 6: Save submission to new file and make it downloadable
scored_df = pd.DataFrame({"filename": valid_test_filenames, "label": y_test_pred_rounded})

# Re-align onto the full test listing so clips whose transcription failed are
# not silently dropped from the submission. Duplicate filenames are collapsed
# first so the left join cannot multiply rows.
# NOTE(review): assumes the scorer expects one row per row of test.csv - confirm.
submission_df = test_df[["filename"]].merge(
    scored_df.drop_duplicates(subset="filename"),
    on="filename",
    how="left",
)

n_missing = int(submission_df["label"].isna().sum())
if n_missing:
    # Neutral fallback: median predicted score, snapped to the 0.5 grid and
    # clipped to the same [1.0, 5.0] range as the real predictions.
    fallback_score = float(np.clip(np.round(np.median(y_test_pred_rounded) * 2) / 2, 1.0, 5.0))
    print(f"Warning: {n_missing} test file(s) had no prediction; using fallback score {fallback_score}")
    submission_df["label"] = submission_df["label"].fillna(fallback_score)

submission_df.to_csv("submission.csv", index=False)
print("\nSubmission saved to submission.csv")
files.download("submission.csv")  # triggers a browser download in Colab