In [None]:
# Upgrade SciPy before anything imports it; `pearsonr` from scipy.stats is imported in the next cell.
!pip install --upgrade scipy

In [None]:
from scipy.stats import pearsonr

#  Grammar Scoring Engine 
#  Competition Solution
# Combines Whisper transcription, linguistic features, and XGBoost regression

In [None]:

# Installing dependencies
# Whisper is installed straight from the GitHub repository (no version pin).
!pip install -qU git+https://github.com/openai/whisper.git
# Remaining Python packages used by the imports below (also unpinned).
!pip install -q language-tool-python spacy pandas xgboost matplotlib seaborn scipy
# Small English spaCy pipeline; FeatureExtractor loads it by the name "en_core_web_sm".
!python -m spacy download -q en_core_web_sm
# Java runtime -- presumably required by language-tool-python's LanguageTool backend; TODO confirm.
!sudo apt-get install -qq default-jre


import os
import torch
import whisper
import spacy
import pandas as pd
import numpy as np
import language_tool_python
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score  # Added missing import
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Report whether PyTorch can see a CUDA device and, if so, the name of device 0
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU device: {torch.cuda.get_device_name(0)}")




# Step 1: Robust Audio Transcription

In [None]:

class AudioTranscriber:
    """Speech-to-text helper backed by the Whisper "base" checkpoint.

    The model is moved to the GPU when CUDA is available and stays on the CPU
    otherwise; half precision (``fp16``) is requested only in the GPU case.
    """

    def __init__(self):
        has_cuda = torch.cuda.is_available()
        self.device = "cuda" if has_cuda else "cpu"
        self.model = whisper.load_model("base").to(self.device)
        self.fp16 = self.device == "cuda"

    def transcribe(self, audio_path):
        """Return the transcript text for ``audio_path``.

        Any exception raised while transcribing is reported on stdout and
        converted into an empty string, so a single bad file does not abort
        the caller's loop. A result without a ``"text"`` entry also yields "".
        """
        try:
            decode_options = {"fp16": self.fp16, "verbose": None}
            output = self.model.transcribe(audio_path, **decode_options)
            return output.get("text", "")
        except Exception as e:
            file_name = os.path.basename(audio_path)
            print(f"Transcription failed for {file_name}: {str(e)}")
            return ""

# Shared module-level instance; DataProcessor.process() calls transcriber.transcribe() on it.
transcriber = AudioTranscriber()



# Step 2: Comprehensive Feature Engineering

In [None]:

class FeatureExtractor:
    """Turn a transcript into a flat dict of numeric text and grammar features.

    The returned dict always has the same ten keys in the same order
    (``text_length``, ``word_count``, ``grammar_errors``, ``error_rate``,
    ``sentence_count``, ``avg_sentence_length``, ``noun_ratio``,
    ``verb_ratio``, ``punctuation_ratio``, ``unique_word_ratio``), so rows
    built from it line up as DataFrame columns.
    """

    # Values reported when the grammar checker is skipped (empty text) or fails.
    _GRAMMAR_DEFAULTS = {'grammar_errors': 0, 'error_rate': 0}

    # Values reported when the spaCy pass is skipped (fewer than two words) or fails.
    _LINGUISTIC_DEFAULTS = {
        'sentence_count': 0,
        'avg_sentence_length': 0,
        'noun_ratio': 0,
        'verb_ratio': 0,
        'punctuation_ratio': 0,
        'unique_word_ratio': 0,
    }

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.tool = language_tool_python.LanguageTool('en-US')

    def extract(self, text):
        """Compute all features for ``text``.

        Args:
            text: Transcript string; surrounding whitespace is stripped first.

        Returns:
            dict mapping feature name to a number. A failure in the grammar
            checker or in spaCy is reported on stdout and replaced by zeros
            for that feature group; it never raises for ordinary errors.
        """
        text = text.strip()
        # Whitespace-split word count; it is the denominator of every ratio below.
        word_count = len(text.split())

        features = {'text_length': len(text), 'word_count': word_count}
        features.update(self._grammar_features(text, word_count))
        features.update(self._linguistic_features(text, word_count))
        return features

    def _grammar_features(self, text, word_count):
        """Count LanguageTool matches and normalise them by word count."""
        if word_count == 0:
            return dict(self._GRAMMAR_DEFAULTS)
        try:
            error_count = len(self.tool.check(text))
        except Exception as e:
            # Best effort: keep going with zeros, but make the failure visible.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print(f"Grammar check failed, using zero defaults: {e}")
            return dict(self._GRAMMAR_DEFAULTS)
        return {
            'grammar_errors': error_count,
            'error_rate': error_count / word_count,
        }

    def _linguistic_features(self, text, word_count):
        """Derive sentence and token-level ratios from a spaCy parse."""
        if word_count <= 1:
            return dict(self._LINGUISTIC_DEFAULTS)
        try:
            doc = self.nlp(text)
            # Materialise the sentences once and reuse the count.
            sentence_count = len(list(doc.sents))
            return {
                'sentence_count': sentence_count,
                'avg_sentence_length': word_count / max(1, sentence_count),
                'noun_ratio': sum(1 for t in doc if t.pos_ == 'NOUN') / word_count,
                'verb_ratio': sum(1 for t in doc if t.pos_ == 'VERB') / word_count,
                'punctuation_ratio': sum(1 for t in doc if t.is_punct) / word_count,
                # Distinct token texts (case-sensitive, punctuation included)
                # over the whitespace word count, so this can exceed 1.
                'unique_word_ratio': len(set(t.text for t in doc)) / word_count,
            }
        except Exception as e:
            print(f"Linguistic analysis failed, using zero defaults: {e}")
            return dict(self._LINGUISTIC_DEFAULTS)

# Shared module-level instance; DataProcessor.process() calls feature_extractor.extract() on it.
feature_extractor = FeatureExtractor()



#  Step 3: Data Pipeline

In [None]:

class DataProcessor:
    """Build a feature table from a CSV of audio filenames.

    Relies on the module-level ``transcriber`` and ``feature_extractor``
    instances to turn each audio file into a feature dict.
    """

    def __init__(self):
        # Created here but not used by process() or empty_features().
        self.scaler = StandardScaler()

    def process(self, csv_path, audio_dir, is_train=True):
        """Transcribe every file listed in ``csv_path`` and extract features.

        Args:
            csv_path: CSV with a ``filename`` column and, when ``is_train`` is
                true, a ``label`` column.
            audio_dir: Directory that the filenames are joined onto.
            is_train: When true, copy ``label`` into each row and drop rows
                whose transcript is empty (``word_count == 0``).

        Returns:
            pandas.DataFrame of features with NaNs replaced by 0. With
            ``is_train=False`` there is exactly one row per CSV row, in CSV
            order; with ``is_train=True`` empty-transcript rows are removed.
        """
        df = pd.read_csv(csv_path)
        features = []

        for idx, row in df.iterrows():
            audio_path = os.path.join(audio_dir, row['filename'])
            try:
                text = transcriber.transcribe(audio_path)
                feat = feature_extractor.extract(text)
                if is_train:
                    feat['label'] = row['label']
                features.append(feat)
            except Exception as e:
                print(f"Skipping {row['filename']}: {str(e)}")
                # Keep a placeholder so test-mode output stays aligned with the CSV.
                features.append(self.empty_features(is_train))

            # Memory management: release cached GPU blocks every tenth index.
            if idx % 10 == 0 and torch.cuda.is_available():
                torch.cuda.empty_cache()

        features_df = pd.DataFrame(features).fillna(0)
        if is_train:
            # Rows without any transcribed words are not used for training.
            features_df = features_df[features_df['word_count'] > 0]

        return features_df

    def empty_features(self, is_train):
        """Return an all-zero feature row for a file that could not be processed.

        The keys and their order match what ``feature_extractor.extract``
        produces, so placeholder rows do not introduce extra columns. In
        training mode a placeholder ``label`` of 1.0 is added; such rows have
        ``word_count == 0`` and are dropped again by ``process``.
        """
        base = {
            'text_length': 0,
            'word_count': 0,
            'grammar_errors': 0,
            'error_rate': 0,
            'sentence_count': 0,
            'avg_sentence_length': 0,
            'noun_ratio': 0,
            'verb_ratio': 0,
            'punctuation_ratio': 0,
            'unique_word_ratio': 0
        }
        if is_train:
            # Must use the same key as process(); a different key would add a
            # stray column that ends up in the training feature matrix.
            base['label'] = 1.0
        return base




# Step 4: Model Training

In [None]:

class GrammarScorer:
    """XGBoost regressor wrapper with training and evaluation helpers."""

    def __init__(self):
        self.model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=300,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            # XGBoost 2.x selects the accelerator with `device`; the older
            # `gpu_hist` tree method is deprecated and `predictor` was removed.
            tree_method='hist',
            device='cuda' if torch.cuda.is_available() else 'cpu',
            # Early stopping is configured on the estimator: recent XGBoost
            # releases no longer accept `early_stopping_rounds` in fit().
            early_stopping_rounds=20,
            random_state=42
        )

    def train(self, X_train, y_train, X_val, y_val):
        """Fit the model, using ``(X_val, y_val)`` as the early-stopping set.

        Training stops once the validation metric has not improved for 20
        rounds (see ``early_stopping_rounds`` in ``__init__``).
        """
        self.model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

    def evaluate(self, X, y):
        """Print and return ``(pearson, mse, r2)`` for predictions on ``X``.

        Args:
            X: Feature matrix accepted by the fitted model.
            y: True target values aligned with ``X``.
        """
        pred = self.model.predict(X)
        pearson, _ = pearsonr(y, pred)
        mse = mean_squared_error(y, pred)
        r2 = r2_score(y, pred)
        print(f"Pearson: {pearson:.3f} | MSE: {mse:.3f} | R²: {r2:.3f}")
        return pearson, mse, r2




#  Main Execution Flow

In [None]:

# Build the two pipeline objects used by the rest of the notebook
processor = DataProcessor()
scorer = GrammarScorer()

# Transcribe the training audio and turn it into a feature table
print("Processing training data...")
train_df = processor.process(
    csv_path='/kaggle/input/shl-intern-hiring-assessment/dataset/train.csv',
    audio_dir='/kaggle/input/shl-intern-hiring-assessment/dataset/audios_train',
)

# Separate the target from the feature columns, then hold out 20% for validation
y = train_df['label']
X = train_df.drop(columns=['label'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the regressor, with the held-out split as its evaluation set
print("\nTraining model...")
scorer.train(X_train, y_train, X_val, y_val)

# Report Pearson / MSE / R² on the held-out split
print("\nValidation performance:")
scorer.evaluate(X_val, y_val)




#  Visualization & Interpretation

In [None]:

# Rank the training features by the importance scores of the fitted model
importance = (
    pd.DataFrame({'feature': X.columns, 'importance': scorer.model.feature_importances_})
    .sort_values('importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=importance, ax=ax)
ax.set_title('Feature Importance Analysis')
plt.show()

# Histogram (with KDE overlay) of the scores predicted for the validation split
val_pred = scorer.model.predict(X_val)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(x=val_pred, bins=20, kde=True, ax=ax)
ax.set_title('Predicted Score Distribution')
ax.set_xlabel('label')
plt.show()




# Generating Submission

In [None]:

print("Processing test data...")
# Single source of truth for the test CSV; it is read once for features and
# once for the filename column of the submission.
test_csv_path = '/kaggle/input/shl-intern-hiring-assessment/dataset/test.csv'
test_df = processor.process(
    test_csv_path,
    '/kaggle/input/shl-intern-hiring-assessment/dataset/audios_test',
    is_train=False
)

# Predicting and formatting
# Give the model exactly the columns it was trained on, in the same order;
# a column missing from the test frame is filled with 0 and extras are dropped.
test_features = test_df.reindex(columns=X.columns, fill_value=0)
test_pred = scorer.model.predict(test_features)
submission = pd.DataFrame({
    # process(is_train=False) keeps one row per CSV row in CSV order, so the
    # predictions line up positionally with the filenames.
    'filename': pd.read_csv(test_csv_path)['filename'],
    # Predictions are clamped to the 1.0-5.0 range before being written out.
    'label': np.clip(test_pred, 1.0, 5.0)
})

# Saving
submission.to_csv('submission.csv', index=False)
print("Submission file saved!")

# Cleanup
if torch.cuda.is_available():
    torch.cuda.empty_cache()