In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [40]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, cohen_kappa_score, r2_score
from sentence_transformers import SentenceTransformer

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import optuna
import language_tool_python

In [5]:
from imblearn.over_sampling import SMOTE
from textstat import flesch_kincaid_grade
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
import textstat
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import RobertaTokenizer, RobertaModel

In [7]:
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [8]:
nltk.data.path.append(os.path.abspath('nltk_data'))

In [9]:
nltk.download('punkt_tab', download_dir='nltk_data')
nltk.download('omw-1.4', download_dir='nltk_data')
nltk.download('stopwords' , download_dir='nltk_data')
nltk.download('wordnet' , download_dir='nltk_data')
nltk.download('averaged_perceptron_tagger_eng' , download_dir='nltk_data')

[nltk_data] Downloading package punkt_tab to nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

---


In [10]:
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t', encoding='latin1')
    df = df[['essay_id', 'essay_set', 'essay', 'domain1_score']].dropna()
    return df

In [11]:
    essays = load_data('../Dataset/asap-aes/training_set_rel3.tsv')

- essay_id: A unique identifier for each individual student essay
- essay_set: 1-8, an id for each set of essays
- essay: The ascii text of a student's response
- rater1_domain1: Rater 1's domain 1 score; all essays have this
- rater2_domain1: Rater 2's domain 1 score; all essays have this
- rater3_domain1: Rater 3's domain 1 score; only some essays in set 8 have this.
- domain1_score: Resolved score between the raters; all essays have this
- rater1_domain2: Rater 1's domain 2 score; only essays in set 2 have this
- rater2_domain2: Rater 2's domain 2 score; only essays in set 2 have this
- domain2_score: Resolved score between the raters; only essays in set 2 have this
- rater1_trait1 score - rater3_trait6 score: trait scores for sets 7-8


---


# Preprocess


In [12]:
class EssayPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english') )
        self.lemmatizer = WordNetLemmatizer()
        self.grammar_tool = language_tool_python.LanguageTool('en-US')
        self.sbert = SentenceTransformer('all-mpnet-base-v2')
        
    def preprocess_text(self, text):
        # Basic cleaning
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d+', '', text)
        
        # Tokenization and stopword removal
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if t not in self.stop_words]
        
        # Lemmatization with POS tagging
        pos_tags = nltk.pos_tag(tokens)
        lemmatized = []
        for word, tag in pos_tags:
            if tag.startswith('V'):  # Verb
                lemmatized.append(self.lemmatizer.lemmatize(word, 'v'))
            elif tag.startswith('J'):  # Adjective
                lemmatized.append(self.lemmatizer.lemmatize(word, 'a'))
            elif tag.startswith('R'):  # Adverb
                lemmatized.append(self.lemmatizer.lemmatize(word, 'r'))
            else:  # Noun
                lemmatized.append(self.lemmatizer.lemmatize(word))
                
        return ' '.join(lemmatized)
    
    def extract_linguistic_features(self, text):
    # Grammar and spelling
        matches = self.grammar_tool.check(text)
        grammar_errors = len(matches)
        
        # Readability
        readability = flesch_kincaid_grade(text)
        
        
        # Vocabulary richness
        words = word_tokenize(text)
        unique_words = set(words)
        vocab_richness = len(unique_words) / max(1, len(words))
        
        # Essay structure
        sentences = nltk.sent_tokenize(text)
        avg_sentence_len = sum(len(word_tokenize(s)) for s in sentences) / max(1, len(sentences))
        
        return {
            'grammar_errors': grammar_errors,
            'readability': readability,
            'vocab_richness': vocab_richness,
            'avg_sentence_len': avg_sentence_len
        }
    
    def get_sbert_embedding(self, text):
        return self.sbert.encode(text, show_progress_bar=False)

In [13]:
preprocessor = EssayPreprocessor()

In [14]:
preprocessor

<__main__.EssayPreprocessor at 0x1b0e1e77eb0>

In [15]:
essays['processed_text'] = essays['essay'].apply(preprocessor.preprocess_text)
essays['processed_text']


0        dear local newspaper think effect computer peo...
1        dear cap cap believe use computer benefit u ma...
2        dear cap cap cap people use computer everyone ...
3        dear local newspaper cap find many expert say ...
4        dear location know computer positive effect pe...
                               ...                        
12971    story mother daughter either enemy friends cap...
12972    never understood meaning laughter short distan...
12973    laugh cap habit cap cause cause laugh cap even...
12974    trippin fence num year young short num year ev...
12975    many people believe laughter improve life laug...
Name: processed_text, Length: 12976, dtype: object

In [16]:
linguistic_features = essays['processed_text'].apply(preprocessor.extract_linguistic_features)
linguistic_df =  pd.json_normalize(linguistic_features)

In [32]:
print("Generating SBERT embeddings...")
sbert_embeddings = np.array(essays['processed_text'].apply(preprocessor.get_sbert_embedding).tolist())
sbert_df = pd.DataFrame(sbert_embeddings)

Generating SBERT embeddings...


In [33]:
X = pd.concat([linguistic_df, sbert_df], axis=1)
y = essays['domain1_score']

In [34]:
class AESModel(nn.Module):
    def __init__(self, input_dim):
        super(AESModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.bn1 = nn.BatchNorm1d(256)
        self.drop1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(256, 128)
        self.bn2 = nn.BatchNorm1d(128)
        self.drop2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(128, 64)
        self.fc_out = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.drop1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.drop2(x)
        x = self.fc3(x)
        x = self.relu(x)
        return self.fc_out(x)

In [35]:
def train_model(X_train, y_train, params):
    # Convert to tensors
    X_train_tensor = torch.FloatTensor(X_train.values)
    y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1)
    
    # Create dataset and loader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    
    # Initialize model
    model = AESModel(input_dim=X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
    
    # Training loop
    for epoch in range(params['epochs']):
        model.train()
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    
    return model

In [None]:

def evaluate_model(model, X_test, y_test):
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.FloatTensor(X_test.values)
        predictions = model(X_test_tensor).squeeze().numpy()

    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)
    
    return {
        'predictions': predictions,
        'mse': mse,
        'rmse': rmse,
        'r2': r2  
    }

In [37]:
def objective(trial):
    # Hyperparameters to be optimized
    params = {
        'lr': trial.suggest_float('lr', 1e-5, 1e-3, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32, 64]),
        'epochs': trial.suggest_int('epochs', 20, 100),
        'weight_decay': trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    }
    
    kf = KFold(n_splits=5)
    qwk_scores = []
    
    # Perform K-Fold Cross Validation
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train the model
        model = train_model(X_train, y_train, params)
        
        # Evaluate the model
        metrics = evaluate_model(model, X_val, y_val)
        
        # Collect QWK scores for each fold
        qwk_scores.append(metrics['qwk'])
    
    # Return the mean QWK score across all folds
    return np.mean(qwk_scores)


In [41]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
best_params = study.best_params
print("Best hyperparameters:", best_params)

[I 2025-04-23 19:30:39,163] A new study created in memory with name: no-name-699ed211-ed7b-41d6-919f-d1c0f29231ed
[W 2025-04-23 19:30:57,931] Trial 0 failed with parameters: {'lr': 3.812299611214971e-05, 'batch_size': 64, 'epochs': 25, 'weight_decay': 2.674970642042841e-05} because of the following error: ValueError("Classification metrics can't handle a mix of multiclass and continuous targets").
Traceback (most recent call last):
  File "c:\Users\ahmad\AppData\Local\Programs\Python\Python39\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_11208\3012127703.py", line 22, in objective
    metrics = evaluate_model(model, X_val, y_val)
  File "C:\Users\ahmad\AppData\Local\Temp\ipykernel_11208\2478604919.py", line 10, in evaluate_model
    kappa = cohen_kappa_score(y_test, predictions)
  File "c:\Users\ahmad\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\_param

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
score_bins = pd.cut(y_train, bins=5, labels=False)
X_train_res, y_train_res = smote.fit_resample(X_train, score_bins)
y_train_res = y_train.iloc[X_train_res.index]

In [None]:
# Train final model
final_model = train_model(X_train_res, y_train_res, best_params)

# Evaluate
results = evaluate_model(final_model, X_test, y_test)

In [None]:
print("\nFinal Evaluation Results:")
print(f"Quadratic Weighted Kappa (QWK): {results['qwk']:.4f}")
print(f"Mean Squared Error (MSE): {results['mse']:.4f}")
print(f"Root Mean Squared Error (RMSE): {results['rmse']:.4f}")

In [None]:
## 7. Visualization and Analysis
# Plot predictions vs actual
plt.figure(figsize=(10, 6))
sns.regplot(x=y_test, y=results['predictions'], scatter_kws={'alpha':0.3})
plt.xlabel('Actual Scores')
plt.ylabel('Predicted Scores')
plt.title('Actual vs Predicted Essay Scores')
plt.show()



In [None]:
# Error analysis
errors = y_test - results['predictions']
plt.figure(figsize=(10, 6))
sns.histplot(errors, bins=30, kde=True)
plt.xlabel('Prediction Error')
plt.title('Distribution of Prediction Errors')
plt.show()