In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('training_set_rel3.tsv', sep='\t', encoding="latin1")
df['essay'] = df['essay'].fillna('')

# Keep relevant columns
df = df[['essay_id', 'essay_set', 'essay', 'domain1_score']]

# Display basic info
print(df.shape)
print(df.head())

(12976, 4)
   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   domain1_score  
0              8  
1              9  
2              7  
3             10  
4              8  


In [2]:
# Remove illegible or corrupted essays
df = df[~df['essay'].str.contains('illegible|\\?\\?\\?', case=False, na=False)]

# Drop any rows with missing text or scores
df = df.dropna(subset=['essay', 'domain1_score']).reset_index(drop=True)
print(f"After cleaning, dataset shape: {df.shape}")

After cleaning, dataset shape: (12703, 4)


In [3]:
# Check score ranges per essay set
for s in sorted(df['essay_set'].unique()):
    min_score = df[df['essay_set'] == s]['domain1_score'].min()
    max_score = df[df['essay_set'] == s]['domain1_score'].max()
    print(f"Set {s}: min={min_score}, max={max_score}")

print("=" *50)    

# Scale scores to 0-1 range per essay set
df['score_scaled'] = df.apply(
    lambda x: (x['domain1_score'] - df[df['essay_set'] == x['essay_set']]['domain1_score'].min()) /
              (df[df['essay_set'] == x['essay_set']]['domain1_score'].max() -
               df[df['essay_set'] == x['essay_set']]['domain1_score'].min()),
    axis=1
)
print(df[['essay_set', 'domain1_score', 'score_scaled']].head())

Set 1: min=2, max=12
Set 2: min=1, max=6
Set 3: min=0, max=3
Set 4: min=0, max=3
Set 5: min=0, max=4
Set 6: min=0, max=4
Set 7: min=2, max=24
Set 8: min=10, max=60
   essay_set  domain1_score  score_scaled
0          1              8           0.6
1          1              9           0.7
2          1              7           0.5
3          1             10           0.8
4          1              8           0.6


In [4]:
import re

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove special chars
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

df['essay'] = df['essay'].apply(clean_text)


In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# TF-IDF features (unigrams + bigrams)
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    
)
X_tfidf = vectorizer.fit_transform(df['essay'])

# Simple numeric features
df['essay_len'] = df['essay'].apply(len)
df['word_count'] = df['essay'].apply(lambda x: len(x.split()))
df['avg_word_len'] = df['essay'].apply(lambda x: np.mean([len(w) for w in x.split()]) if len(x.split()) > 0 else 0)

numeric_features = df[['essay_len', 'word_count', 'avg_word_len']].values

# Combine TF-IDF + numeric
from scipy.sparse import csr_matrix
X = hstack([X_tfidf, csr_matrix(numeric_features)])
y = df['score_scaled'].values

print(f"Final feature matrix shape: {X.shape}")



Final feature matrix shape: (12703, 10003)


In [35]:
from sklearn.model_selection import train_test_split

indices = np.arange(len(df))
X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
    X, y, indices, test_size=0.2, random_state=42
)

print(f"Train shape: {X_train.shape}, Validation shape: {X_val.shape}")

Train shape: (10162, 10003), Validation shape: (2541, 10003)


In [36]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, cohen_kappa_score
from scipy.stats import pearsonr
import numpy as np

# Train Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

# Predict on validation set
y_pred = ridge.predict(X_val)

# Calculate metrics
mae = mean_absolute_error(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)
pearson_corr, _ = pearsonr(y_pred, y_val)

# Rescale predictions and labels for QWK
score_ranges = {
    1: (2, 12), 2: (1, 6), 3: (0, 3), 4: (0, 3),
    5: (0, 4), 6: (0, 4), 7: (2, 24), 8: (10, 60)
}

def rescale(scaled, essay_sets):
    original = []
    for val, s in zip(scaled, essay_sets):
        min_score, max_score = score_ranges[s]
        orig = val * (max_score - min_score) + min_score
        original.append(round(orig))
    return np.array(original)

# Get essay_set for each validation sample
val_essay_sets = df.iloc[idx_val]['essay_set'].values

y_pred_orig = rescale(y_pred, val_essay_sets)
y_val_orig = rescale(y_val, val_essay_sets)

# Calculate QWK
qwk = cohen_kappa_score(y_val_orig, y_pred_orig, weights='quadratic')

print("=" * 50)
print("📊 MODEL PERFORMANCE METRICS")
print("=" * 50)
print(f"Quadratic Weighted Kappa (QWK): {qwk:.4f}")
print(f"Pearson Correlation:            {pearson_corr:.4f}")
print(f"R² Score:                       {r2:.4f}")
print(f"Mean Absolute Error (MAE):      {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print("=" * 50)

📊 MODEL PERFORMANCE METRICS
Quadratic Weighted Kappa (QWK): 0.9665
Pearson Correlation:            0.7327
R² Score:                       0.5369
Mean Absolute Error (MAE):      0.1262
Root Mean Squared Error (RMSE): 0.1629


==================================================
📊 MODEL PERFORMANCE METRICS 5000, [1,2], Stopwords
==================================================
Quadratic Weighted Kappa (QWK): 0.9145
Pearson Correlation:            0.7135
R² Score:                       0.5090
Mean Absolute Error (MAE):      0.1296
Root Mean Squared Error (RMSE): 0.1677
==================================================

==================================================
📊 MODEL PERFORMANCE METRICS 10000, [1,3], 
==================================================
Quadratic Weighted Kappa (QWK): 0.9240
Pearson Correlation:            0.7327
R² Score:                       0.5369
Mean Absolute Error (MAE):      0.1262
Root Mean Squared Error (RMSE): 0.1629
==================================================