# Automated Essay Scoring

This notebook implements a machine learning pipeline to score essays automatically. It uses TF-IDF for feature extraction and a Random Forest Regressor for prediction.

In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, cohen_kappa_score
import pickle

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shihab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shihab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Shihab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Load Data

In [2]:
# Load the dataset
data_path = 'archive/training_set_rel3.tsv'
df = pd.read_csv(data_path, sep='\t', encoding='ISO-8859-1')

# Display first few rows
df.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
0,1,1,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,,,,,
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,,,,,
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,,,,,
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,,,,,
4,5,1,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,,,,,


## Data Preprocessing

In [3]:
# Select relevant columns
df = df[['essay_id', 'essay_set', 'essay', 'domain1_score']]

# Clean text function
def clean_text(text):
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply cleaning
df['cleaned_essay'] = df['essay'].apply(clean_text)
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,cleaned_essay
0,1,1,"Dear local newspaper, I think effects computer...",8,dear local newspaper think effects computers p...
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9,dear caps caps believe using computers benefit...
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7,dear caps caps caps people use computers every...
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10,dear local newspaper caps found many experts s...
4,5,1,"Dear @LOCATION1, I know having computers has a...",8,dear location know computers positive effect p...


## Feature Extraction

In [4]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the essays
X = vectorizer.fit_transform(df['cleaned_essay']).toarray()
y = df['domain1_score']

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (12976, 5000)
Shape of y: (12976,)


## Model Training

In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


## Evaluation

In [6]:
# Predict on test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate Cohen's Kappa Score (Quadratic Weighted Kappa is standard for essay scoring)
# We round predictions to nearest integer for Kappa calculation
y_pred_rounded = np.round(y_pred).astype(int)
kappa = cohen_kappa_score(y_test.astype(int), y_pred_rounded, weights='quadratic')
print(f"Quadratic Weighted Kappa: {kappa}")

Mean Squared Error: 6.255033727101524
Quadratic Weighted Kappa: 0.9569000917483427


## Save Model

In [None]:
# Save model and vectorizer
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model and vectorizer saved to disk.")