In [18]:
# Core data handling
import pandas as pd  # Load and manipulate TSV dataset (training_set_rel3.tsv)
import numpy as np  # Numerical operations for feature arrays and score normalization

# Text preprocessing
import nltk  # Stopwords, lemmatization, tokenization, and spelling correction
from nltk.corpus import stopwords, wordnet, words  # Resources for stopwords, lemmatization, and spelling
from spellchecker import SpellChecker
from nltk.tokenize import sent_tokenize, word_tokenize  # Sentence and word tokenization
from nltk.metrics.distance import edit_distance  # Edit distance for spelling correction
import re  # Regular expressions for cleaning ASAP tokens (e.g., @SOURCE1) and contractions
import string  # Punctuation removal


# Model-related (PyTorch)
import torch  # PyTorch for model and tensor conversion of preprocessed data
from torch.nn.utils.rnn import pad_sequence  # Pad sequences for LSTM input

# Evaluation and splitting
from sklearn.model_selection import train_test_split  # Stratified train-test splits by essay_set
from sklearn.metrics import cohen_kappa_score  # Quadratic Weighted Kappa for evaluation

# Optional: Visualization
import matplotlib.pyplot as plt  # Plot score distributions to identify imbalances
import seaborn as sns  # Enhanced visualization for score analysis

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ecc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [19]:
def split_in_sets(data):
    essay_sets = []
    min_scores = []
    max_scores = []
    for s in range(1, 9):
        essay_set = data[data["essay_set"] == s].copy()  # Avoid modifying original
        # Drop irrelevant columns (specific to each set)
        columns_to_drop = ["rater1_domain1", "rater2_domain1", "rater3_domain1"]
        if s != 2:
            columns_to_drop.extend(["rater1_domain2", "rater2_domain2", "domain2_score"])
        if s not in [7, 8]:
            columns_to_drop.extend([col for col in data.columns if "trait" in col])
        essay_set = essay_set.drop(columns=[col for col in columns_to_drop if col in essay_set.columns])
        n, d = essay_set.shape
        set_scores = essay_set["domain1_score"]
        print(f"Set {s}: Essays = {n}, Attributes = {d}")
        min_scores.append(set_scores.min())
        max_scores.append(set_scores.max())
        essay_sets.append(essay_set)
    return essay_sets, min_scores, max_scores

In [20]:
# Load dataset
dataset_path = "training_set_rel3.tsv"
data = pd.read_csv(dataset_path, sep="\t", encoding="ISO-8859-1")

# Split into sets and get score ranges
essay_sets, min_scores, max_scores = split_in_sets(data)
set1, set2, set3, set4, set5, set6, set7, set8 = tuple(essay_sets)
sets = [set1, set2, set3, set4, set5, set6, set7, set8]
print("Score ranges:", list(zip(min_scores, max_scores)))

Set 1: Essays = 1783, Attributes = 4
Set 2: Essays = 1800, Attributes = 7
Set 3: Essays = 1726, Attributes = 4
Set 4: Essays = 1770, Attributes = 4
Set 5: Essays = 1805, Attributes = 4
Set 6: Essays = 1800, Attributes = 4
Set 7: Essays = 1569, Attributes = 22
Set 8: Essays = 723, Attributes = 22
Score ranges: [(2, 12), (1, 6), (0, 3), (0, 3), (0, 4), (0, 4), (2, 24), (10, 60)]


In [21]:
# Enhanced text cleaning
stop_words = set(stopwords.words('english'))
word_list = set(words.words())
lemmatizer = nltk.WordNetLemmatizer()

In [33]:
spell = SpellChecker()
def enhanced_clean_essay(essay, ner_tokens):
    essay = essay.lower()
    # Handle contractions
    contractions = {"can't": "cannot", "don't": "do not", "won't": "will not", "it's": "it is"}
    for contr, expand in contractions.items():
        essay = re.sub(r"\b" + contr + r"\b", expand, essay)
    # Replace all NER tokens with "entity"
    for token in ner_tokens:
        essay = re.sub(r'\b' + re.escape(token) + r'\b', 'entity', essay)
    # Remove punctuation and numbers
    essay = essay.translate(str.maketrans('', '', string.punctuation))
    essay = re.sub(r'\d+', '', essay)
    # Spelling correction with pyspellchecker
    #words = essay.split()
    #corrected = [spell.correction(word) if spell.correction(word) else word for word in words]
    #essay = ' '.join(corrected)
    # Remove stopwords
    essay = ' '.join(word for word in essay.split() if word not in stop_words)
    # Lemmatize
    essay = ' '.join(lemmatizer.lemmatize(word) for word in essay.split())
    return essay

In [34]:
# Optimized NER token list
all_essays = ' '.join(data['essay'])
ner = set(re.findall(r'@\w+\d*', all_essays))  # Extract unique tokens
print(f"Found {len(ner)} unique NER tokens: {ner}")

# Apply cleaning with progress bar
from tqdm import tqdm
tqdm.pandas()
data['cleaned_essay'] = data['essay'].progress_apply(lambda x: enhanced_clean_essay(x, ner))
for i, essay_set in enumerate(sets):
    essay_set['cleaned_essay'] = essay_set['essay'].progress_apply(lambda x: enhanced_clean_essay(x, ner))
    sets[i] = essay_set

# Save cleaned data
data.to_csv('cleaned_data.csv', index=False)
for i, essay_set in enumerate(sets):
    essay_set.to_csv(f'cleaned_set{i+1}.csv', index=False)

Found 168 unique NER tokens: {'@PERCENT1', '@DATE3', '@TIME1', '@PERSON3', '@CAPS48', '@CAPS38', '@NUM2', '@CAPS33', '@LOCATION2M', '@CAPS19', '@CAPS40', '@NUM7', '@CAPS77', '@NUM4', '@CAPS18', '@MONEY1', '@CAPS37', '@CAPS8', '@CAPS44', '@CAPS14', '@CAPS39', '@NUM1king', '@ORGANIZATION3', '@MONTH1', '@CAPS27', '@CAPS71', '@CAPS62', '@DATE2', '@PERCENT4', '@CAPS15', '@NUM1th', '@DATE4', '@NUM10', '@CAPS29', '@NUM5', '@PERCENT7', '@CAPS23', '@CAPS2', '@TIME3', '@TIME4', '@ORGANIZATION1n', '@LOCATION1L', '@CAPS12', '@PERSON7', '@NUM1o', '@DATE6', '@CAPS74', '@CAPS36', '@STATE1', '@CAPS63', '@PERSON4', '@LOCATION1â', '@CAPS35', '@PERCENT5', '@LOCATION12', '@NUM13', '@PERCENT2', '@CAPS30', '@CAPS11â', '@CAPS25', '@CAPS6', '@MONEY2', '@NUM6', '@CAPS9', '@DATE5', '@CAPS52', '@CAPS49', '@LOCATION4', '@CAPS13', '@NUM1at', '@CAPS46', '@TIME1pm', '@CAPS76', '@PERCENT3', '@LOCATION4from', '@LOCATION3', '@NUM2th', '@NUM1', '@CAPS43', '@LOCATION6', '@CAPS53', '@LOCATION8', '@CAPS3', '@CAPS4', '@CAPS

100%|██████████| 12976/12976 [00:53<00:00, 241.99it/s]
100%|██████████| 1783/1783 [00:11<00:00, 149.90it/s]
100%|██████████| 1800/1800 [00:12<00:00, 145.91it/s]
100%|██████████| 1726/1726 [00:03<00:00, 470.29it/s]
100%|██████████| 1770/1770 [00:03<00:00, 534.35it/s]
100%|██████████| 1805/1805 [00:04<00:00, 410.61it/s]
100%|██████████| 1800/1800 [00:05<00:00, 326.79it/s]
100%|██████████| 1569/1569 [00:04<00:00, 324.09it/s]
100%|██████████| 723/723 [00:07<00:00, 95.03it/s] 


In [32]:
# Normalize scores
def normalize_score(row):
    set_id = row['essay_set'] - 1  # Adjust for 0-based indexing
    score = row['domain1_score']
    return (score - min_scores[set_id]) / (max_scores[set_id] - min_scores[set_id])

data['normalized_score'] = data.apply(normalize_score, axis=1)
for i, essay_set in enumerate(sets):
    essay_set['normalized_score'] = essay_set.apply(normalize_score, axis=1)
    sets[i] = essay_set

# Display sample
print(data[['essay_set', 'essay', 'cleaned_essay', 'domain1_score', 'normalized_score']].head())

   essay_set                                              essay  \
0          1  Dear local newspaper, I think effects computer...   
1          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3          1  Dear Local Newspaper, @CAPS1 I have found that...   
4          1  Dear @LOCATION1, I know having computers has a...   

                                       cleaned_essay  domain1_score  \
0  dear local newspaper think effect computer peo...              8   
1  dear cap cap believe using computer benefit u ...              9   
2  dear cap cap cap people use computer everyone ...              7   
3  dear local newspaper cap found many expert say...             10   
4  dear location know computer positive effect pe...              8   

   normalized_score  
0               0.6  
1               0.7  
2               0.5  
3               0.8  
4               0.6  
