In [None]:
import pandas as pd
import spacy
from collections import Counter
from tqdm import tqdm

In [None]:
# Load the spaCy English pipeline.
# The feature extractor in this notebook only reads POS tags and the dependency
# parse (token.pos_, token.dep_, token.head), so the named-entity recognizer is
# switched off to avoid running it on every sentence.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL, disable=["ner"])

In [1]:
# Load sentence dataset
INPUT_FILE = '../data/processed/sentences_dataset.csv'
OUTPUT_FILE = '../data/processed/sentence_features.csv'

# Columns that later cells access by name.
REQUIRED_COLUMNS = ['essay_id', 'sentence_id', 'sentence_text']

df = pd.read_csv(INPUT_FILE)

# Fail fast: without this check a missing identifier column would only surface
# after the (slow) feature-extraction loop has already processed every sentence.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{INPUT_FILE} is missing required column(s): {missing_columns}")

print(f"Loaded {len(df)} sentences.")

In [None]:
# Feature Extraction
def extract_features(sentence):
    """Compute surface, lexical and syntactic features for a single sentence.

    Parameters
    ----------
    sentence : str
        Raw sentence text. A missing value (``None`` / NaN, which is how pandas
        represents an empty CSV cell) is treated as an empty sentence; any
        other non-string value is converted with ``str``.

    Returns
    -------
    dict
        ``num_tokens``       tokens that are neither punctuation nor whitespace
        ``num_words``        those tokens that are purely alphabetic
        ``num_chars``        length of the raw sentence string
        ``avg_word_length``  mean character length of the words
        ``ttr``              type-token ratio over the words (case-insensitive)
        ``noun_ratio``, ``verb_ratio``, ``adj_ratio``
                             share of tokens tagged NOUN / VERB / ADJ
        ``syntactic_depth``  see the review note in the body
        Every ratio is ``0`` when its denominator is zero.
    """
    if not isinstance(sentence, str):
        # Without this, nlp() and len() below both fail on a NaN cell.
        sentence = '' if pd.isna(sentence) else str(sentence)

    doc = nlp(sentence)

    tokens = [token for token in doc if not token.is_punct and not token.is_space]
    words = [token.text for token in tokens if token.is_alpha]
    num_tokens = len(tokens)
    num_words = len(words)

    pos_counts = Counter(token.pos_ for token in tokens)

    def _ratio(numerator, denominator):
        # Empty sentences yield 0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0

    # Lowercase before counting types so that a sentence-initial "The" and a
    # later "the" are one type rather than two.
    word_types = {word.lower() for word in words}

    features = {
        'num_tokens': num_tokens,
        'num_words': num_words,
        'num_chars': len(sentence),
        'avg_word_length': _ratio(sum(len(word) for word in words), num_words),
        'ttr': _ratio(len(word_types), num_words),
        'noun_ratio': _ratio(pos_counts['NOUN'], num_tokens),
        'verb_ratio': _ratio(pos_counts['VERB'], num_tokens),
        'adj_ratio': _ratio(pos_counts['ADJ'], num_tokens),
    }

    # NOTE(review): despite the key name, this value is the longest distance
    # (in token positions) between a non-root token and its syntactic head,
    # not the depth of the dependency tree. Kept as-is so existing feature
    # files stay comparable; confirm which metric is actually intended.
    head_distances = [
        abs(token.head.i - token.i) for token in tokens if token.dep_ != 'ROOT'
    ]
    features['syntactic_depth'] = max(head_distances, default=0)

    return features

In [None]:
# Apply to dataset: one feature dict per sentence, in row order.
features_list = [
    extract_features(sentence)
    for sentence in tqdm(df['sentence_text'], desc="Extracting features")
]

# Reuse df's index so that a later column-wise concat with df pairs each feature
# row with its source sentence even when df does not carry a default 0..n-1 index
# (pd.concat(axis=1) aligns on index labels, not on position).
features_df = pd.DataFrame(features_list, index=df.index)

In [None]:
# Combine with original identifiers
# pd.concat(axis=1) aligns on index labels and pads mismatches with NaN without
# any warning, so verify the two frames line up before joining them (this also
# catches a features_df left over from an earlier run against a different df).
assert features_df.index.equals(df.index), (
    "features_df is out of sync with df; re-run the feature extraction cell"
)

result = pd.concat([df[['essay_id', 'sentence_id']], features_df], axis=1)

# Last expression of the cell: rendered as a rich table by the notebook.
result.head()

In [None]:
# Persist the combined feature table as CSV; the DataFrame index is not written.
result.to_csv(path_or_buf=OUTPUT_FILE, index=False)
print(f"Features saved to {OUTPUT_FILE}")