# PaperIQ Feature Extraction Notebook

This notebook demonstrates how to extract handcrafted features (type-token ratio (TTR), average sentence length, lexical sophistication)
and how to obtain sentence/paragraph embeddings using a transformer model.


In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel
import torch

def tokenize_words(text):
    """Return the lowercase word tokens found in *text*.

    A token is a run of word characters and/or apostrophes delimited by
    word boundaries, so contractions such as "don't" stay in one piece.
    """
    word_pattern = re.compile(r"\b[\w']+\b")
    lowered = text.lower()
    return word_pattern.findall(lowered)
def ttr(words):
    """Return the type-token ratio of *words*.

    The ratio is the number of distinct tokens divided by the total
    number of tokens. An empty sequence yields 0 instead of dividing
    by zero.
    """
    if not words:
        return 0
    distinct = set(words)
    return len(distinct) / len(words)

# Load cleaned CSV
df = pd.read_csv('data/combined_clean_asap.csv')

# Tokenize every essay exactly once and derive both handcrafted features
# from the same token lists (previously each essay was tokenized twice).
# str() guards against non-string cells; NOTE: a missing value (NaN) is
# stringified to "nan" and therefore counts as a single token.
_essay_tokens = df['essay'].apply(lambda x: tokenize_words(str(x)))
df['word_count'] = _essay_tokens.apply(len)  # total tokens per essay
df['ttr'] = _essay_tokens.apply(ttr)  # distinct tokens / total tokens

# Example: compute embeddings (small model for CPU)
# The tokenizer and the encoder are loaded from the same checkpoint name
# so the two cannot drift apart.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

def embed(text):
    """Return a mean-pooled embedding for *text* as a NumPy array.

    *text* is passed to the module-level ``tokenizer`` (with truncation
    and padding enabled) and the result is fed to the module-level
    ``model`` with gradient tracking disabled.

    Token vectors are averaged using the attention mask as weights, so
    padding positions do not dilute the result when several texts of
    different lengths are embedded in one call. For a single string there
    is no padding, so this equals a plain mean over the tokens.

    Size-1 axes are squeezed out of the result, so a single input loses
    its batch axis.
    """
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
    with torch.no_grad():
        out = model(**inputs)

    token_vectors = out.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 = real token, 0 = padding.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    # clamp avoids a division by zero should a row contain no real tokens.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)

    # mean pooling (mask-aware)
    embeddings = (summed / token_counts).squeeze().numpy()
    return embeddings

# example: embed the first essay in the frame and report the vector's shape
first_essay = df['essay'].iloc[0]
emb = embed(first_essay)
print('Embedding shape:', emb.shape)
