In [None]:
import os
import pandas as pd
import spacy
import re
from tqdm import tqdm

In [None]:
# Load the spaCy pipeline once at notebook level; split_into_sentences
# calls this shared `nlp` object for every text it segments.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [None]:
# Input and output locations (relative paths, resolved against the working directory).
RAW_DATA_FOLDER = "../data/raw/"  # folder scanned for raw .txt files
OUTPUT_FILE = "../data/processed/sentences_dataset.csv"  # sentence-level CSV written at the end

In [None]:
def clean_text(text):
    """Normalize raw text into a single, whitespace-collapsed ASCII line.

    Each run of non-ASCII characters is replaced by a space, then every run
    of whitespace (newlines and tabs included) is collapsed to one space and
    the ends are trimmed.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: Text containing only ASCII characters, with no leading or
            trailing whitespace and no consecutive whitespace characters.
    """
    # Replace non-ASCII first: every replacement inserts a space, so the
    # whitespace collapse has to run afterwards, otherwise "a é b" would
    # keep a run of three spaces.
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # \s+ also matches '\n', so no separate newline replacement is needed.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
def split_into_sentences(text):
    """Split text into sentences using the notebook-level spaCy pipeline `nlp`.

    Args:
        text (str): Text to segment.

    Returns:
        list[str]: Sentence strings in document order, each stripped of
            surrounding whitespace. Spans that are empty after stripping
            are dropped.
    """
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    # Drop whitespace-only spans so callers never receive empty sentences.
    return [sentence for sentence in stripped if sentence]

In [None]:
# Collect every .txt file found directly inside RAW_DATA_FOLDER.
# os.listdir returns entries in arbitrary order, so sort the names to keep
# the processing order reproducible across runs and machines.
file_list = [
    os.path.join(RAW_DATA_FOLDER, f)
    for f in sorted(os.listdir(RAW_DATA_FOLDER))
    if f.endswith('.txt')
]
print(f"Found {len(file_list)} files.")

In [None]:
# Main loop: build one record per sentence for every file in file_list.
data = []
for file_path in tqdm(file_list):
    # splitext removes only the final extension, so "essay.v2.txt" keeps the
    # id "essay.v2" instead of being truncated at the first dot.
    essay_id = os.path.splitext(os.path.basename(file_path))[0]
    # Only the read needs the file handle; close it before the NLP work.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = split_into_sentences(clean_text(text))
    # sentence_id is the 1-based position of the sentence within its essay.
    for idx, sentence in enumerate(sentences, start=1):
        data.append({
            'essay_id': essay_id,
            'sentence_id': idx,
            'sentence_text': sentence
        })

In [None]:
# Create DataFrame
# Columns are passed explicitly so the frame (and the CSV header written from
# it) keeps its schema even when `data` is empty.
df = pd.DataFrame(data, columns=['essay_id', 'sentence_id', 'sentence_text'])

print(f"Total sentences extracted: {len(df)}")
print(df.head())

In [None]:
# Save output
# Derive the target directory from OUTPUT_FILE so the two cannot drift apart.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:  # dirname is '' for a bare filename, and os.makedirs('') raises
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved to {OUTPUT_FILE}")