In [12]:
# Imports and setup
import re
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [13]:
# Read the raw Shakespeare lines table (path is relative to the notebook's
# working directory) and report how many rows came back.
csv_path = '../data/raw/Shakespeare_data.csv'
df = pd.read_csv(csv_path)
row_count = df.shape[0]
print(f"Loaded {row_count} rows from CSV")


Loaded 111396 rows from CSV


In [14]:
# Keep only rows that carry a spoken line. The .copy() makes the filtered frame
# independent of the original, so the column assignment below never writes
# into a slice of another DataFrame.
df = df[df['PlayerLine'].notna()].copy()
# Rows with a missing speaker are kept: NaN becomes '' so the string match
# below yields plain booleans instead of NaN.
df['Player'] = df['Player'].fillna('')
# Drop rows whose 'Player' value starts with ACT or SCENE.
# The group is non-capturing on purpose: with a capturing group,
# Series.str.contains emits a "pattern ... has match groups" UserWarning
# even though the resulting mask is the same.
mask = df['Player'].str.contains(r'^(?:ACT|SCENE)', regex=True)
df = df[~mask]

# Combine all dialogue into one text blob (lines separated by a single space)
text = ' '.join(df['PlayerLine'].astype(str).tolist())
print(f"Combined dialogue length: {len(text)} characters")

  mask = df['Player'].str.contains(r'^(ACT|SCENE)', regex=True)


Combined dialogue length: 4366287 characters


In [15]:
# Cleaning function
def clean_text(txt):
    """Normalise a raw dialogue string.

    Steps, applied in order:
      1. delete ``[...]`` spans, then ``(...)`` spans (shortest match; the
         ``.`` in the patterns does not cross a newline, so a span must open
         and close on the same line to be removed);
      2. delete every character that is not an ASCII letter, a digit,
         whitespace, or one of ``. , ; ' -``;
      3. lowercase the result;
      4. collapse each whitespace run to one space and trim both ends.

    Args:
        txt: the text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Each of these patterns is replaced by the empty string, in this order.
    removal_patterns = (
        r"\[.*?\]",
        r"\(.*?\)",
        r"[^a-zA-Z0-9\s\.\,\;\'\-]",
    )
    for pattern in removal_patterns:
        txt = re.sub(pattern, "", txt)
    lowered = txt.lower()
    return re.sub(r"\s+", " ", lowered).strip()

# Run the cleaner over the combined dialogue and report the resulting size
cleaned = clean_text(text)
cleaned_length = len(cleaned)
print(f"Cleaned length: {cleaned_length} characters")


Cleaned length: 4323891 characters


In [16]:
# Write the cleaned corpus to disk as UTF-8 text (overwrites any existing file)
with open('../data/processed/cleaned_shakespeare.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(cleaned)

In [20]:
# Tokenization
# The whole cleaned corpus is treated as a single document.
corpus = [cleaned]

# Fit the tokenizer's vocabulary on that document
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Encode the corpus; one document in, so the first entry is the full index sequence
encoded = tokenizer.texts_to_sequences(corpus)
token_list = encoded[0]

# One more than the number of distinct words in word_index
# (presumably to leave room for a reserved index 0 - confirm against the Keras docs)
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary size: {vocab_size}, Total tokens: {len(token_list)}")

Vocabulary size: 25759, Total tokens: 819639


In [None]:
# Generate sequences
# Fixed context length: every sample is n input tokens followed by 1 label token.
n = 20
max_len = n + 1

# Build every window of max_len consecutive tokens in one vectorised step
# instead of a Python loop. All windows have the same length by construction,
# so no padding/truncation pass is required. int32 matches the default dtype
# of Keras' pad_sequences, which previously produced this array.
tokens = np.asarray(token_list, dtype='int32')
if len(tokens) >= max_len:
    # sliding_window_view returns a read-only view; copy to get an ordinary,
    # writable (len(tokens) - n, max_len) array.
    sequences = np.lib.stride_tricks.sliding_window_view(tokens, max_len).copy()
else:
    # Too few tokens for even one window: keep the 2-D shape so the slicing
    # below still works and yields empty arrays.
    sequences = np.empty((0, max_len), dtype='int32')

# features (first n tokens of each window) and labels (the token that follows)
data_X = sequences[:, :-1]
data_y = sequences[:, -1]
print(f"Generated {len(sequences)} sequences of length {n}")


Generated 819619 sequences of length 20


In [22]:
# Persist the fitted tokenizer (pickle) and the feature/label arrays (.npy)
with open('../data/processed/tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

for npy_path, array in (
    ('../data/processed/data_X.npy', data_X),
    ('../data/processed/data_y.npy', data_y),
):
    np.save(npy_path, array)

print('Preprocessing complete: data_X, data_y, and tokenizer saved.')

Preprocessing complete: data_X, data_y, and tokenizer saved.
