In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Exploratory Data Analysis (EDA) — Intro Section

The dataset under study has already been preprocessed into **chunked texts** of 300–500 characters, 
resulting in a large corpus suitable for modeling. To handle the heavy dataset efficiently 
(~2.9 million records), we use **PySpark** for data extraction and summarization.

In this introductory stage of EDA, our goals are:
- Describe the **basic shape** of the dataset (total chunk records).
- Count the number of **unique authors** and **unique titles** in the corpus.
- Explore **genre distribution**, with a special focus on "Fantasy" records.
- Provide a few **interesting facts** about the corpus to set the stage for deeper visual analysis.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# --- Start Spark session ---
spark = SparkSession.builder.appName("AuthorEDA").getOrCreate()

# --- Load dataset (~2.9M rows) ---
df = spark.read.csv("dataset_splits/full_chunked.csv", header=True, inferSchema=True)

# -----------------
#  BASIC STATS
# -----------------
total_records = df.count()
unique_authors = df.select("author").distinct().count()
unique_titles = df.select("title").distinct().count()

print(" Dataset Overview")
print(f"Total chunked records: {total_records:,}")
print(f"Unique authors: {unique_authors}")
print(f"Unique titles: {unique_titles}")

# -----------------
# 🎭 GENRE DISTRIBUTION
# -----------------
genre_counts = df.groupBy("genre").count().orderBy(col("count").desc())
genre_counts_pd = genre_counts.toPandas()   # convert to Pandas for plotting

# Pie chart of top 10 genres
top_genres = genre_counts_pd.head(10)
plt.figure(figsize=(8, 8))
plt.pie(top_genres['count'], labels=top_genres['genre'],
        autopct='%1.1f%%', startangle=140, colors=plt.cm.tab20.colors)
plt.title("Top 10 Genres Distribution")
plt.show()

# -----------------
# 👤 TOP 10 AUTHORS
# -----------------
author_counts = df.groupBy("author").count().orderBy(col("count").desc())
author_counts_pd = author_counts.toPandas()

top_authors = author_counts_pd.head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x="author", y="count", data=top_authors, palette="viridis")
plt.xticks(rotation=65, ha="right")
plt.title("Top 10 Authors by Number of Chunks")
plt.ylabel("Number of Chunks")
plt.xlabel("Author")
plt.show()

# -----------------
# 📚 HOW MANY AUTHORS / TITLES
# -----------------
print(f"\nTotal Authors: {unique_authors}")
print(f"Total Titles (unique books): {unique_titles}")

# -----------------
# 🌌 WORD CLOUD for TITLES
# -----------------
# Collect all titles (could be 3k unique, repeated across ~2.9M chunks)
titles_pd = df.select("title").toPandas()

# Join all titles into one big string
titles_combined = " ".join(titles_pd['title'].dropna().tolist())

# Generate a word cloud where frequent titles (or words IN titles) are bigger
wordcloud_titles = WordCloud(width=1000, height=500,
                             background_color="white",
                             colormap="plasma",
                             collocations=False).generate(titles_combined)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud_titles, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Book Titles in the Dataset", fontsize=16)
plt.show()

### Step 2 — Preprocessing for LSTM

For our LSTM author classification model, we adopt a **character-level** preprocessing strategy.

1. **Build Character Vocabulary**  
   Extract all unique characters in the training set and map them to integer IDs.  
   (e.g., {'a': 1, 'b': 2, ..., ' ': 27, '!': 28, ...}). Padding is reserved as 0.

2. **Text to Sequences**  
   Convert each text chunk into a sequence of character IDs.

3. **Sequence Padding**  
   Pad or truncate each sequence to a uniform fixed length: **2000 characters**.  
   This ensures consistent tensor inputs for the LSTM.

4. **Author Labels Encoding**  
   Encode string author names into integer labels (0…N-1).

This preprocessing prepares the data in a format suitable for feeding into an LSTM:  
`X_train, X_val, X_test` → padded numeric sequences of chars,  
`y_train, y_val, y_test` → integer author labels.  

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# ---------------------------
# Load Data
# ---------------------------
train_df = pd.read_csv("dataset_splits/train.csv")
val_df   = pd.read_csv("dataset_splits/val.csv")
test_df  = pd.read_csv("dataset_splits/test.csv")

print("Data loaded")
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# ---------------------------
# Build Char Vocabulary
# ---------------------------
all_text = "".join(train_df['chunk_text'].astype(str).tolist())
chars = sorted(list(set(all_text)))
char_to_int = {c: i+1 for i, c in enumerate(chars)}  # start idx=1, keep 0 for padding
int_to_char = {i: c for c, i in char_to_int.items()}

print(f"Built vocab of {len(char_to_int)} unique characters")

# ---------------------------
# Encode Text Function
# ---------------------------
def encode_text(text, mapping):
    return [mapping.get(c, 0) for c in text]  # unknown -> 0

train_seqs = [encode_text(t, char_to_int) for t in train_df['chunk_text'].astype(str)]
val_seqs   = [encode_text(t, char_to_int) for t in val_df['chunk_text'].astype(str)]
test_seqs  = [encode_text(t, char_to_int) for t in test_df['chunk_text'].astype(str)]

# ---------------------------
# Stats for Chunk Lengths
# ---------------------------
actual_max = max(
    max(len(seq) for seq in train_seqs),
    max(len(seq) for seq in val_seqs),
    max(len(seq) for seq in test_seqs)
)

print(f"Longest observed chunk length: {actual_max}")

# ---------------------------
# Set MAX_LEN Smartly
# ---------------------------
# Since avg ≈ 400, 75% ≤ 450, max = 500 → best trade-off = 450
MAX_LEN = 450
print("Using MAX_LEN =", MAX_LEN)

# ---------------------------
# Pad Sequences
# ---------------------------
X_train = pad_sequences(train_seqs, maxlen=MAX_LEN, padding='post', truncating='post')
X_val   = pad_sequences(val_seqs,   maxlen=MAX_LEN, padding='post', truncating='post')
X_test  = pad_sequences(test_seqs,  maxlen=MAX_LEN, padding='post', truncating='post')

print(f"Shapes -> X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")

# ---------------------------
# Encode Labels (Authors)
# ---------------------------
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['author'])
y_val   = label_encoder.transform(val_df['author'])
y_test  = label_encoder.transform(test_df['author'])

print(f"Encoded {len(label_encoder.classes_)} authors")

# ---------------------------
# Output
# ---------------------------
print("Preprocessing complete! Data ready for LSTM/GRU training")
print("   - X_train, X_val, X_test: Padded int sequences")
print("   - y_train, y_val, y_test: Encoded author labels")
print("char_to_int & int_to_char dictionaries available for encoding/decoding")

✅ Data loaded
Train: 21000, Val: 4500, Test: 4500
✅ Built vocab of 92 unique characters
Longest observed chunk length: 500
Using MAX_LEN = 450
✅ Shapes -> X_train: (21000, 450), X_val: (4500, 450), X_test: (4500, 450)
✅ Encoded 5 authors
🎯 Preprocessing complete! Data ready for LSTM/GRU training
   - X_train, X_val, X_test: Padded int sequences
   - y_train, y_val, y_test: Encoded author labels
💡 char_to_int & int_to_char dictionaries available for encoding/decoding


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import joblib

# ---------------------------
# Parameters (from preprocessing results)
# ---------------------------
NUM_CLASSES = len(set(y_train))       # should be 5 now
VOCAB_SIZE  = len(char_to_int) + 1    # +1 for padding index
MAX_LEN     = 450                     # optimized length
EMBED_DIM   = 32                      # small, CPU-friendly

# ---------------------------
# Build Model (optimized for CPU)
# ---------------------------
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM,
              input_length=MAX_LEN, mask_zero=True),
    Bidirectional(GRU(32, return_sequences=False, dropout=0.3, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=["accuracy"]
)

# ---------------------------
# Callbacks
# ---------------------------
early_stop = EarlyStopping(
    monitor='val_accuracy', patience=2,
    restore_best_weights=True, verbose=1
)
checkpoint = ModelCheckpoint(
    "checkpoint_best.keras", monitor='val_accuracy',
    save_best_only=True, verbose=1
)

# ---------------------------
# Training
# ---------------------------
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=7,
    batch_size=128,
    callbacks=[early_stop, checkpoint],
    verbose=1
)

# ---------------------------
# Evaluate
# ---------------------------
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print(f"🔥 Test Accuracy: {test_acc:.4f}")

# ---------------------------
# Save final trained model + label encoder
# ---------------------------
model.save("final_author_model.keras")       # saves full model
joblib.dump(label_encoder, "label_encoder.pkl")
print("✅ Model and label encoder saved!")

Epoch 1/7
[1m  4/165[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:59[0m 2s/step - accuracy: 0.1914 - loss: 1.6097

KeyboardInterrupt: 