In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, countDistinct
import pandas as pd

# Exploratory Data Analysis (EDA) — Intro Section

The dataset under study has already been preprocessed into **text chunks** of 300–500 characters each, 
yielding a large corpus suitable for modeling. Because the dataset is large 
(~2.9 million records), we use **PySpark** for data extraction and summarization.

In this introductory stage of EDA, our goals are:
- Describe the **basic shape** of the dataset (total chunk records).
- Count the number of **unique authors** and **unique titles** in the corpus.
- Explore **genre distribution**, with a special focus on "Fantasy" records.
- Provide a few **interesting facts** about the corpus to set the stage for deeper visual analysis.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, countDistinct

# Start Spark session (getOrCreate reuses an already-running session if there is one)
spark = SparkSession.builder.appName("AuthorEDA").getOrCreate()

# Load the full chunked dataset (~2.9M rows).
# Spark DataFrames are lazy: without caching, every action run on `df`
# (count, distinct, groupBy, filter, ...) re-reads and re-parses the whole CSV.
# cache() keeps the parsed rows after the first action so later actions reuse them.
df = spark.read.csv(
    "dataset_splits/full_chunked.csv",
    header=True,
    inferSchema=True,
).cache()

# --- Basic Stats ---
# Row count, plus the number of distinct values in the author and title columns.
# Each of the three lines below triggers its own Spark action.
total_records = df.count()
unique_authors = df.select(col("author")).distinct().count()
unique_titles = df.select(col("title")).distinct().count()

# One print call, newline-separated: same four output lines as before.
print(
    "📊 Dataset Overview",
    f"Total chunked records: {total_records:,}",
    f"Unique authors: {unique_authors}",
    f"Unique titles: {unique_titles}",
    sep="\n",
)

# --- Genres ---
# Number of chunk rows per genre label, sorted from most to least frequent.
genre_counts = (
    df.groupBy(col("genre"))
    .count()
    .orderBy("count", ascending=False)
)
print("\nGenre distribution (top 10):")
# truncate=False so long genre labels are printed in full.
genre_counts.show(n=10, truncate=False)

# --- Fantasy Records ---
# Keep rows whose genre label contains the substring "Fantasy" (case-sensitive),
# then count those rows and the distinct authors among them.
fantasy_df = df.where(col("genre").contains("Fantasy"))
fantasy_records = fantasy_df.count()
fantasy_authors = fantasy_df.select(col("author")).distinct().count()

print(
    f"\nNumber of Fantasy-related chunks: {fantasy_records:,}",
    f"Number of unique Fantasy authors: {fantasy_authors}",
    sep="\n",
)

📊 Dataset Overview
Total chunked records: 2,930,008
Unique authors: 142
Unique titles: 3022

Genre distribution (top 10):
+---------------------------------+------+
|genre                            |count |
+---------------------------------+------+
|Literature/Novel, Fiction        |476896|
|Historical/Adventure, Fiction    |211553|
|Adventure/Novel, Fiction         |99022 |
|Historical/Novel, Fiction        |93572 |
|Unknown                          |73171 |
|Adventure/Sea, Fiction           |71090 |
|Literature/Collection, Fiction   |61501 |
|Poetry, Fiction                  |59534 |
|Literature/Short Stories, Fiction|58519 |
|Young Adult/Adventure, Fiction   |52381 |
+---------------------------------+------+
only showing top 10 rows

Number of Fantasy-related chunks: 76,920
Number of unique Fantasy authors: 25
