In [3]:
# -------------------------
# 1. Setup
# -------------------------
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
 
# Read train.csv from the author_chunks_dataset folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer="author_chunks_dataset/train.csv")
df.head(n=5)

Unnamed: 0,author,book_id,genre,chunk,split
0,Abraham Lincoln,Lincoln Letters,"History/Politics, Non-Fiction",LINCOLN LETTERS By Abraham Lincoln Published b...,train
1,Abraham Lincoln,Lincoln Letters,"History/Politics, Non-Fiction",", you have said to me, ""We can get along very ...",train
2,Abraham Lincoln,Lincoln Letters,"History/Politics, Non-Fiction",ld almost give your place in Heaven for 70 or ...,train
3,Abraham Lincoln,Lincoln's First Inaugural Address,"History/Politics, Speech/Non-Fiction","Lincoln's First Inaugural Address March 4, 186...",train
4,Abraham Lincoln,Lincoln's First Inaugural Address,"History/Politics, Speech/Non-Fiction","entiments; and, in doing so, I only press upon...",train


In [None]:

# -------------------------
# 1. Style configs
# -------------------------
# Seaborn theme first, then the default figure size for every later plot.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# -------------------------
# 2. Load Original Dataset
# -------------------------
# NOTE: this rebinds `df`; from here on it refers to the frame read from
# author_identification_dataset_final.csv, not to any earlier `df`.
df = pd.read_csv("author_identification_dataset_final.csv")

# Quick structural summary: (rows, columns) and the column names.
dataset_shape = df.shape
column_names = df.columns.to_list()
print("Shape of dataset:", dataset_shape)
print("Columns:", column_names)

# -------------------------
# 3. Author Counts
# -------------------------
# `value_counts` tallies rows per author; the labels below call these "works".
author_counts = df['author'].value_counts()
n_authors = df['author'].nunique()
print("Unique authors:", n_authors)
print("\nTop 10 authors by number of works:\n", author_counts.head(10))

# Horizontal bar chart of the 20 most frequent authors.
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` -- confirm against the installed version.
top_authors = author_counts.head(20)
authors_fig, authors_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=top_authors.values,
    y=top_authors.index,
    palette="mako",
    ax=authors_ax,
)
authors_ax.set_title("Top 20 Authors by Number of Works")
authors_ax.set_xlabel("Works Count")
authors_ax.set_ylabel("Author")
plt.show()

# -------------------------
# 4. Genre Exploration
# -------------------------
# Number of distinct genre labels, then the row count for each label.
n_genres = df['genre'].nunique()
print("\nUnique genres:", n_genres)

genre_counts = df['genre'].value_counts()
print("\nTop 10 genres:\n", genre_counts.head(10))

# One horizontal bar per genre label (all labels, not just the top ones).
# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` -- confirm against the installed version.
genre_fig, genre_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=genre_counts.values,
    y=genre_counts.index,
    palette="viridis",
    ax=genre_ax,
)
genre_ax.set_title("Distribution of Works by Genre")
genre_ax.set_xlabel("Number of Works")
genre_ax.set_ylabel("Genre")
plt.show()

# Relationship: how many distinct authors appear under each genre label,
# ordered from the most to the fewest authors.
authors_per_genre = (
    df.groupby('genre')['author']
    .nunique()
    .sort_values(ascending=False)
)
print("\nNumber of unique authors per genre:\n", authors_per_genre.head(10))

# NOTE(review): recent seaborn releases reportedly warn when `palette` is
# passed without `hue` -- confirm against the installed version.
apg_fig, apg_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=authors_per_genre.values,
    y=authors_per_genre.index,
    palette="coolwarm",
    ax=apg_ax,
)
apg_ax.set_title("Number of Unique Authors per Genre")
apg_ax.set_xlabel("Authors Count")
apg_ax.set_ylabel("Genre")
plt.show()

# -------------------------
# 5. WordCloud of Genres
# -------------------------
# Each distinct genre label is drawn as one "word", sized by its count in
# `genre_counts`. The counts are handed over as a plain {label: count} dict,
# which is the input type `generate_from_frequencies` documents.
genre_wordcloud = WordCloud(
    width=1000,
    height=600,
    background_color="white",
    colormap="tab10",
    prefer_horizontal=1.0,
    # Fixed seed: without it the layout and colours differ on every run,
    # so the figure could not be reproduced after a kernel restart.
    random_state=42,
).generate_from_frequencies(genre_counts.to_dict())

plt.figure(figsize=(12, 8))
plt.imshow(genre_wordcloud, interpolation="bilinear")
plt.axis("off")  # the image needs no ticks or frame
plt.title("Genre Frequency WordCloud", fontsize=20)
plt.show()

# -------------------------
# 6. Example Rows (Preview)
# -------------------------
# Print the first row of each of the first three genres (in order of first
# appearance) as a short, report-friendly preview.
# Assumes the loaded CSV has `title` and `text` columns -- TODO confirm.
print("\n=== Example works (for report) ===")

# `unique()` keeps NaN, and `df['genre'] == NaN` matches no rows, so a missing
# genre would make `.iloc[0]` fail on an empty frame; drop missing genres first.
preview_genres = df['genre'].dropna().unique()[:3]  # sample 3 genres

for genre in preview_genres:
    sample_row = df[df['genre'] == genre].iloc[0]

    # A missing text value is a float NaN and cannot be sliced.
    raw_text = sample_row['text']
    snippet = raw_text[:300] if isinstance(raw_text, str) else "<missing text>"

    print(
        f"\nAuthor: {sample_row['author']}"
        f"\nTitle: {sample_row['title']}"
        f"\nGenre: {sample_row['genre']}"
        f"\n---\nText snippet:\n{snippet}...\n{'=' * 80}"
    )