# Import data

In [None]:
import os

import kagglehub
import pandas as pd

# Fetch the Goodreads "best books" dataset; the returned path is the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("meetnaren/goodreads-best-books")
print("Path to dataset files:", path)

# Load book_data.csv from that directory into a DataFrame.
csv_file = os.path.join(path, "book_data.csv")
df = pd.read_csv(csv_file)

# Quick look at the schema and the first few rows.
print(df.columns)
print(df.head())

Path to dataset files: /kaggle/input/goodreads-best-books
Index(['book_authors', 'book_desc', 'book_edition', 'book_format', 'book_isbn',
       'book_pages', 'book_rating', 'book_rating_count', 'book_review_count',
       'book_title', 'genres', 'image_url'],
      dtype='object')
                                        book_authors  \
0                                    Suzanne Collins   
1                         J.K. Rowling|Mary GrandPré   
2                                         Harper Lee   
3  Jane Austen|Anna Quindlen|Mrs. Oliphant|George...   
4                                    Stephenie Meyer   

                                           book_desc  \
0  Winning will make you famous. Losing means cer...   
1  There is a door at the end of a silent corrido...   
2  The unforgettable novel of a childhood in a sl...   
3  «È cosa ormai risaputa che a uno scapolo in po...   
4  About three things I was absolutely positive.F...   

                         book_edition book_

# Preprocessing

In [None]:
# Peek at ten randomly chosen raw genre strings (pipe-separated tags).
print(df['genres'].sample(n=10))

31315    Paranormal|Fairies|Fantasy|Magic|Fantasy|Child...
234      Fantasy|Fiction|Romance|Historical|Historical ...
49509    Classics|Poetry|Fantasy|Mythology|Religion|Fic...
9661          Fiction|Contemporary|Literary Fiction|Novels
38379                                              Fiction
4992                                          Spirituality
29019      Fantasy|Paranormal|Holiday|Contemporary|Romance
53029    Historical|Historical Fiction|Fiction|Historic...
31357                              Religion|Islam|Religion
42960    Fantasy|Paranormal|Paranormal|Vampires|Romance...
Name: genres, dtype: object


In [None]:
# Optional: uncomment the next line to work on a reproducible 5,000-row random
# subsample instead of the full dataset.
#df= df.sample(5000, random_state=42)

In [None]:
# Use only the leading tag of the pipe-separated 'genres' string as the label.
# Splitting once is enough because everything after the first '|' is discarded.
df['genre'] = df['genres'].str.split('|', n=1).str.get(0)

In [None]:
# Spot-check ten random description / label pairs.
print(df.loc[:, ['book_desc', 'genre']].sample(n=10))

                                               book_desc       genre
9302   Once she was Adrienne Satti. An orphan of Davi...     Fantasy
38566  Denver is rich - very, very rich. Everyone in ...   Childrens
44024  Transcription of the handwritten pages:http://...  Historical
24279  Comedian Gabbie Hanna brings levity to the twi...      Poetry
27322  It's almost a year since Gaby Winters watched ...  Paranormal
46776  A commentary on contemporary urban mores and m...       Plays
51962  A young man from a small provincial town moves...   Biography
6495   A Song of Love won her heart.A Song of Darknes...     Fantasy
12782  We think we're relating to other people. Actua...  Psychology
49937  Award-winning Canadian author Kathleen Winter’...     Fiction


In [None]:
# Keep only the rows that actually carry a genre label.
df = df[df['genre'].notna()]

# Spot-check ten random pairs after filtering; descriptions can still be
# missing at this point because only 'genre' was filtered on.
print(df.loc[:, ['book_desc', 'genre']].sample(n=10))

                                               book_desc              genre
28767  Sixteen-year-old Tess's life has been shaped b...        Young Adult
9010   Trollope's 1875 tale of a great financier's fr...           Classics
40994  The magnificent Pulitzer Prize-winning novel o...            Fiction
51942  If someone gave you a chair and said it was ma...  Christian Fiction
41652                                                NaN          Sociology
52897                       Book 32 of the Old Testament         Nonfiction
7037   For almost two centuries, the stories of magic...           Classics
8240   The execution-style murder of a Swedish housew...            Mystery
24263  The Imperial Survey Service has four levels of...    Science Fiction
52906  Deborah and Simon St. James have taken a holid...            Mystery


In [None]:
# Label frequency table, most common genre first.
print(df['genre'].value_counts(sort=True, ascending=False))

genre
Fantasy           7549
Fiction           6828
Romance           4412
Young Adult       3711
Nonfiction        2593
                  ... 
Folk Tales           1
Church               1
Social Justice       1
How To               1
Pulp                 1
Name: count, Length: 203, dtype: int64


In [None]:
# The description is the model input, so rows without one are discarded.
df = df[df['book_desc'].notna()]

In [None]:
# Restrict the classification problem to the 15 most frequent labels.
top_genres = df['genre'].value_counts().head(15).index
df = df.loc[df['genre'].isin(top_genres)]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the descriptions, capped at 5,000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary / IDF weights and vectorise every description, then
# convert the sparse result to a dense array.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test documents influence the vocabulary and IDF weights — consider
# fitting on the training split only.
tfidf_sparse = vectorizer.fit_transform(df['book_desc'])
X = tfidf_sparse.toarray()

print(X.shape)

(39211, 5000)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each genre name to an integer class id; encoder.classes_ lists the
# names in id order.
encoder = LabelEncoder()
encoder.fit(df['genre'])
y = encoder.transform(df['genre'])

print(y[:10])
print(encoder.classes_)

[14  2  1  1 14  4  2  1  1  2]
['Childrens' 'Classics' 'Fantasy' 'Fiction' 'Historical' 'History'
 'Horror' 'Mystery' 'Nonfiction' 'Paranormal' 'Poetry' 'Romance'
 'Science Fiction' 'Sequential Art' 'Young Adult']


In [None]:
# Recap of the feature matrix shape, the first ten encoded labels and the
# id -> genre mapping, one item per line.
print(X.shape, y[:10], encoder.classes_, sep='\n')

(39211, 5000)
[14  2  1  1 14  4  2  1  1  2]
['Childrens' 'Classics' 'Fantasy' 'Fiction' 'Historical' 'History'
 'Horror' 'Mystery' 'Nonfiction' 'Paranormal' 'Poetry' 'Romance'
 'Science Fiction' 'Sequential Art' 'Young Adult']


# Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(X_train.shape, X_test.shape)

(31368, 5000) (7843, 5000)


In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix

# PCA Dim Reduction
#
# Passing a float in (0, 1) as n_components asks PCA to keep as many components
# as are needed to explain that fraction of the variance, so the component
# count is an outcome of the fit rather than an input.
print("Fitting PCA...")
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep, random_state=42)

# Fit on the training split only, then apply the same projection to the test split.
X_train_reduced = pca.fit_transform(X_train)
X_test_reduced = pca.transform(X_test)

# Previously hard-coded to 2500 and never passed to PCA, which made later
# reporting disagree with the real dimensionality. Bind it to the number of
# components the fit actually retained.
n_components = int(pca.n_components_)
print("Reduced shapes:", X_train_reduced.shape, X_test_reduced.shape)

Fitting PCA...
Reduced shapes: (31368, 3853) (7843, 3853)


In [None]:
# Total variance captured (as a fraction of 1)
total_variance_captured = np.sum(pca.explained_variance_ratio_)

# Report the number of components the fitted PCA actually kept. The PCA was
# built with a variance target (0.95), so the separate `n_components` variable
# (2500) does not describe the fitted model.
print(f"Total variance captured by {pca.n_components_} components: {total_variance_captured:.4f}")

# Or view individual variance ratios
print("First 10 explained variance ratios:", pca.explained_variance_ratio_[:10])

Total variance captured by 2500 components: 0.9501
First 10 explained variance ratios: [0.0300226  0.01494424 0.01108106 0.00789455 0.00596185 0.00558847
 0.00522296 0.0045341  0.00420509 0.00392758]


In [None]:
# Prepare NumPy Arrays
# The PCA outputs are used directly as network inputs; the labels are copied
# into fresh arrays.
X_train_np, X_test_np = X_train_reduced, X_test_reduced
y_train_np, y_test_np = np.array(y_train), np.array(y_test)

# Network sizing is derived from the training data.
num_samples = X_train_np.shape[0]
input_dim = X_train_np.shape[1]
num_classes = np.unique(y_train_np).size
print(f"Input dim: {input_dim}, Num classes: {num_classes}")

Input dim: 3853, Num classes: 15


In [None]:

# Network and optimisation hyperparameters.
hidden_dim1 = 512
hidden_dim2 = 256
learning_rate = 0.0001
batch_size = 64
epochs = 100
l2_lambda = 0.001

# Seed NumPy's global RNG so that weight initialisation (and any later use of
# np.random, such as per-epoch shuffling) is repeatable across runs. 42 matches
# the random_state used for the train/test split and PCA.
random_seed = 42
np.random.seed(random_seed)

# Xavier Initialization

def xavier_init(in_dim, out_dim):
    """Return an (in_dim, out_dim) weight matrix of scaled Gaussian noise.

    Entries are standard-normal draws from NumPy's global RNG multiplied by
    sqrt(1 / in_dim), i.e. the scale depends on the fan-in only (out_dim does
    not enter the scale).

    Parameters
    ----------
    in_dim : int
        Number of input units (rows of the returned matrix).
    out_dim : int
        Number of output units (columns of the returned matrix).

    Returns
    -------
    numpy.ndarray
        Array of shape (in_dim, out_dim).
    """
    return np.random.randn(in_dim, out_dim) * np.sqrt(1. / in_dim)

# Layer sizes as (fan_in, fan_out): input -> hidden 1 -> hidden 2 -> classes.
layer_dims = [
    (input_dim, hidden_dim1),
    (hidden_dim1, hidden_dim2),
    (hidden_dim2, num_classes),
]

# Each layer gets a randomly initialised weight matrix and a zero row-vector
# bias; the weights are drawn in layer order (W1, W2, W3).
(W1, b1), (W2, b2), (W3, b3) = [
    (xavier_init(fan_in, fan_out), np.zeros((1, fan_out)))
    for fan_in, fan_out in layer_dims
]

In [None]:
# Activation Functions
def relu(x):
    """Rectified linear unit: negative entries become 0, the rest pass through."""
    return np.clip(x, 0, None)

def relu_derivative(x):
    """Element-wise ReLU gradient: 1.0 where x is strictly positive, else 0.0."""
    positive_mask = np.greater(x, 0)
    return positive_mask.astype(np.float64)

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Each row's maximum is subtracted before exponentiating, which keeps the
    exponentials from overflowing without changing the result.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    normalizer = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / normalizer

def cross_entropy(preds, labels):
    """Mean negative log-probability assigned to the true class.

    preds is indexed as preds[row, labels[row]], so it holds one row of class
    probabilities per sample and labels holds one integer class index per row.
    Probabilities are clipped to [1e-12, 1 - 1e-12] so the log stays finite.
    """
    row_index = np.arange(preds.shape[0])
    safe_probs = np.clip(preds, 1e-12, 1. - 1e-12)
    true_class_probs = safe_probs[row_index, labels]
    return np.mean(-np.log(true_class_probs))

# Adam Optimizer Variables
# Decay rates for the first / second moment estimates, and the small constant
# added to the denominator of the update.
beta1, beta2, epsilon = 0.9, 0.999, 1e-8

# One first-moment (m*) and one second-moment (v*) accumulator per parameter,
# zero-initialised with the parameter's shape.
mW1, mb1, mW2, mb2, mW3, mb3 = (
    np.zeros_like(p) for p in (W1, b1, W2, b2, W3, b3)
)
vW1, vb1, vW2, vb2, vW3, vb3 = (
    np.zeros_like(p) for p in (W1, b1, W2, b2, W3, b3)
)

# Global step counter, incremented once per mini-batch and used for bias correction.
t = 0

In [None]:

# Training Loop
# Mini-batch training of the three-layer network (ReLU -> ReLU -> softmax)
# with cross-entropy loss, an L2 penalty on the weight matrices, and
# hand-written Adam updates. Appends one average loss per epoch to loss_history.
loss_history = []
print("\nStarting training...\n")
for epoch in range(epochs):
    # Visit the training samples in a fresh random order every epoch.
    perm = np.random.permutation(num_samples)
    X_shuffled = X_train_np[perm]
    y_shuffled = y_train_np[perm]
    total_loss = 0

    for i in range(0, num_samples, batch_size):
        X_batch = X_shuffled[i:i+batch_size]
        y_batch = y_shuffled[i:i+batch_size]
        # Actual batch size: the final slice can hold fewer than batch_size rows.
        bs = X_batch.shape[0]

        # Forward pass
        z1 = np.dot(X_batch, W1) + b1
        a1 = relu(z1)

        z2 = np.dot(a1, W2) + b2
        a2 = relu(z2)

        z3 = np.dot(a2, W3) + b3
        preds = softmax(z3)

        # Loss with L2
        # The penalty covers the weight matrices only (biases are excluded) and
        # is added to every batch loss, so the reported average includes it.
        loss = cross_entropy(preds, y_batch)
        loss += (l2_lambda/2) * (np.sum(W1**2) + np.sum(W2**2) + np.sum(W3**2))
        # Weight by the batch size so that total_loss / num_samples is a per-sample mean.
        total_loss += loss * bs

        # Backward pass
        # Gradient of mean cross-entropy w.r.t. the logits: (softmax - one_hot) / bs.
        # NOTE: dZ3 is an alias of preds, not a copy. The in-place operations
        # below overwrite preds, so it no longer holds probabilities afterwards
        # (the loss above has already been computed from it).
        dZ3 = preds
        dZ3[range(bs), y_batch] -= 1
        dZ3 /= bs

        # Each weight gradient adds l2_lambda * W, the derivative of the
        # (l2_lambda/2) * sum(W**2) penalty term.
        dW3 = np.dot(a2.T, dZ3) + l2_lambda * W3
        db3 = np.sum(dZ3, axis=0, keepdims=True)

        dA2 = np.dot(dZ3, W3.T)
        dZ2 = dA2 * relu_derivative(z2)

        dW2 = np.dot(a1.T, dZ2) + l2_lambda * W2
        db2 = np.sum(dZ2, axis=0, keepdims=True)

        dA1 = np.dot(dZ2, W2.T)
        dZ1 = dA1 * relu_derivative(z1)

        dW1 = np.dot(X_batch.T, dZ1) + l2_lambda * W1
        db1 = np.sum(dZ1, axis=0, keepdims=True)

        # Adam updates
        # t counts mini-batch steps across all epochs and drives the bias correction.
        t += 1
        for param, grad, m, v in [
            (W1, dW1, mW1, vW1),
            (b1, db1, mb1, vb1),
            (W2, dW2, mW2, vW2),
            (b2, db2, mb2, vb2),
            (W3, dW3, mW3, vW3),
            (b3, db3, mb3, vb3),
        ]:
            # `m[:] = ...`, `v[:] = ...` and `param -= ...` write into the existing
            # arrays, so the module-level weights and moment buffers are updated.
            # Plain assignment would only rebind the loop variables.
            m[:] = beta1 * m + (1 - beta1) * grad
            v[:] = beta2 * v + (1 - beta2) * (grad ** 2)
            # Bias-corrected moment estimates.
            m_hat = m / (1 - beta1 ** t)
            v_hat = v / (1 - beta2 ** t)
            param -= learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

    # Per-sample average of the regularised loss over the whole epoch.
    avg_loss = total_loss / num_samples
    loss_history.append(avg_loss)
    print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")


Starting training...

Epoch 1/100, Avg Loss: 2.6386
Epoch 2/100, Avg Loss: 2.0838
Epoch 3/100, Avg Loss: 1.8139
Epoch 4/100, Avg Loss: 1.6554
Epoch 5/100, Avg Loss: 1.5447
Epoch 6/100, Avg Loss: 1.4707
Epoch 7/100, Avg Loss: 1.4166
Epoch 8/100, Avg Loss: 1.3759
Epoch 9/100, Avg Loss: 1.3441
Epoch 10/100, Avg Loss: 1.3186
Epoch 11/100, Avg Loss: 1.2968
Epoch 12/100, Avg Loss: 1.2788
Epoch 13/100, Avg Loss: 1.2635
Epoch 14/100, Avg Loss: 1.2503
Epoch 15/100, Avg Loss: 1.2382
Epoch 16/100, Avg Loss: 1.2273
Epoch 17/100, Avg Loss: 1.2175
Epoch 18/100, Avg Loss: 1.2088
Epoch 19/100, Avg Loss: 1.2006
Epoch 20/100, Avg Loss: 1.1929
Epoch 21/100, Avg Loss: 1.1862
Epoch 22/100, Avg Loss: 1.1794
Epoch 23/100, Avg Loss: 1.1739
Epoch 24/100, Avg Loss: 1.1681
Epoch 25/100, Avg Loss: 1.1632
Epoch 26/100, Avg Loss: 1.1585
Epoch 27/100, Avg Loss: 1.1535
Epoch 28/100, Avg Loss: 1.1490
Epoch 29/100, Avg Loss: 1.1443
Epoch 30/100, Avg Loss: 1.1409
Epoch 31/100, Avg Loss: 1.1372
Epoch 32/100, Avg Loss: 1

In [None]:
import matplotlib.pyplot as plt

# Derive the x-axis from the losses actually recorded rather than from the
# `epochs` setting. This cell previously failed with a NameError when `epochs`
# was not defined, and x and y would differ in length if training stopped
# before all configured epochs had run.
epochs_completed = len(loss_history)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(range(1, epochs_completed + 1), loss_history, marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('Average Loss')
ax.set_title('Training Loss Curve')
ax.grid(True)
plt.show()

NameError: name 'epochs' is not defined

<Figure size 800x500 with 0 Axes>

In [None]:
# Evaluate
# Run the trained network forward over the held-out split and report
# accuracy, per-class metrics and the confusion matrix.

print("\nEvaluating on test set...")
z1_test = X_test_np @ W1 + b1
a1_test = relu(z1_test)

z2_test = a1_test @ W2 + b2
a2_test = relu(z2_test)

z3_test = a2_test @ W3 + b3
preds_test = softmax(z3_test)

# Predicted class = index of the largest probability in each row.
y_pred = preds_test.argmax(axis=1)

accuracy = (y_pred == y_test_np).mean()
print(f"\nTest Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test_np, y_pred, target_names=encoder.classes_))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test_np, y_pred))


Evaluating on test set...

Test Accuracy: 60.47%

Classification Report:
                 precision    recall  f1-score   support

      Childrens       0.55      0.45      0.49       217
       Classics       0.51      0.46      0.49       403
        Fantasy       0.67      0.72      0.70      1498
        Fiction       0.55      0.62      0.58      1348
     Historical       0.62      0.50      0.55       477
        History       0.67      0.59      0.62       186
         Horror       0.59      0.41      0.48       210
        Mystery       0.61      0.57      0.59       416
     Nonfiction       0.64      0.68      0.66       511
     Paranormal       0.57      0.38      0.46       120
         Poetry       0.80      0.55      0.65       166
        Romance       0.66      0.69      0.68       875
Science Fiction       0.59      0.51      0.55       379
 Sequential Art       0.56      0.50      0.53       298
    Young Adult       0.53      0.60      0.56       739

       accur